In [1]:
import os
from pathlib import Path

import pandas as pd


In [2]:
# ---------------------------------------------------------------------
# 1. Paths and constants
# ---------------------------------------------------------------------
# The data directory defaults to the original location but can be redirected
# by setting INTERN_RESEARCH_DATA_DIR before the kernel starts, so the
# notebook is not tied to one user's home directory.
DATA_DIR = Path(
    os.environ.get(
        "INTERN_RESEARCH_DATA_DIR",
        "/home/mpradhan/Intern_Research_Project/data",
    )
)

# Input CSVs: the master table (loaded as X) and its labels (loaded as y).
master_file = DATA_DIR / "X_master_dense_prob.csv"
label_file = DATA_DIR / "y_master.csv"

In [3]:
# Destinations for the sampled dataset: the feature table, the true labels,
# and the PU labels (with some positives flipped to 0).
X_out = DATA_DIR.joinpath("X_ran_neg_hi_pos.csv")
y_true_out = DATA_DIR.joinpath("y_ran_neg_hi_pos_true.csv")
y_pu_out = DATA_DIR.joinpath("y_ran_neg_hi_pos_flipped.csv")

In [None]:
# How many rows to select for the sampled dataset.
N_HIGH_POS = 350_000  # positives kept, ranked by highest xgb_pos_prob
N_NEG = 350_000       # negatives drawn at random
N_FLIP = 40_000       # kept positives whose PU label is set to 0 (unlabeled)

# NOTE(review): the outputs stored in this notebook (250,000 positives,
# 250,000 negatives, 230,000 PU positives) do not match these values and this
# cell has no execution count -- restart the kernel and run all cells so the
# recorded outputs reflect the current settings.

# Fail fast on inconsistent settings, before the expensive load below.
assert N_HIGH_POS >= 0 and N_NEG >= 0, "Sample sizes must be non-negative"
assert 0 <= N_FLIP <= N_HIGH_POS, "N_FLIP must be between 0 and N_HIGH_POS"

In [5]:

# ---------------------------------------------------------------------
# 2. Load and drop NaNs
# ---------------------------------------------------------------------
# Read the master table, then the label file; squeezing along the column
# axis collapses a single-column label frame into a Series.
X = pd.read_csv(filepath_or_buffer=master_file)
y = pd.read_csv(filepath_or_buffer=label_file).squeeze(axis="columns")

In [6]:
# Keep only the rows of X that are complete (no NaN in any column) and apply
# the identical row selection to y so the two stay aligned.
non_nan_mask = ~X.isna().any(axis=1)
X = X.loc[non_nan_mask].reset_index(drop=True)
y = y.loc[non_nan_mask].reset_index(drop=True)

assert len(X) == len(y), "Lengths differ after dropping NaNs"

# Working frame: every column of X plus the label stored as "true_label".
df = X.assign(true_label=y)

print(f"Data shape after dropping NaNs: {df.shape}")

Data shape after dropping NaNs: (11179128, 130)


In [7]:
# ---------------------------------------------------------------------
# 3. Get top positives and random negatives
# ---------------------------------------------------------------------
# Boolean-mask selection already produces new frames, so no additional
# .copy() of these (potentially very large) subsets is taken.
positives = df[df["true_label"] == 1]
negatives = df[df["true_label"] == 0]

# .head() silently returns fewer rows when not enough are available, which
# would skew the class balance without any error, so check explicitly. The
# matching check on the negatives replaces pandas' generic sampling error
# with a message that names the offending constant.
if len(positives) < N_HIGH_POS:
    raise ValueError(
        f"N_HIGH_POS={N_HIGH_POS:,} exceeds the {len(positives):,} positives available"
    )
if len(negatives) < N_NEG:
    raise ValueError(
        f"N_NEG={N_NEG:,} exceeds the {len(negatives):,} negatives available"
    )

# Sort positives by highest prob, pick top N. The trailing .copy() makes the
# selection an independent frame that later cells can add columns to.
top_positives = (
    positives.sort_values("xgb_pos_prob", ascending=False)
    .head(N_HIGH_POS)
    .copy()
)

# Randomly sample negatives (fixed seed so the draw is repeatable)
negatives_sample = negatives.sample(n=N_NEG, random_state=42)

print(f"Top positives: {len(top_positives):,}")
print(f"Random negatives: {len(negatives_sample):,}")

Top positives: 250,000
Random negatives: 250,000


In [8]:
# ---------------------------------------------------------------------
# 4. Flip some positives to unlabeled
# ---------------------------------------------------------------------
# Draw N_FLIP of the selected positives with a fixed seed; only their index
# labels are needed.
flipped_idx = top_positives.sample(n=N_FLIP, random_state=42).index

# PU label: 0 for the drawn rows, 1 for every other selected positive.
top_positives = top_positives.assign(
    pu_label=(~top_positives.index.isin(flipped_idx)).astype("int64")
)

# Every sampled negative gets PU label 0.
negatives_sample = negatives_sample.assign(pu_label=0)

In [9]:
# ---------------------------------------------------------------------
# 5. Combine and shuffle
# ---------------------------------------------------------------------
# Stack both groups, permute all rows with a fixed seed, then renumber the
# index from zero.
new_df = (
    pd.concat((top_positives, negatives_sample), axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Split the shuffled frame into the feature table and the two label vectors.
label_cols = ["true_label", "pu_label"]
X_new = new_df.drop(columns=label_cols)
y_true, y_pu = (new_df[col] for col in label_cols)

In [10]:
# ---------------------------------------------------------------------
# 6. Save
# ---------------------------------------------------------------------
# Pair each object with its destination and write it as CSV without the
# index column.
outputs = ((X_new, X_out), (y_true, y_true_out), (y_pu, y_pu_out))
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

# Report where the files went.
print("Saved:")
for _, destination in outputs:
    print(f"  {destination}")

# Summary: feature-table shape and the number of 1s in each label vector.
print(f"Shape: {X_new.shape}, True Positives: {y_true.sum()}, PU Positives: {y_pu.sum()}")

Saved:
  /home/mpradhan/Intern_Research_Project/data/X_ran_neg_hi_pos.csv
  /home/mpradhan/Intern_Research_Project/data/y_ran_neg_hi_pos_true.csv
  /home/mpradhan/Intern_Research_Project/data/y_ran_neg_hi_pos_flipped.csv
Shape: (500000, 129), True Positives: 250000, PU Positives: 230000
