In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# ---------------------------------------------------------------------
# 1. Paths and constants
# ---------------------------------------------------------------------
# The data directory defaults to the original location. Set the
# INTERN_PROJECT_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "INTERN_PROJECT_DATA_DIR",
        "/home/mpradhan/Intern_Research_Project/data",
    )
)

# Input files: feature matrix and its label column.
master_file = DATA_DIR / "X_master_dense_prob.csv"
label_file = DATA_DIR / "y_master.csv"

In [3]:

# Destinations for the generated dataset: the feature matrix, the true
# labels, and the PU labels in which some positives have been flipped to 0.
X_out = DATA_DIR.joinpath("X_ran_pos_low_unlab.csv")
y_true_out = DATA_DIR.joinpath("y_ran_pos_low_unlab_true.csv")
y_pu_out = DATA_DIR.joinpath("y_ran_pos_low_unlab_flipped.csv")

In [None]:
# How many rows to select for each group
N_POS = 350_000      # positives drawn at random
N_LOW_NEG = 350_000  # negatives with the lowest xgb_pos_prob
N_FLIP = 40_000      # sampled positives whose PU label is set to 0

# Fail fast on an impossible configuration, before the large CSVs are loaded.
assert N_POS > 0 and N_LOW_NEG > 0, "Selection sizes must be positive"
assert 0 <= N_FLIP <= N_POS, "N_FLIP cannot exceed N_POS"

# NOTE(review): this cell has no execution count, and the recorded outputs
# further down (250,000 rows per group, 230,000 PU positives) do not match
# these values. Re-run the notebook so the outputs reflect this configuration.


In [5]:
# ---------------------------------------------------------------------
# 2. Load and drop NaNs
# ---------------------------------------------------------------------
# Read the feature matrix as a DataFrame.
X = pd.read_csv(master_file)

# Read the label file and squeeze its column axis, so a one-column file
# yields a Series rather than a DataFrame.
y = pd.read_csv(label_file).squeeze(axis="columns")

In [6]:
# Drop rows with NaNs in X and keep the same rows in y.
#
# Check alignment BEFORE masking: the mask is built from X and applied to y,
# so a length mismatch would otherwise fail inside the boolean indexing with
# a much less obvious error, and the post-filter check could never catch it.
assert len(X) == len(y), "X and y have different lengths before dropping NaNs"

# True for rows of X in which every feature is present.
non_nan_mask = X.notna().all(axis=1)
X = X[non_nan_mask].reset_index(drop=True)
y = y[non_nan_mask].reset_index(drop=True)

assert len(X) == len(y), "Lengths differ after dropping NaNs"

# Working frame: all features plus the ground-truth label in one table.
df = X.copy()
df["true_label"] = y

print(f"Data shape after dropping NaNs: {df.shape}")

Data shape after dropping NaNs: (11179128, 130)


In [7]:
# ---------------------------------------------------------------------
# 3. Get random positives and lowest-probability negatives
# ---------------------------------------------------------------------
# Boolean-mask selection already returns new frames, so no extra .copy() of
# these multi-million-row subsets is taken; only the much smaller final
# selections are copied below.
positives = df[df["true_label"] == 1]
negatives = df[df["true_label"] == 0]

# Make sure both pools are large enough. Without this check, head() would
# silently return fewer negatives than requested and unbalance the dataset.
if len(positives) < N_POS:
    raise ValueError(
        f"Requested {N_POS:,} positives but only {len(positives):,} are available"
    )
if len(negatives) < N_LOW_NEG:
    raise ValueError(
        f"Requested {N_LOW_NEG:,} negatives but only {len(negatives):,} are available"
    )

# Random sample of positives (fixed seed for reproducibility).
# Copied so that columns added later live on an independent frame.
random_positives = positives.sample(n=N_POS, random_state=42).copy()

# Lowest probability negatives: the N_LOW_NEG rows with the smallest
# xgb_pos_prob. Copied for the same reason as above.
low_negatives = (
    negatives.sort_values("xgb_pos_prob", ascending=True).head(N_LOW_NEG).copy()
)

print(f"Random positives: {len(random_positives):,}")
print(f"Low negatives: {len(low_negatives):,}")

Random positives: 250,000
Low negatives: 250,000


In [8]:
# ---------------------------------------------------------------------
# 4. Flip some positives to unlabeled
# ---------------------------------------------------------------------
# Choose N_FLIP of the sampled positives (fixed seed) to hide from the
# PU label set.
flipped_idx = random_positives.sample(n=N_FLIP, random_state=42).index

# pu_label is 1 for positives that stay labelled and 0 for the flipped ones.
random_positives["pu_label"] = (
    ~random_positives.index.isin(flipped_idx)
).astype("int64")

# Every selected negative gets a PU label of 0.
low_negatives["pu_label"] = 0

In [9]:
# ---------------------------------------------------------------------
# 5. Combine and shuffle
# ---------------------------------------------------------------------
# Stack both groups, shuffle all rows with a fixed seed, and renumber
# the index from zero.
new_df = (
    pd.concat([random_positives, low_negatives])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Split the shuffled table into features and the two label vectors.
X_new = new_df.drop(["true_label", "pu_label"], axis="columns")
y_true, y_pu = new_df["true_label"], new_df["pu_label"]

In [10]:
# ---------------------------------------------------------------------
# 6. Save
# ---------------------------------------------------------------------
# Write features, true labels and PU labels as CSV without the index column.
X_new.to_csv(X_out, index=False)
y_true.to_csv(y_true_out, index=False)
y_pu.to_csv(y_pu_out, index=False)

# List the written files, one per line.
print("Saved:", f"  {X_out}", f"  {y_true_out}", f"  {y_pu_out}", sep="\n")

# Quick summary: feature shape plus the positive count under each labelling.
print(f"Shape: {X_new.shape}, True Positives: {y_true.sum()}, PU Positives: {y_pu.sum()}")

Saved:
  /home/mpradhan/Intern_Research_Project/data/X_ran_pos_low_unlab.csv
  /home/mpradhan/Intern_Research_Project/data/y_ran_pos_low_unlab_true.csv
  /home/mpradhan/Intern_Research_Project/data/y_ran_pos_low_unlab_flipped.csv
Shape: (500000, 129), True Positives: 250000, PU Positives: 230000
