In [1]:
print("Hello, World!")

Hello, World!


In [4]:
import pandas as pd

In [19]:
def build_labels_from_manifests(LUSC_MANIFEST_PATH: str, LUAD_MANIFEST_PATH: str, file_extension: str = ".svs"):

    
    # read manifests
    lusc = pd.read_csv(LUSC_MANIFEST_PATH, sep='\t')
    luad = pd.read_csv(LUAD_MANIFEST_PATH, sep='\t')

    print("Columns in LUSC: ", lusc.columns)

    lusc = lusc.assign(project="LUSC")
    luad = luad.assign(project="LUAD")

    lusc = lusc.assign(label=0)
    luad = luad.assign(label=1)

    lusc_luad = pd.concat([lusc, luad])

    lusc_luad = lusc_luad.assign(slide_id = lusc_luad["filename"].str.replace(file_extension, ""))
    lusc_luad = lusc_luad.assign(case_id = lusc_luad["filename"].apply(lambda x: '-'.join(x.split('-')[:3])))

    final_columns = ["slide_id", "project", "label", "case_id"]

    lusc_luad = lusc_luad[final_columns]
    
    return lusc_luad
    


In [16]:
LUSC_MANIFEST_PATH = "manifests/manifest_LUSC.txt"
LUAD_MANIFEST_PATH = "manifests/manifest_LUAD.txt"

final_df = build_labels_from_manifests(LUSC_MANIFEST_PATH, LUAD_MANIFEST_PATH)


Columns in LUSC:  Index(['id', 'filename', 'md5', 'size', 'state'], dtype='object')


In [17]:
final_df.to_csv("training_labels.csv", index=False)

In [21]:
final_df.columns

Index(['slide_id', 'project', 'label', 'filename', 'case_id'], dtype='object')

In [None]:
# Requires: pandas, sklearn
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold

# df must have columns: 'slide_id', 'patient_id', 'label' (int or str)
# Example: df = pd.read_csv("slides.csv")

RANDOM_SEED = 42

# 1) quick check
counts = df.groupby('case_id').size().value_counts().sort_index()
print("slides per patient distribution:\n", counts)

# 2) create 70/15/15 patient-wise split
gss = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=RANDOM_SEED)
train_idx, temp_idx = next(gss.split(df, groups=df['case_id']))
df_train = df.iloc[train_idx].copy()
df_temp  = df.iloc[temp_idx].copy()

# split temp into val/test (half-half -> 15% each)
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=RANDOM_SEED)
val_idx_rel, test_idx_rel = next(gss2.split(df_temp, groups=df_temp['case_id']))
df_val  = df_temp.iloc[val_idx_rel].copy()
df_test = df_temp.iloc[test_idx_rel].copy()

# sanity: ensure patient sets are disjoint
assert set(df_train['case_id']).isdisjoint(df_val['case_id'])
assert set(df_train['case_id']).isdisjoint(df_test['case_id'])
assert set(df_val['case_id']).isdisjoint(df_test['case_id'])

print("splits (patients):", df_train['case_id'].nunique(), df_val['case_id'].nunique(), df_test['case_id'].nunique())
print("label distribution train/val/test:")
print(df_train['label'].value_counts(), df_val['label'].value_counts(), df_test['label'].value_counts())

slides per patient distribution:
 1     925
2      12
3       3
4       5
5       3
6       2
7       3
8       1
9       1
10      1
Name: count, dtype: int64
splits (patients): 669 143 144
label distribution train/val/test:
label
1    371
0    357
Name: count, dtype: int64 label
0    89
1    83
Name: count, dtype: int64 label
1    87
0    66
Name: count, dtype: int64


In [30]:
# 3) patient-wise 5-fold CV (StratifiedGroupKFold)
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
# iterate over folds (gives indices into df or into arrays)
for fold, (tr_idx, val_idx) in enumerate(sgkf.split(df, df['label'], groups=df['case_id'])):
    print(f"fold {fold}: train patients {df.iloc[tr_idx]['case_id'].nunique()}, val patients {df.iloc[val_idx]['case_id'].nunique()}")
    # use df.iloc[tr_idx], df.iloc[val_idx] for training/validation in that fold

fold 0: train patients 764, val patients 192
fold 1: train patients 764, val patients 192
fold 2: train patients 764, val patients 192
fold 3: train patients 766, val patients 190
fold 4: train patients 766, val patients 190
