In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Initial Data Processing

In [None]:
# Load the data-entry CSV into `df`; the following cells all derive from this frame.
input_file_path = './Data_Entry_2017_v2020.csv'
df = pd.read_csv(filepath_or_buffer=input_file_path)
# Bare last expression so the notebook renders the loaded frame.
df

In [None]:
# Turn the '|'-delimited 'Finding Labels' string into a list of labels.
# Only string values are split: `.str.split` yields NaN for values that are
# already lists, so running this cell a second time used to wipe out every
# label and break the cells below. Leaving non-strings untouched makes the
# cell safe to re-run (missing values stay missing, as before).
df['Finding Labels'] = df['Finding Labels'].map(
    lambda labels: labels.split('|') if isinstance(labels, str) else labels
)

In [None]:
# Collect every distinct label that occurs in any row, in sorted order.
label_lists = df['Finding Labels']
categories = sorted({label for labels in label_lists for label in labels})

# Add one 0/1 indicator column per label: 1 when the row's label list contains it.
for category in categories:
    df[category] = [int(category in labels) for labels in label_lists]

In [None]:
# Keep only the rows whose label list holds exactly one entry.
has_single_label = df['Finding Labels'].map(len).eq(1)
single_disease_df = df.loc[has_single_label]
# Optional export, left disabled:
# single_disease_df.to_csv('./single_disease.csv', index=False)

In [None]:
# The six target classes of the 6x200 dataset.
specific_diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Pneumothorax']

# Every row of single_disease_df carries exactly one label, so testing that
# label against the target list is enough to select the rows of interest.
sole_label = single_disease_df['Finding Labels'].map(lambda labels: labels[0])
specific_disease_df = single_disease_df.loc[sole_label.isin(specific_diseases)]
specific_disease_df

In [None]:
# Metadata columns to retain, followed by the six one-hot disease indicators.
metadata_columns = ['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position']
columns_to_keep = [*metadata_columns, *specific_diseases]
specific_disease_df = specific_disease_df.loc[:, columns_to_keep]
# Optional export, left disabled:
# specific_disease_df.to_csv('./specific_diseases.csv', index=False)
specific_disease_df

### Balanced 6x200 Dataset

In [None]:
# Work on a copy so that adding 'Age Group' does not touch specific_disease_df.
df = specific_disease_df.copy()

# Define age groups and genders.
# The labels promise inclusive ranges (0-60 and 61-100), so the bins have to be
# right-closed: [0, 60] and (60, 100]. With right=False the intervals were
# [0, 60) and [60, 100), which filed age 60 under '61-100' and left age 100
# outside every bin. include_lowest=True keeps age 0 inside the first bin.
# Ages outside 0-100 still come out as NaN and therefore match no group below.
age_bins = [0, 60, 100]
age_labels = ['0-60', '61-100']
df['Age Group'] = pd.cut(
    df['Patient Age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)
genders = ['M', 'F']

# Initialize an empty DataFrame for the balanced dataset
balanced_df = pd.DataFrame()

# Target 50 samples per combination (2 age groups * 2 genders * 50 = 200 per disease)
target_samples_per_group = 50

In [None]:
# Build the balanced dataset: for every (disease, age group, gender) combination
# draw `target_samples_per_group` rows, using at most one image per patient.
# The per-group selections are gathered in a list and concatenated once, so the
# cell starts from a clean slate on every run instead of appending to whatever
# balanced_df already holds, and avoids re-copying the frame on each iteration.
selected_groups = []
for disease in specific_diseases:
    for age_group in age_labels:
        for gender in genders:
            # Filter data for this combination
            group_df = df[(df['Age Group'] == age_group) &
                          (df['Patient Gender'] == gender) &
                          (df[disease] == 1)].copy()

            # Patient identifier = the part of 'Image Index' before the first '_'
            group_df['Patient_ID'] = group_df['Image Index'].str.split('_').str[0]
            # Keep the first image of each patient
            unique_patients_df = group_df.drop_duplicates(subset='Patient_ID')

            n_available = len(unique_patients_df)
            if n_available == 0:
                # Sampling from an empty frame fails anyway; say which group is empty.
                raise ValueError(
                    f"No samples available for disease={disease!r}, "
                    f"age group={age_group!r}, gender={gender!r}"
                )

            # Fall back to sampling with replacement only when the group is too small.
            # This repeats rows, and repeated rows may end up on both sides of any
            # later row-level split, so make the situation visible.
            needs_replacement = n_available < target_samples_per_group
            if needs_replacement:
                print(
                    f"Warning: only {n_available} unique patients for {disease} / "
                    f"{age_group} / {gender}; sampling with replacement "
                    f"(the result contains duplicated rows)"
                )

            selected_df = unique_patients_df.sample(
                n=target_samples_per_group,
                replace=needs_replacement,
                random_state=42,
            )
            selected_groups.append(selected_df)

balanced_df = pd.concat(selected_groups)

# Collapse the single-element label list into a plain string label.
balanced_df = balanced_df[balanced_df['Finding Labels'].apply(len) == 1].copy()
balanced_df['Finding Labels'] = balanced_df['Finding Labels'].str[0]

balanced_df

### Split into train/val/test (70%/10%/20% per group)

In [None]:
# Split every (age group, gender, disease) group into train / val / test.
# The val and test sizes are absolute row counts; with 50 rows per group they
# give 35 / 5 / 10, i.e. the 70% / 10% / 20% split.
val_samples_per_group = 5
test_samples_per_group = 10

# Collect the per-group pieces and concatenate once at the end rather than
# growing three DataFrames inside the loop.
train_parts, val_parts, test_parts = [], [], []

# 'Age Group' comes from pd.cut and is categorical; observed=True restricts the
# groups to combinations that actually occur and avoids the pandas FutureWarning
# about the changing default of `observed`.
groups = balanced_df.groupby(['Age Group', 'Patient Gender', 'Finding Labels'], observed=True)
for _, group in groups:
    # First carve off the rows destined for val + test ...
    train_tmp, val_test_tmp = train_test_split(
        group,
        test_size=val_samples_per_group + test_samples_per_group,
        random_state=42,
    )
    # ... then divide that remainder into val and test.
    val_tmp, test_tmp = train_test_split(
        val_test_tmp,
        test_size=test_samples_per_group,
        random_state=42,
    )
    train_parts.append(train_tmp)
    val_parts.append(val_tmp)
    test_parts.append(test_tmp)

train_df = pd.concat(train_parts)
val_df = pd.concat(val_parts)
test_df = pd.concat(test_parts)

In [None]:
# Restrict the full dataset and each split to the same output columns:
# the metadata fields followed by the six disease indicator columns.
columns_needed = ['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position'] + specific_diseases
balanced_df, train_df, val_df, test_df = [
    frame[columns_needed]
    for frame in (balanced_df, train_df, val_df, test_df)
]

In [None]:
# Write the full balanced dataset to CSV (row index omitted) and report its size.
balanced_df.to_csv(path_or_buf='./nih6x200.csv', index=False)
print(f"Balanced 6x200 dataset saved to './nih6x200.csv' with {len(balanced_df)} samples")
# Bare last expression so the notebook renders the saved frame.
balanced_df

In [None]:
# Write each split to its own CSV (row index omitted), in train / val / test order.
split_outputs = {
    './nih6x200_train.csv': train_df,
    './nih6x200_val.csv': val_df,
    './nih6x200_test.csv': test_df,
}
for split_path, split_frame in split_outputs.items():
    split_frame.to_csv(split_path, index=False)
print(f"Balanced dataset split: Train ({len(train_df)}), Val ({len(val_df)}), Test ({len(test_df)})")
# Bare last expression so the notebook renders the test split.
test_df