<a href="https://colab.research.google.com/github/pitthexai/ICHI2023_EBAIC/blob/main/Code/1_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Preprocessing

In [None]:
# Mount Google Drive at /content/drive; the archive and output paths used
# by later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import numpy as np
import pandas as pd

import os
from zipfile import ZipFile

from sklearn.model_selection import train_test_split

In [3]:
# Anatomy to preprocess. "knee" selects the knee sample files; any other
# value falls through to the hip sample files.
BONY_ANATOMY = "knee"

In [4]:
# Pick the sample CSV name, its containing folder, and the source archive
# name for the selected anatomy. Anything other than "knee" is treated as hip.
filename, directory, zipfile = (
    ("knee_seg_sample.csv", "KneeSample", "knee_sample.zip")
    if BONY_ANATOMY == "knee"
    else ("hip_seg_sample.csv", "HipSample", "hip_sample.zip")
)

# Archive holding the raw sample, stored on the mounted Drive.
zipfile_loc = f"/content/drive/MyDrive/GoogleColabProjects/EBAIC_2023/EBAIC2023_HipKnee_Datasets/{zipfile}"
# Path the sample CSV is read from when building the splits.
data_location = f"/content/data/{directory}/{filename}"
# Drive folder that receives the generated split CSVs.
data_save_location = "/content/drive/MyDrive/GoogleColabProjects/EBAIC_2023/EBAIC2023_HipKnee_Datasets/"

In [5]:
# Extract the sample archive only when the sample CSV is not already present.
# The guard tests data_location (the extracted CSV) rather than
# data_save_location: the latter is the Drive folder that contains the
# archive itself, so testing it would skip extraction whenever the archive
# is reachable and leave the CSV missing.
# NOTE(review): assumes the archive unpacks to /content/data/<directory>/ so
# that data_location exists afterwards - confirm against the archive layout.
if not os.path.exists(data_location):
    with ZipFile(zipfile_loc, 'r') as zipf:
        zipf.extractall("/content/")

In [6]:
def generate_train_test_split(csv_pth, filter_query=None, *, exclude_ids=(9025994,),
                              holdout_size=0.3, random_state=42):
    """Split the records in a CSV into train / validation / test frames by id.

    The split is made over the unique values of the ``id`` column, so every
    row sharing an id lands in exactly one of the three frames. With the
    defaults, 70% of the ids go to train and the remaining 30% are divided
    evenly between validation and test.

    Args:
        csv_pth: Path of the CSV file to read; it must contain an ``id`` column.
        filter_query: Optional ``DataFrame.query`` expression applied after
            the id exclusion to restrict the records (e.g. to one subgroup).
        exclude_ids: Ids whose rows are dropped before splitting. The default
            preserves the original hard-coded exclusion of id 9025994 (the
            reason for that exclusion is not documented in this file).
        holdout_size: Fraction of ids held out of train; the held-out ids are
            then split in half into validation and test.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A ``(train, valid, test)`` tuple of DataFrames, each with a fresh
        0-based index.
    """
    data_records = pd.read_csv(csv_pth)
    for excluded_id in exclude_ids:
        data_records = data_records[data_records.id != excluded_id]
    data_records = data_records.reset_index(drop=True)
    if filter_query:
        data_records = data_records.query(filter_query)

    # Split ids rather than rows so rows with the same id never straddle splits.
    train_ids, holdout_ids = train_test_split(
        data_records.id.unique(), test_size=holdout_size, random_state=random_state
    )
    valid_ids, test_ids = train_test_split(
        holdout_ids, test_size=0.5, random_state=random_state
    )

    train = data_records[data_records.id.isin(train_ids)].reset_index(drop=True)
    valid = data_records[data_records.id.isin(valid_ids)].reset_index(drop=True)
    test = data_records[data_records.id.isin(test_ids)].reset_index(drop=True)

    return train, valid, test

In [7]:
def balance_dataset(data, by_filter1, by_filter2):
    """Downsample two query-defined groups of ``data`` to a common size.

    Both filters are ``DataFrame.query`` expressions. Each group is randomly
    sampled (seed 42) down to the size of the smaller group, and the two
    samples are concatenated with the first group's rows ahead of the
    second's. A one-line size summary is printed.

    Args:
        data: DataFrame to draw the groups from.
        by_filter1: Query expression selecting the first group.
        by_filter2: Query expression selecting the second group.

    Returns:
        The concatenated balanced DataFrame; rows keep their original index
        labels.
    """
    groups = [data.query(expression) for expression in (by_filter1, by_filter2)]
    target_size = min(len(group) for group in groups)

    balanced_data = pd.concat(
        [group.sample(target_size, random_state=42) for group in groups]
    )

    print(f"Training dataset reduced from size of {len(data)} samples to a balanced dataset of size {len(balanced_data)} samples")

    return balanced_data

In [None]:
## Baseline Datasets
# Query expressions selecting each demographic subgroup.
WHITE_QUERY = "P02RACE == '1: White or Caucasian'"
BLACK_QUERY = "P02RACE == '2: Black or African American'"
MALE_QUERY = "P02SEX == '1: Male'"
FEMALE_QUERY = "P02SEX == '2: Female'"

# One train/valid/test split over all records, then one per subgroup.
# NOTE(review): each subgroup split is drawn independently from that
# subgroup's ids, so its train ids are not guaranteed to be disjoint from
# test_all - confirm this is intended before evaluating across datasets.
train_all, valid_all, test_all = generate_train_test_split(data_location)
train_white, valid_white, test_white = generate_train_test_split(
    data_location, filter_query=WHITE_QUERY
)
train_black, valid_black, test_black = generate_train_test_split(
    data_location, filter_query=BLACK_QUERY
)
train_male, valid_male, test_male = generate_train_test_split(
    data_location, filter_query=MALE_QUERY
)
train_female, valid_female, test_female = generate_train_test_split(
    data_location, filter_query=FEMALE_QUERY
)

# Balanced training sets are downsampled from the overall training split.
balanced_gender_train = balance_dataset(train_all, MALE_QUERY, FEMALE_QUERY)
balanced_race_train = balance_dataset(train_all, WHITE_QUERY, BLACK_QUERY)

In [9]:
# Save Datasets
# Every frame is written to data_save_location as "<anatomy>_<name>.csv"
# without the index column. Anything other than "knee" is saved under the
# "hip" prefix, matching how the input files are selected.
anatomy_prefix = "knee" if BONY_ANATOMY == "knee" else "hip"

# Insertion order is the write order.
datasets_to_save = {
    "train_all": train_all,
    "valid_all": valid_all,
    "test_all": test_all,
    "train_race-white": train_white,
    "valid_race-white": valid_white,
    "test_race-white": test_white,
    "train_race-black": train_black,
    "valid_race-black": valid_black,
    "test_race-black": test_black,
    "train_gender-male": train_male,
    "valid_gender-male": valid_male,
    "test_gender-male": test_male,
    "train_gender-female": train_female,
    "valid_gender-female": valid_female,
    "test_gender-female": test_female,
    "train_balanced_gender": balanced_gender_train,
    "train_balanced_race": balanced_race_train,
}

for dataset_name, dataset_frame in datasets_to_save.items():
    dataset_frame.to_csv(
        os.path.join(data_save_location, f"{anatomy_prefix}_{dataset_name}.csv"),
        index=False,
    )