# Preprocessing

In [None]:
import pandas as pd
from os.path import join
from src.configs.config import input_cols, intermediate_col_dict, tcu_col, le_label, re_label, num_input_cols

from src.configs.config import fp_checkpoint_folder, fp_data_folder # Edit this
# Raw input CSV (read in the "Load Data" cell) and the destination for the
# cleaned table (written by the last cell); both live under fp_data_folder.
fp_data_file = join(fp_data_folder, "Time_Processed_eye_sample_time.csv")
fp_processed_data = join(fp_data_folder, "eye_data.csv")

def display_df(df):
    """Render the first rows of ``df`` with the notebook's rich display, then print its shape.

    Args:
        df: pandas DataFrame to preview.

    Returns:
        None. Output goes to the notebook cell (rich preview followed by one text line).
    """
    preview = df.head()
    display(preview)
    dimensions = df.shape
    print(f"Shape: {dimensions}")

## Load Data

In [2]:
# Read the raw CSV into `df`; the following cells rename and filter its columns.
df = pd.read_csv(fp_data_file)
display(df)

Unnamed: 0,Vision Test RE,VA RE,Variance RE,Pinhole VA RE,Pinhole Variance RE,Vision Test LE,VA LE,Variance LE,Pinhole VA LE,Pinhole Variance LE,...,GSC_RE_Time,GSC_LE_Time,ERM_RE_Time,ERM_LE_Time,DRC_SUM,DMC_SUM,AMDC_SUM,GSF_SUM,ERM_SUM,Total
0,1.0,0.2,2.0,,,1.0,0.1,-1.0,,,...,52.0,52.0,52.0,52.0,52.0,,52.0,52.0,52.0,52.0
1,1.0,0.0,,,,1.0,0.0,,,,...,52.0,52.0,52.0,52.0,52.0,26.0,52.0,52.0,52.0,26.0
2,0.0,0.0,-2.0,,,0.0,0.3,1.0,0.1,-2.0,...,52.0,52.0,52.0,52.0,52.0,26.0,52.0,52.0,52.0,26.0
3,0.0,0.0,,,,0.0,0.0,,,,...,52.0,52.0,52.0,52.0,52.0,26.0,52.0,52.0,52.0,26.0
4,1.0,0.1,-2.0,,,1.0,0.2,-2.0,,,...,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,0.0,0.1,1.0,,,0.0,0.0,-1.0,,,...,52.0,52.0,52.0,52.0,52.0,26.0,52.0,52.0,52.0,26.0
864,0.0,0.0,,,,0.0,0.0,-2.0,,,...,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0
865,1.0,0.2,1.0,,,1.0,0.1,2.0,,,...,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0
866,0.0,0.0,-1.0,,,1.0,0.3,,0.2,,...,52.0,52.0,52.0,52.0,13.0,26.0,52.0,52.0,52.0,13.0


## Standardize Column Names

In [3]:
def standardise_column_names(df):
    """Give eye-specific columns a trailing ``" RE"`` / ``" LE"`` suffix and add derived flags.

    The frame is modified in place and also returned. Calling the function again
    on an already standardised frame is safe and leaves the values unchanged.

    Renaming (per column):
      * ``"Total"`` becomes ``"Total_Time"``.
      * Names already ending in ``" RE"`` or ``" LE"`` are kept as they are.
      * ``"DRC_RE_NoDR"`` becomes ``"DRC_NoDR RE"`` (likewise for ``"_LE_"``).
      * ``"X LE_RE: y"`` (one column covering both eyes) becomes ``"X: y RE"``
        and is copied to ``"X: y LE"``.
      * ``"X RE_y"`` becomes ``"X_y RE"`` (likewise for ``" LE_"``).

    Derived int32 0/1 columns:
      * ``GSC_G0``, ``GSC_G1`` and ``GSC_NA`` per eye, from
        ``"Glaucoma Suspect Condition <eye>"`` (-1 or missing counts as NA).
      * ``DMC_NA``, ``AMDC_NA`` and ``DRC_NA`` per eye: 1 when none of that
        condition's columns for the eye (ignoring ``Time`` columns) is truthy.

    Args:
        df: pandas DataFrame holding the raw columns.

    Returns:
        The same DataFrame object, renamed and extended.

    Raises:
        KeyError: if a ``"Glaucoma Suspect Condition <eye>"`` column is absent.
    """
    both_eye_cols = []  # base names of columns that were recorded once for both eyes
    new_columns = []
    for col in df.columns.to_list():
        if col == "Total":
            col = "Total_Time"
        if col.endswith((" RE", " LE")):
            pass
        # Convert "DRC_RE_NoDR" to "DRC_NoDR RE"
        elif "_RE_" in col:
            col = col.replace("_RE_", "_") + " RE"
        elif "_LE_" in col:
            col = col.replace("_LE_", "_") + " LE"
        elif " LE_RE:" in col:
            col = col.replace(" LE_RE:", ":")
            both_eye_cols.append(col)
            col = col + " RE"
        elif " RE_" in col:
            col = col.replace(" RE_", "_") + " RE"
        elif " LE_" in col:
            col = col.replace(" LE_", "_") + " LE"
        new_columns.append(col)
    df.columns = new_columns

    # Shared columns (e.g. "HVF LE_RE: flat, no IRF/SRF.") get an LE twin.
    # The list is empty when no such column exists, including on a re-run.
    for base in both_eye_cols:
        df[base + " LE"] = df[base + " RE"]

    # One-hot encode the glaucoma suspect condition for each eye.
    for eye in ("RE", "LE"):
        gsc = df["Glaucoma Suspect Condition " + eye]
        df["GSC_G0 " + eye] = (gsc == 0).astype('int32')
        df["GSC_G1 " + eye] = (gsc == 1).astype('int32')
        df["GSC_NA " + eye] = ((gsc == -1) | gsc.isna()).astype('int32')

    # "No category set" flags. Columns are matched on the exact "<prefix>_" start
    # and " <eye>" end: a plain substring test would let "DMC" match "AMDC_*"
    # columns. The flag column itself is excluded so a re-run cannot feed the
    # previous result back into the computation.
    for prefix in ("DMC", "AMDC", "DRC"):
        for eye in ("LE", "RE"):
            na_col = prefix + "_NA " + eye
            members = [
                col for col in df.columns
                if col.startswith(prefix + "_")
                and col.endswith(" " + eye)
                and "Time" not in col
                and col != na_col
            ]
            df[na_col] = (~df[members].any(axis=1)).astype('int32')

    return df
# Rebind `df` to the standardised frame and list the resulting column names.
df = standardise_column_names(df)
print(df.columns.tolist())

['Vision Test RE', 'VA RE', 'Variance RE', 'Pinhole VA RE', 'Pinhole Variance RE', 'Vision Test LE', 'VA LE', 'Variance LE', 'Pinhole VA LE', 'Pinhole Variance LE', 'IOP RE', 'IOP LE', 'Gradable RE', 'Gradable LE', 'Diabetic Maculopathy Condition RE', 'Diabetic Maculopathy Condition LE', 'Cup Disc Ratio RE', 'Cup Disc Ratio LE', 'Glaucoma Suspect Condition RE', 'Glaucoma Suspect Condition LE', 'Epiretinal Membrane (ERM) RE Progressing', 'Epiretinal Membrane (ERM) LE Progressing', 'Vein Occlusion RE', 'Vein Occlusion LE', 'CRVO/BRVO Diagnosis RE', 'CRVO/BRVO Diagnosis \nLE', 'Plaquenil Toxicity RE', 'Plaquenil Toxicity LE', 'Escalation to optoms from EDS / PSA.\n_IOP ((Airpuff) ? 22 mmHg)', 'Escalation to optoms from EDS / PSA.\n_NEW flashes and/or floaters (within 1/12)', 'Escalation to optoms from EDS / PSA.\n_Significant Symptoms (Free Text)', 'Escalation to optoms from EDS / PSA.\n_Ungradable images', 'Escalation to optoms from EDS / PSA.\n_Worsening of VA by 2 or more lines', 'Esca

## Check that all required columns are present

In [4]:
def check_cols(df, input_cols, intermediate_col_dict, tcu_col, le_label, re_label):
    """Assert that ``df`` contains every column the configuration asks for.

    Expected names are built as ``"<input> <eye>"`` for each entry of
    ``input_cols`` and ``"<key>_<value> <eye>"`` for each key/value pair of
    ``intermediate_col_dict``, with ``<eye>`` being ``le_label`` and ``re_label``.

    Args:
        df: DataFrame whose columns are checked.
        input_cols: iterable of input column base names (without eye suffix).
        intermediate_col_dict: mapping of prefix -> iterable of suffixes.
        tcu_col: name of the single TCU column.
        le_label: suffix used for left-eye columns.
        re_label: suffix used for right-eye columns.

    Raises:
        AssertionError: if any expected column is missing; the message lists
            the missing names in sorted order.
    """
    columns = set(df.columns.tolist())
    eye_labels = (le_label, re_label)

    # Check all input cols are present
    expected_inputs = {col + " " + eye for col in input_cols for eye in eye_labels}
    missing_inputs = expected_inputs - columns
    assert not missing_inputs, f"Missing input columns: {sorted(missing_inputs)}"

    # Check all intermediate columns are present
    expected_intermediate = {
        key + "_" + col + " " + eye
        for key, val in intermediate_col_dict.items()
        for col in val
        for eye in eye_labels
    }
    missing_intermediate = expected_intermediate - columns
    assert not missing_intermediate, (
        f"Missing intermediate columns: {sorted(missing_intermediate)}"
    )

    # Check that TCU col is present
    assert tcu_col in columns, f"Missing TCU column: {tcu_col!r}"
    
# Stop here with an AssertionError if any configured column is missing from `df`.
check_cols(df, input_cols, intermediate_col_dict, tcu_col, le_label, re_label)

## Remove Redundant Columns

In [5]:
def keep_relevant_columns(df, input_cols, intermediate_col_dict, tcu_col, le_label, re_label):
    """Return ``df`` restricted to the configured input, intermediate and TCU columns.

    Column order in the result: left-eye inputs, right-eye inputs, left-eye
    intermediates, right-eye intermediates, then ``tcu_col``.

    Args:
        df: source DataFrame.
        input_cols: input column base names (eye suffix is appended here).
        intermediate_col_dict: mapping of prefix -> iterable of suffixes; each
            pair yields ``"<prefix>_<suffix> <eye>"``.
        tcu_col: name of the TCU column, kept last.
        le_label: suffix for left-eye columns.
        re_label: suffix for right-eye columns.

    Returns:
        DataFrame containing only the selected columns.

    Raises:
        KeyError: if any selected column is absent from ``df``.
    """
    eye_order = (le_label, re_label)
    selected = []

    # Plain inputs first, one full pass per eye.
    for eye in eye_order:
        for base in input_cols:
            selected.append(base + " " + eye)

    # Then the prefixed intermediate columns, again one full pass per eye.
    for eye in eye_order:
        for prefix, suffixes in intermediate_col_dict.items():
            for suffix in suffixes:
                selected.append(prefix + "_" + suffix + " " + eye)

    selected.append(tcu_col)
    return df[selected]
# Subset `df` to the configured columns and preview the result.
processed_df = keep_relevant_columns(df, input_cols, intermediate_col_dict, tcu_col, le_label, re_label)
display_df(processed_df)

Unnamed: 0,Vision Test LE,VA LE,IOP LE,Gradable LE,Cup Disc Ratio LE,OCT RNFL_Abnormal (Progressing) LE,OCT RNFL_Abnormal (Stable) LE,OCT RNFL_Normal LE,OCT RNFL_Unreliable LE,OCT RNFL_nan LE,...,DRC_MildNPDR RE,DRC_ModerateNPDR RE,DRC_SevereNPDR RE,DRC_ProliferativeDR RE,DRC_Unreadable RE,DRC_NA RE,GSC_G0 RE,GSC_G1 RE,GSC_NA RE,Total_Time
0,1.0,0.1,13.0,0.0,0.3,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,52.0
1,1.0,0.0,18.0,1.0,0.4,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,26.0
2,0.0,0.3,12.0,1.0,0.4,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,26.0
3,0.0,0.0,17.0,1.0,0.4,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,26.0
4,1.0,0.2,18.0,1.0,0.2,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,52.0


Shape: (868, 165)


In [6]:
# Fraction of missing values per column, highest first; show the top 11 columns.
(processed_df.isna().sum().sort_values(ascending=False)/len(processed_df)).head(11)

Gradable LE          0.017281
Gradable RE          0.017281
Total_Time           0.016129
Cup Disc Ratio LE    0.016129
Cup Disc Ratio RE    0.016129
VA RE                0.014977
VA LE                0.013825
IOP RE               0.013825
Vision Test LE       0.013825
IOP LE               0.013825
Vision Test RE       0.012673
dtype: float64

In [7]:
# Discard every row that has a missing value in any of the retained columns.
processed_df = processed_df.dropna()

In [8]:
# Persist the cleaned table. With to_csv defaults the DataFrame index is written
# as the first, unnamed CSV column.
processed_df.to_csv(fp_processed_data)