In [55]:
# IMPORT PACKAGES
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler

In [56]:
# PREPROCESSING FUNCTIONS


# Helper Functions
def split_external_data(df):
    """
    Partition the full dataset by the 'Center' column.

    Returns a 2-tuple (df_uml, df_umg):
    df_uml = rows whose Center is 'Leipzig' (training data)
    df_umg = rows whose Center is 'Greifswald' (external validation set)
    note: the external validation set is not the same thing as the test data
    that comes out of a train/test split

    Raises KeyError (from get_group) if either center is absent from df.
    """
    by_center = df.groupby('Center')
    # Leipzig is looked up first, then Greifswald, matching the return order.
    df_uml, df_umg = (by_center.get_group(site) for site in ('Leipzig', 'Greifswald'))
    return df_uml, df_umg

def undersample_data(df, random_state=42):
    """
    Balance the two 'Diagnosis' classes by random undersampling.

    Rows with Diagnosis == 0 (controls) and Diagnosis == 1 are separated, and
    the larger class is randomly sampled down to the size of the smaller one.
    Rows with any other Diagnosis value are not carried into the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Diagnosis' column with both 0 and 1 present.
    random_state : int, default 42
        Seed passed to DataFrame.sample so the selection is reproducible.

    Returns
    -------
    pandas.DataFrame
        Sampled controls followed by the Diagnosis == 1 rows, with a fresh
        0..n-1 index.

    Raises
    ------
    KeyError
        From get_group when one of the two classes is missing.
    """
    # Double-quoted f-strings: reusing the same quote inside the replacement
    # field is a SyntaxError on Python versions before 3.12.
    print(f"Count of each target variable before undersampling: {df['Diagnosis'].value_counts()}")

    groups = df.groupby(by='Diagnosis')
    sep = groups.get_group(1)
    ctrl = groups.get_group(0)

    # Sample down to the smaller class. Sampling the controls to the size of
    # the other class unconditionally raises when controls are the minority.
    n_per_class = min(len(sep), len(ctrl))
    ctrl = ctrl.sample(n=n_per_class, random_state=random_state)
    if len(sep) > n_per_class:
        sep = sep.sample(n=n_per_class, random_state=random_state)

    # concat already returns a new frame, so no extra copy is needed.
    df = pd.concat([ctrl, sep], axis=0, ignore_index=True)
    print(f"Count of each target variable after undersampling: {df['Diagnosis'].value_counts()}")

    return df

def rescale_dataset(df, scaler=None):
    """
    Scale every feature column and re-attach the untouched 'Diagnosis' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a 'Diagnosis' target column.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, it is applied with ``transform``
        only, so a scaler fitted on the training data can be reused on another
        dataset. When omitted, a new MinMaxScaler is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Scaled features followed by 'Diagnosis', with a fresh 0..n-1 index.
        The input frame is not modified.
    """
    X_df = df.drop(columns=['Diagnosis'])
    # The scaled frame gets a fresh RangeIndex, so the target must be
    # re-indexed the same way or the column assignment below would misalign.
    y = df['Diagnosis'].reset_index(drop=True)

    if scaler is None:
        # NOTE(review): fitting a new scaler per dataset means each dataset is
        # scaled by its own min/max - confirm this is intended for the
        # external validation set.
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X_df)
    else:
        X = scaler.transform(X_df)

    X_df = pd.DataFrame(X, columns=X_df.columns)
    X_df['Diagnosis'] = y

    # After scaling
    print(f'Length of X_df after scaling: {len(X_df)}')

    # Double-quoted f-string: reusing the same quote inside the replacement
    # field is a SyntaxError on Python versions before 3.12.
    print(f"Count of each target variable after rescaling: {X_df['Diagnosis'].value_counts()}")

    return X_df

def remove_outliers(df, columns=None, iqr_multiplier=1.5):
    """
    Drop rows that have an outlier in any checked feature (IQR fence rule).

    A value is an outlier when it lies outside
    [Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR] of its column.
    A row is kept only if all checked columns are inside their fences; rows
    with a missing value in a checked column are dropped as well, because
    comparisons against NaN are False.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a 'Diagnosis' target column (never checked).
    columns : list of str, optional
        Feature columns to check. Defaults to every column except 'Diagnosis'.
        A column whose IQR is zero (for example an imbalanced 0/1 column)
        gets fences equal to its median, so every other value is removed;
        pass ``columns`` to leave such columns out of the check.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of IQR.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``df`` with their original index labels.
    """
    # remove target
    X_df = df.drop(columns=['Diagnosis'])
    if columns is not None:
        X_df = X_df[list(columns)]

    # get iqr
    q1 = X_df.quantile(0.25)
    q3 = X_df.quantile(0.75)
    iqr = q3 - q1

    # get bounds for dataset
    low_bound = q1 - iqr_multiplier * iqr
    upp_bound = q3 + iqr_multiplier * iqr

    # One boolean per row: True when every checked column is inside its fences.
    within_bounds = ((X_df >= low_bound) & (X_df <= upp_bound)).all(axis=1)

    # Select positionally with the mask. Going through index labels instead
    # would re-select outlier rows whenever index labels are repeated.
    df = df.loc[within_bounds.to_numpy()]
    # Double-quoted f-string: reusing the same quote inside the replacement
    # field is a SyntaxError on Python versions before 3.12.
    print(f"Count of each target variable after outlier removal: {df['Diagnosis'].value_counts()}")

    return df



def clean(df):
    """
    Encode, filter and de-duplicate one center's raw data.

    Steps, in order:
    1. Drop rows with a missing 'Sex' or 'Diagnosis'.
    2. Replace 'Sex' and 'Diagnosis' with integer codes from pd.factorize.
    3. Blank out Diagnosis code 2 so those rows are removed in step 5.
    4. Drop the identifier / bookkeeping columns listed below.
    5. Drop duplicate rows, then rows with any remaining missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing at least 'Sex', 'Diagnosis' and the columns that
        are dropped in step 4. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, keeping their original index labels.

    Raises
    ------
    KeyError
        If 'Sex', 'Diagnosis' or one of the dropped columns is missing.
    """
    # pd.factorize encodes missing values as -1, which dropna() further down
    # would not recognise as missing. Remove those rows before encoding.
    df = df.dropna(subset=['Sex', 'Diagnosis']).copy()

    # numeric encoding
    # NOTE(review): factorize numbers labels in order of first appearance, so
    # the codes depend on row order and are computed separately for every
    # frame passed in - verify the mapping is the same for each center.
    df['Sex'] = pd.factorize(df['Sex'])[0]
    df['Diagnosis'] = pd.factorize(df['Diagnosis'])[0]

    # Remove SIRS diagnosis
    # NOTE(review): this assumes code 2 is SIRS, i.e. SIRS is the third
    # distinct label to appear - confirm against the raw labels.
    # The masked rows become NaN here and are removed by dropna() below.
    df['Diagnosis'] = df['Diagnosis'].where(df['Diagnosis'] != 2)
    # Double-quoted f-strings: reusing the same quote inside the replacement
    # field is a SyntaxError on Python versions before 3.12.
    print(f"Count of each target variable before preprop: {df['Diagnosis'].value_counts()}")

    # Drop extraneous columns
    df = df.drop(columns=['Id','Center', 'Set', 'Sender', 'Episode', 'Time','TargetIcu', 'SecToIcu','PCT', 'CRP'])

    # remove duplicates and NA
    df = df.drop_duplicates().dropna()
    print(f"Count of each target variable after dropping na and duplicates: {df['Diagnosis'].value_counts()}")

    return df


In [57]:
# Main Function
def preprocess(df, save_csv = False):
    """
    Run the full preprocessing pipeline on each center's data.

    The raw frame is split by center; each part is then cleaned, rescaled and
    undersampled, in that order (outlier removal is not part of the pipeline).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset containing both centers.
    save_csv : bool, default False
        When True, write the two results to
        'sbcdata_preprocessed_training.csv' and
        'sbcdata_preprocessed_external_val.csv' in the working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        (training set, external validation set), in that order.
    """
    def _run_pipeline(center_df):
        # clean -> rescale -> undersample, applied to a single center's rows.
        # NOTE(review): rescaling is done independently per center, so the
        # external set is scaled by its own statistics - confirm intended.
        cleaned = clean(center_df)
        scaled = rescale_dataset(cleaned)
        return undersample_data(scaled)

    center_frames = split_external_data(df.copy())
    preprops = tuple(_run_pipeline(center_df) for center_df in center_frames)
    df_uml, df_umg = preprops[0], preprops[1]

    # Save preprocessed data if desired
    if save_csv:
        df_uml.to_csv('sbcdata_preprocessed_training.csv', index=False)
        df_umg.to_csv('sbcdata_preprocessed_external_val.csv', index=False)

    return preprops





In [58]:
# PREPROCESS DATA
# Absolute, machine-specific location of the raw CSV; point this at your own
# copy of sbcdata.csv before running the cell.
file_path = r'G:\My Drive\School\Current Classes\SSIE 548 - Healtchare Data Science\Project\Data\sbcdata.csv'
df = pd.read_csv(file_path)

# Set save_csv=True to also write the two preprocessed frames to CSV files.
# preprocess returns (training set, external validation set), in that order.
df_uml, df_umg = preprocess(df, save_csv=False)



Count of each target variable before preprop: Diagnosis
0.0    1776380
1.0     251395
Name: count, dtype: int64
Count of each target variable after dropping na and duplicates: Diagnosis
0.0    1673822
1.0     243123
Name: count, dtype: int64
Length of X_df after scaling: 1916945
Count of each target variable after rescaling: Diagnosis
0.0    1673822
1.0     243123
Name: count, dtype: int64
Count of each target variable before undersampling: Diagnosis
0.0    1673822
1.0     243123
Name: count, dtype: int64
Count of each target variable after undersampling: Diagnosis
0.0    243123
1.0    243123
Name: count, dtype: int64
Count of each target variable before preprop: Diagnosis
0.0    526943
1.0    128578
Name: count, dtype: int64
Count of each target variable after dropping na and duplicates: Diagnosis
0.0    513133
1.0    122511
Name: count, dtype: int64
Length of X_df after scaling: 635644
Count of each target variable after rescaling: Diagnosis
0.0    513133
1.0    122511
Name: count, d

In [59]:
# confirm preprocessing worked

def _show_preprocessing_check(heading, frame):
    """Print a heading, then display the class counts and the frame itself."""
    print(heading)
    display(frame.Diagnosis.value_counts())  # both classes should have the same count if the data is balanced
    display(frame)


_show_preprocessing_check('training set preprocessing confirmation', df_uml)
_show_preprocessing_check('validation set preprocessing confirmation', df_umg)

training set preprocessing confirmation


Diagnosis
0.0    243123
1.0    243123
Name: count, dtype: int64

Unnamed: 0,Age,Sex,HGB,MCV,PLT,RBC,WBC,Diagnosis
0,0.613636,0.0,0.625000,0.398945,0.079317,0.528282,0.007285,0.0
1,0.511364,0.0,0.687500,0.450660,0.059962,0.512273,0.005643,0.0
2,0.545455,1.0,0.680556,0.421108,0.068311,0.548559,0.008208,0.0
3,0.181818,0.0,0.263889,0.395251,0.061480,0.223052,0.008824,0.0
4,0.352273,0.0,0.659722,0.407388,0.079317,0.543223,0.005336,0.0
...,...,...,...,...,...,...,...,...
486241,0.431818,0.0,0.409722,0.450132,0.050474,0.299893,0.010261,1.0
486242,0.431818,0.0,0.319444,0.462797,0.025427,0.230523,0.004002,1.0
486243,0.431818,0.0,0.416667,0.454354,0.041746,0.307364,0.008414,1.0
486244,0.431818,0.0,0.381944,0.462269,0.039848,0.280683,0.006875,1.0


validation set preprocessing confirmation


Diagnosis
0.0    122511
1.0    122511
Name: count, dtype: int64

Unnamed: 0,Age,Sex,HGB,MCV,PLT,RBC,WBC,Diagnosis
0,0.790698,1.0,0.475524,0.580777,0.384019,0.409639,0.022263,0.0
1,0.488372,0.0,0.300699,0.420245,0.041145,0.301205,0.020779,0.0
2,0.430233,0.0,0.559441,0.400818,0.070662,0.518072,0.012152,0.0
3,0.569767,0.0,0.244755,0.494888,0.064997,0.192771,0.005770,0.0
4,0.511628,1.0,0.279720,0.239264,0.150566,0.349398,0.019109,0.0
...,...,...,...,...,...,...,...,...
245017,0.430233,1.0,0.293706,0.464213,0.019082,0.253012,0.125046,1.0
245018,0.430233,1.0,0.307692,0.481595,0.025343,0.265060,0.151948,1.0
245019,0.430233,1.0,0.300699,0.470348,0.034884,0.277108,0.141929,1.0
245020,0.430233,1.0,0.272727,0.543967,0.031306,0.240964,0.117254,1.0
