In [None]:
 from google.colab import drive
# Mount Google Drive; every path read or written below lives under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from matplotlib import pyplot as plt

# Remove the row-display limit so the per-column missingness tables printed
# further down are shown in full instead of being truncated.
pd.set_option('display.max_rows', None)

In [None]:
# Load the source table from Drive, using Subject_ID as the row index.
data = pd.read_csv(
    "/content/drive/MyDrive/DMVO-mRS/DMVO_mRS.csv",
    index_col='Subject_ID',
)

# Last expression of the cell: display (rows, columns) of the loaded frame.
data.shape

In [None]:
# List every column name present in the freshly loaded table.
column_names = list(data.columns)
print(column_names)

In [None]:
# Restrict the table to the variables used downstream, in this exact column order.
selected_columns = [
    # Demographics, medications and comorbidities
    'Age', 'Sex', 'Race', 'Initial_Hospital', 'Antiplatelet_Use', 'Diuretic_Use',
    'Current_or_Former_Smoker', 'Current_Alcohol_Use', 'Hypertension', 'Dyslipidemia',
    'Diabetes', 'Heart_Disease', 'Atrial_Fibrillation', 'History_of_Malignancy',
    'Prior_Stroke_or_TIA', 'HIV', 'HCV', 'Chronic_Kidney_Disease', 'Sleep_Apnea',
    'PVD', 'DVT_or_PE', 'Obesity',
    # Admission vitals and laboratory values
    'Age_Related_Admission_SI', 'Admission_SI', 'Admission_BMI', 'Admission_SBP',
    'Admission_DBP', 'Admission_HR', 'Admission_RR', 'Admission_SpO2',
    'Admission_Sodium', 'Admission_Potassium', 'Admission_Chloride',
    'Admission_Carbon_Dioxide', 'Admission_Anion_Gap', 'Admission_Glucose',
    'Admission_BUN', 'Admission_Creatinine', 'Admission_BUN_to_Creatinine_Ratio',
    'Admission_Calcium', 'Admission_Phosphorous', 'Admission_Magnesium',
    'Admission_Total_Protein', 'Admission_Albumin', 'Admission_ALP', 'Admission_AST',
    'Admission_ALT', 'Admission_Total_Bilirubin', 'Admission_Hematocrit',
    'Admission_Hemoglobin', 'Admission_WBC_Count', 'Admission_Platelet_Count',
    'Admission_Platelet_WBC_Count_Ratio', 'Admission_INR', 'Admission_PT',
    'Admission_PTT',
    # Clinical scores and occlusion description
    'Admission_NIHSS', 'Admission_LAMS', 'Premorbid_mRS', 'Stroke_Etiology',
    'Occlusion_Laterality', 'Occlusion_Site', 'Baseline_NCCT_ASPECTS', 'Hyperdense_MCA',
    # Perfusion imaging
    'rCBF_20', 'rCBF_30', 'rCBF_34', 'rCBF_38',
    'Tmax_4s', 'Tmax_6s', 'Tmax_8s', 'Tmax_10s',
    'Calculated_Mismatch', 'Calculated_Mismatch_Ratio', 'Hypoperfusion_Intensity_Ratio',
    'CBV_34', 'CBV_38', 'CBV_42', 'CBV_Index',
    # Collateral / clot scores and infarct volumes
    'DSA_Collaterals', 'Single_Phase_CTA_Collateral_Score',
    'Dynamic_CTP_mCTA_Collateral_Score', 'COVES_Score', 'Clot_Burden_Score',
    'Final_Infarct_Volume_(DWI)', 'Final_Infarct_Volume_(FLAIR)',
    # Treatment
    'IV_TPA', 'Mechanical_Thrombectomy', 'Type_of_Thrombectomy', 'Number_of_Passes',
    'Type_of_Anesthesia',
    # Time intervals
    'Last_Known_Well_to_Door', 'Symptom_Onset_to_Door', 'Door_to_CT',
    'Last_Known_Well_to_CT', 'Door_to_Needle_Time', 'Door_to_Groin_Puncture',
    'Groin_Puncture_to_First_Pass_Time', 'Door_to_Recanalization',
    'First_Pass_to_Recanalization', 'Groin_Puncture_to_Recanalization',
    # Procedure result and follow-up
    'mTICI', 'Hemorrhagic_Transformation', 'mRS_at_90_days',
]
data = data[selected_columns]

In [None]:
# Persist the column-restricted table before any imputation is applied.
unimputed_path = '/content/drive/MyDrive/DMVO-mRS/unimputed_data.csv'
data.to_csv(unimputed_path)

In [None]:
# List the column names that remain after the selection step.
remaining_columns = list(data.columns)
print(remaining_columns)

In [None]:
# Partition the selected variables into numerical and categorical groups.
# Later cells use these two lists to compute missingness and to impute.

num_cols = [
    'Age',
    # Admission vitals and laboratory values
    'Age_Related_Admission_SI', 'Admission_SI', 'Admission_BMI', 'Admission_SBP',
    'Admission_DBP', 'Admission_HR', 'Admission_RR', 'Admission_SpO2',
    'Admission_Sodium', 'Admission_Potassium', 'Admission_Chloride',
    'Admission_Carbon_Dioxide', 'Admission_Anion_Gap', 'Admission_Glucose',
    'Admission_BUN', 'Admission_Creatinine', 'Admission_BUN_to_Creatinine_Ratio',
    'Admission_Calcium', 'Admission_Phosphorous', 'Admission_Magnesium',
    'Admission_Total_Protein', 'Admission_Albumin', 'Admission_ALP', 'Admission_AST',
    'Admission_ALT', 'Admission_Total_Bilirubin', 'Admission_Hematocrit',
    'Admission_Hemoglobin', 'Admission_WBC_Count', 'Admission_Platelet_Count',
    'Admission_Platelet_WBC_Count_Ratio', 'Admission_INR', 'Admission_PT',
    'Admission_PTT',
    # Perfusion imaging
    'rCBF_20', 'rCBF_30', 'rCBF_34', 'rCBF_38',
    'Tmax_4s', 'Tmax_6s', 'Tmax_8s', 'Tmax_10s',
    'Calculated_Mismatch', 'Calculated_Mismatch_Ratio', 'Hypoperfusion_Intensity_Ratio',
    'CBV_34', 'CBV_38', 'CBV_42', 'CBV_Index',
    # Infarct volumes
    'Final_Infarct_Volume_(DWI)', 'Final_Infarct_Volume_(FLAIR)',
    # Time intervals
    'Last_Known_Well_to_Door', 'Symptom_Onset_to_Door', 'Door_to_CT',
    'Last_Known_Well_to_CT', 'Door_to_Needle_Time', 'Door_to_Groin_Puncture',
    'Groin_Puncture_to_First_Pass_Time', 'Door_to_Recanalization',
    'First_Pass_to_Recanalization', 'Groin_Puncture_to_Recanalization',
]
print('Numerical columns: {}'.format(num_cols), '\n')

cat_cols = [
    # Demographics, medications and comorbidities
    'Sex', 'Race', 'Initial_Hospital', 'Antiplatelet_Use', 'Diuretic_Use',
    'Current_or_Former_Smoker', 'Current_Alcohol_Use', 'Hypertension', 'Dyslipidemia',
    'Diabetes', 'Heart_Disease', 'Atrial_Fibrillation', 'History_of_Malignancy',
    'Prior_Stroke_or_TIA', 'HIV', 'HCV', 'Chronic_Kidney_Disease', 'Sleep_Apnea',
    'PVD', 'DVT_or_PE', 'Obesity',
    # Clinical scores and occlusion description
    'Admission_NIHSS', 'Admission_LAMS', 'Premorbid_mRS', 'Stroke_Etiology',
    'Occlusion_Laterality', 'Occlusion_Site', 'mTICI',
    # Collateral, clot and imaging scores
    'DSA_Collaterals', 'Single_Phase_CTA_Collateral_Score',
    'Dynamic_CTP_mCTA_Collateral_Score', 'COVES_Score', 'Clot_Burden_Score',
    'Baseline_NCCT_ASPECTS', 'Hyperdense_MCA', 'Hemorrhagic_Transformation',
    # Treatment
    'Type_of_Thrombectomy', 'Number_of_Passes', 'Type_of_Anesthesia', 'IV_TPA',
    'Mechanical_Thrombectomy',
    # Follow-up
    'mRS_at_90_days',
]
print('Categorical columns: {}'.format(cat_cols))

In [None]:
# Remove outcome variables from the working column lists.
# NOTE(review): both removal lists are empty, so this cell currently removes
# nothing and 'mRS_at_90_days' stays in cat_cols -- confirm that is intended.

num_cols_remove = []
cat_cols_remove = []

num_cols = [name for name in num_cols if not (name in num_cols_remove)]
cat_cols = [name for name in cat_cols if not (name in cat_cols_remove)]

In [None]:
# Report missingness for the numerical columns and collect the ones to exclude.

# Percentage of missing values per numerical column, highest first.
num_missing_pct = (
    data[num_cols]
    .isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)
print(num_missing_pct, '\n')

# Columns that have at least one missing value.
num_cols_incomplete = list(num_missing_pct[num_missing_pct > 0].index)
print('Numerical variables with missing values: ', num_cols_incomplete, '\n')
print('Number of numerical variables with missing values: ', len(num_cols_incomplete), '\n')

# Columns with more than 30% missing values; the following cells read this
# list from `missing_num` to drop the columns and to update num_cols.
missing_num = list(num_missing_pct[num_missing_pct > 30].index)
print('Excluded numerical variables: ', missing_num)

In [None]:
# Drop numerical columns with more than 30% missing values.
# Rebinding `data` (instead of drop(..., inplace=True)) together with
# errors='ignore' keeps this cell re-runnable: a second execution, when the
# columns are already gone, is a no-op rather than a KeyError.

data = data.drop(columns=missing_num, errors='ignore')

In [None]:
# Keep only the numerical columns that were not excluded for high missingness.

num_cols = [name for name in num_cols if not (name in missing_num)]

In [None]:
# Fill missing numerical values with the unweighted mean of the 5 nearest rows
# (NaN-aware Euclidean distance computed over the numerical columns).
# NOTE(review): the columns are imputed on their raw scales, so wide-range
# variables weigh more in the distance; MinMaxScaler is imported in this
# notebook but not applied here -- confirm whether scaling was intended.

num_imputer = KNNImputer(
    metric='nan_euclidean',
    weights='uniform',
    n_neighbors=5,
)
imputed_num_values = num_imputer.fit_transform(data[num_cols])
data[num_cols] = imputed_num_values

In [None]:
# Report missingness for the categorical columns and collect the ones to exclude.

# Percentage of missing values per categorical column, highest first.
cat_missing_pct = (
    data[cat_cols]
    .isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)
print(cat_missing_pct, '\n')

# Columns that have at least one missing value.
cat_cols_incomplete = list(cat_missing_pct[cat_missing_pct > 0].index)
print('Categorical variables with missing values: ', cat_cols_incomplete, '\n')
print('Number of categorical variables with missing values: ', len(cat_cols_incomplete), '\n')

# Columns with more than 30% missing values; the following cells read this
# list from `missing_cat` to drop the columns and to update cat_cols.
missing_cat = list(cat_missing_pct[cat_missing_pct > 30].index)
print('Excluded categorical variables: ', missing_cat)

In [None]:
# Drop categorical columns with more than 30% missing values.
# Rebinding `data` (instead of drop(..., inplace=True)) together with
# errors='ignore' keeps this cell re-runnable: a second execution, when the
# columns are already gone, is a no-op rather than a KeyError.

data = data.drop(columns=missing_cat, errors='ignore')

In [None]:
# Keep only the categorical columns that were not excluded for high missingness.

cat_cols = [name for name in cat_cols if not (name in missing_cat)]

In [None]:
# Replace missing categorical values with the mode of each column.
# The filled Series is assigned back to the frame explicitly. Calling
# data[col].fillna(..., inplace=True) operates on an intermediate Series:
# it emits a chained-assignment warning in pandas 2.x and, with Copy-on-Write
# enabled, never updates `data` at all.

for col in cat_cols:
    # mode() returns the most frequent value(s) sorted; take the first one.
    mode_value = data[col].mode()[0]
    data[col] = data[col].fillna(mode_value)

In [None]:
# Persist the table after numerical and categorical imputation.
imputed_path = '/content/drive/MyDrive/DMVO-mRS/imputed_data.csv'
data.to_csv(imputed_path)

In [None]:
# Manual label encoding.
# Every categorical text label is replaced by a fixed integer code. The codes
# are kept in one table so each mapping is stated exactly once, and columns
# that are no longer in `data` (for example because they were dropped for
# exceeding the missingness threshold) are skipped instead of raising KeyError.

YES_NO_CODES = {'No': 0, 'Yes': 1}

LABEL_CODES = {
    'Sex': {'Male': 0, 'Female': 1},
    'Race': {'White': 0, 'Black': 1, 'Asian': 2, 'Other': 3},
    'Initial_Hospital': {'Johns Hopkins': 0, 'Other': 1},
    'Antiplatelet_Use': YES_NO_CODES,
    'Diuretic_Use': YES_NO_CODES,
    'Current_or_Former_Smoker': YES_NO_CODES,
    'Current_Alcohol_Use': YES_NO_CODES,
    'Hypertension': YES_NO_CODES,
    'Dyslipidemia': YES_NO_CODES,
    'Diabetes': YES_NO_CODES,
    'Heart_Disease': YES_NO_CODES,
    'Atrial_Fibrillation': YES_NO_CODES,
    'History_of_Malignancy': YES_NO_CODES,
    'Prior_Stroke_or_TIA': YES_NO_CODES,
    'HIV': YES_NO_CODES,
    'HCV': YES_NO_CODES,
    'Chronic_Kidney_Disease': YES_NO_CODES,
    'Sleep_Apnea': YES_NO_CODES,
    'PVD': YES_NO_CODES,
    'DVT_or_PE': YES_NO_CODES,
    'Obesity': YES_NO_CODES,
    'Stroke_Etiology': {
        'Large artery atherosclerosis': 0,
        'Cardioembolism': 1,
        'Small-vessel occlusion': 2,
        'Stroke of other determined etiology': 3,
        'Stroke of undetermined etiology': 4,
    },
    'Occlusion_Laterality': {'Left': 0, 'Right': 1},
    'Occlusion_Site': {'ACA': 0, 'MCA': 1, 'PCA': 2},
    'Hyperdense_MCA': YES_NO_CODES,
    'IV_TPA': YES_NO_CODES,
    'Mechanical_Thrombectomy': {'Not attempted': 0, 'Attempted': 1},
    'Type_of_Thrombectomy': {
        'MT not attempted': 0,
        'Direct aspiration': 1,
        'Stent retriever': 2,
        'Combined': 3,
    },
    'Number_of_Passes': {'MT not attempted': 0, '1': 1, '2': 2, '3': 3, '>3': 4},
    'Type_of_Anesthesia': {'MT not attempted': 0, 'General': 1, 'MAC': 2},
    'mTICI': {
        'MT not attempted': 0,
        '0': 1,
        '1': 2,
        '2a': 3,
        '2b': 4,
        '2c': 5,
        '3': 6,
    },
    'Hemorrhagic_Transformation': YES_NO_CODES,
}

skipped_columns = []
for column, codes in LABEL_CODES.items():
    if column not in data.columns:
        skipped_columns.append(column)
        continue
    # Labels are matched as strings while codes are written as integers, so a
    # code written for one label (e.g. mTICI '0' -> 1) is never re-matched by
    # a later label (e.g. mTICI '1' -> 2). Values without a code are left as is.
    for label, code in codes.items():
        data.loc[data[column] == label, column] = code

if skipped_columns:
    print('Columns not present, label encoding skipped: ', skipped_columns)

In [None]:
# Change variable names to field names.
# The dictionary header previously had to be addressed as 'ï»¿Variable': that
# prefix is a UTF-8 byte-order mark decoded as latin1. Reading with
# 'utf-8-sig' consumes the mark so the column is simply 'Variable'. If the
# file is not valid UTF-8, fall back to the original latin1 read and strip
# the mis-decoded mark from the header.

dictionary_path = "/content/drive/MyDrive/DMVO-mRS/data_dictionary.csv"
try:
    data_dictionary = pd.read_csv(dictionary_path, encoding = 'utf-8-sig', index_col = None, low_memory = False)
except UnicodeDecodeError:
    data_dictionary = pd.read_csv(dictionary_path, encoding = 'latin1', index_col = None, low_memory = False)
    data_dictionary.columns = data_dictionary.columns.str.replace('\xef\xbb\xbf', '', regex = False)

FieldNames = dict(zip(data_dictionary['Variable'], data_dictionary['Field Name']))

# Report columns the dictionary does not cover; rename() leaves them unchanged,
# whereas columns.map(FieldNames) would have turned their names into NaN.
unmapped_columns = [name for name in data.columns if name not in FieldNames]
if unmapped_columns:
    print('Columns without a field name (left unchanged): ', unmapped_columns)

data = data.rename(columns = FieldNames)

In [None]:
# Persist the fully processed table (imputed, label-encoded, renamed).
final_path = '/content/drive/MyDrive/DMVO-mRS/final_data.csv'
data.to_csv(final_path)