In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from matplotlib import pyplot as plt

#Lift the row limit on displayed Series/DataFrames so long outputs (e.g. per-column summaries) are not truncated.
pd.set_option('display.max_rows', None)

In [None]:
#Open csv file: the first CSV column becomes the index, and the file is read as latin1 in a single pass.

data = pd.read_csv(
    "./DMVO-mRS/DMVO-mRS.csv",
    index_col=0,
    encoding='latin1',
    low_memory=False,
)
data.head()

In [None]:
#See all columns as a plain Python list.
print(data.columns.tolist())

In [None]:
#Check data shape of the freshly loaded frame.

data.shape

In [None]:
#Define variables of interest (predictor variables, inclusion/exclusion criteria, outcomes of interest).
#Several names end with a space; they are kept exactly as written because data[variables]
#selects columns by exact label.

variables = [
    #Demographics and medical history.
    'Age',
    'Sex',
    'Race',
    'Smoking',
    'Alcohol',
    'Hypertension ',
    'Dyslipidemia',
    'Diabetes',
    'Coronary Artery Disease',
    'A-Fib',
    'History of Malignancy',
    'Antiplatelet or Anticoagulation',
    'Prior stroke/TIA ',
    'HIV',
    'HCV',
    'Chronic Kidney disease',
    'Sleep apnea',
    'PVD',
    'DVT/PE',
    #Admission vitals, labs and stroke scales.
    'Age related Admission SI',
    'Admission SI ',
    'Admission BMI',
    'Admission SBP',
    'Admission DBP',
    'Admission HR',
    'Admission RR',
    'Admission SpO2',
    'Admission Sodium',
    'Admission Potassium',
    'Admission Glucose ',
    'Admission Calcium',
    'Admission BUN',
    'Admission Creatinine',
    'Admission BUN:Creatinine Ratio',
    'Admission Hematocrit ',
    'Admission Hemoglobin',
    'Admission WBC Count',
    'Admission Platelet Count',
    'Admission Platelet:WBC Ratio',
    'Admission PT',
    'Admission INR',
    'Admission PTT',
    'Admission Troponin I',
    'Admission NIHSS',
    'Admission LAMS',
    #Occlusion, imaging, procedure and timing.
    'Occlusion Laterality',
    'Occlusion Site',
    'Occlusion Proximity',
    'Superior vs Inferior',
    'Occlusion Segment',
    'Baseline NCCT ASPECTS',
    'Hyperdense MCA',
    'Mechanical Thrombectomy',
    'Number of Passes',
    'mTICI Score',
    'Door to CT (minutes)',
    'Door to Needle (minutes)',
    'Door to Groin Puncture (minutes)',
    'Groin puncture to recanalization (minutes)',
    'Hemorrhagic Transformation',
    'Type of HT',
    #Discharge labs and vitals.
    'Discharge Sodium',
    'Discharge Potassium',
    'Discharge Glucose ',
    'Discharge Calcium',
    'Discharge BUN',
    'Discharge Creatinine',
    'Discharge BUN:Creatinine Ratio',
    'Discharge Hematocrit ',
    'Discharge Hemoglobin',
    'Discharge WBC Count (x 1000)',
    'Discharge Platelet Count (x 1000)',
    'Discharge Platelets:WBC Ratio',
    'Discharge PT',
    'Discharge INR',
    'Discharge PTT',
    'Discharge Troponin',
    'Discharge BMI',
    'Discharge SBP',
    'Discharge DBP',
    'Discharge HR',
    'Discharge RR',
    'Discharge SpO2',
    #Outcome.
    '90-day Modified Rankin Score',
]


In [None]:
#Keep only the columns named in `variables` (in that order) and check data shape.

data = data.loc[:, variables]

data.shape

In [None]:
#Check data for 'mTICI Score' (dropna=False so missing values are counted too).

data['mTICI Score'].value_counts(dropna=False)

In [None]:
#Cast 'mTICI Score' class as ordered categorical.
#NOTE(review): the astype() result is only displayed, never assigned back, so data['mTICI Score']
#keeps its original dtype. Assigning it would also require the later fillna('Unknown') cell to cope
#with a categorical column ('Unknown' is not one of the listed categories) -- confirm intent.

cat_type = CategoricalDtype(categories=['MT not attempted', '0','1','2a','2b','2c','3'], ordered=True)
data['mTICI Score'].astype(cat_type)

In [None]:
#Check data for 'Number of Passes' (dropna=False so missing values are counted too).

data['Number of Passes'].value_counts(dropna=False)

In [None]:
#Cast 'Number of Passes' as ordered categorical.
#NOTE(review): the astype() result is only displayed, never assigned back, so data['Number of Passes']
#keeps its original dtype. Assigning it would also require the later fillna('Unknown') cell to cope
#with a categorical column ('Unknown' is not one of the listed categories) -- confirm intent.

cat_type = CategoricalDtype(categories=['MT not attempted','0','1','2','3','4','5'], ordered=True)
data['Number of Passes'].astype(cat_type)

In [None]:
#Check data for '90-day Modified Rankin Score' (missing values are included in the counts).

data['90-day Modified Rankin Score'].value_counts(dropna=False)

In [None]:
#Drop patients with unknown '90-day Modified Rankin Score'.
#The explicit .copy() makes the filtered frame independent of the frame it was sliced from, so the
#later cells that add 'mRS Category' and modify columns do not trigger SettingWithCopyWarning.

data = data[data['90-day Modified Rankin Score'].notna()].copy()

In [None]:
#Show '90-day Modified Rankin Score' counts after dropping patients with unknown scores.

data['90-day Modified Rankin Score'].value_counts(dropna=False)

In [None]:
#Convert '90-day Modified Rankin Score' into categorical data in a column named 'mRS Category':
#scores <= 2 are labelled 'Low', scores > 2 are labelled 'High'.

mrs_score = data['90-day Modified Rankin Score']
for label, in_group in (('Low', mrs_score <= 2), ('High', mrs_score > 2)):
    data.loc[in_group, 'mRS Category'] = label

In [None]:
#Show 'mRS Category' counts (dropna=False would expose any rows left unlabelled).

data['mRS Category'].value_counts(dropna=False)

In [None]:
#Check data shape after filtering on the outcome and adding 'mRS Category'.

data.shape

In [None]:
#Save data at this stage (the raw '90-day Modified Rankin Score' column is still present here).

data.to_csv('./DMVO-mRS/dmvo_clean.csv')

In [None]:
#Drop unwanted columns: the raw score is removed now that 'mRS Category' has been derived from it.

drop = ['90-day Modified Rankin Score']
data = data.drop(columns=drop)

In [None]:
#Check data shape after dropping the raw score column.

data.shape

In [None]:
#See all columns with data types, as a one-column frame indexed by column name.

data_types = data.dtypes.to_frame()

print(data_types)

In [None]:
#See categorical and continuous variables, split by dtype ('number' vs 'object').

numeric_names = data.select_dtypes(include='number').columns.tolist()
object_names = data.select_dtypes(include='object').columns.tolist()

print(f'Numerical columns: {numeric_names}')
print()
print(f'Categorical columns: {object_names}')

In [None]:
#Define numerical and categorical columns.
#Names (including trailing spaces) are kept exactly as written because they are used as column labels.

num_cols = [
    'Age',
    #Admission measurements.
    'Age related Admission SI',
    'Admission SI ',
    'Admission BMI',
    'Admission SBP',
    'Admission DBP',
    'Admission HR',
    'Admission RR',
    'Admission SpO2',
    'Admission Sodium',
    'Admission Potassium',
    'Admission Glucose ',
    'Admission Calcium',
    'Admission BUN',
    'Admission Creatinine',
    'Admission BUN:Creatinine Ratio',
    'Admission Hematocrit ',
    'Admission Hemoglobin',
    'Admission WBC Count',
    'Admission Platelet Count',
    'Admission Platelet:WBC Ratio',
    'Admission PT',
    'Admission INR',
    'Admission PTT',
    'Admission Troponin I',
    'Admission NIHSS',
    'Admission LAMS',
    #Timing and imaging.
    'Door to CT (minutes)',
    'Door to Needle (minutes)',
    'Door to Groin Puncture (minutes)',
    'Groin puncture to recanalization (minutes)',
    'Baseline NCCT ASPECTS',
    #Discharge measurements.
    'Discharge BUN:Creatinine Ratio',
    'Discharge Sodium',
    'Discharge Potassium',
    'Discharge Glucose ',
    'Discharge Calcium',
    'Discharge BUN',
    'Discharge Creatinine',
    'Discharge Hematocrit ',
    'Discharge Hemoglobin',
    'Discharge WBC Count (x 1000)',
    'Discharge Platelet Count (x 1000)',
    'Discharge Platelets:WBC Ratio',
    'Discharge PT',
    'Discharge INR',
    'Discharge PTT',
    'Discharge Troponin',
    'Discharge BMI',
    'Discharge SBP',
    'Discharge DBP',
    'Discharge HR',
    'Discharge RR',
    'Discharge SpO2',
]

cat_cols = [
    #Demographics and medical history.
    'Sex',
    'Race',
    'Smoking',
    'Alcohol',
    'Hypertension ',
    'Dyslipidemia',
    'Diabetes',
    'Coronary Artery Disease',
    'A-Fib',
    'History of Malignancy',
    'Antiplatelet or Anticoagulation',
    'Prior stroke/TIA ',
    'HIV',
    'HCV',
    'Chronic Kidney disease',
    'Sleep apnea',
    'PVD',
    'DVT/PE',
    #Occlusion, imaging and procedure.
    'Occlusion Laterality',
    'Occlusion Site',
    'Occlusion Proximity',
    'Superior vs Inferior',
    'Occlusion Segment',
    'Hyperdense MCA',
    'Mechanical Thrombectomy',
    'Number of Passes',
    'mTICI Score',
    'Hemorrhagic Transformation',
    'Type of HT',
    #Outcome label.
    'mRS Category',
]

In [None]:
#Check missing values for numerical columns (percentage missing per column, highest first).

data[num_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

In [None]:
#Define missing values for numerical columns.
#Prints the columns with any missing values, then builds `missing_num`: the columns whose
#missing percentage exceeds the cut-off below.

NUM_MISSING_DROP_PCT = 51

num_missing_pct = (
    data[num_cols]
    .isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)

num_with_gaps = num_missing_pct[num_missing_pct > 0]
print(num_with_gaps.index)

missing_num = num_with_gaps[num_with_gaps > NUM_MISSING_DROP_PCT].index.tolist()
print(missing_num)

In [None]:
#Drop the numerical columns listed in `missing_num`.
#NOTE(review): `missing_num` is built with a > 51% missing cut-off, not the 25% this comment used to
#state (25% is what the categorical columns use) -- confirm which threshold is intended.

data = data.drop(columns=missing_num)

In [None]:
#Define new numerical columns: the original order, minus the columns that were just dropped.

dropped_num = set(missing_num)
num_cols = [name for name in num_cols if name not in dropped_num]
print(num_cols)

In [None]:
#Impute missing numerical values with a 5-nearest-neighbour imputer (uniform weights, NaN-aware Euclidean distance).
#NOTE(review): this runs before the scaling cells, so neighbour distances are computed on the raw
#column scales, and the imputer is fitted on the whole frame -- confirm both are intended.

num_imputer = KNNImputer(
    n_neighbors=5,
    weights='uniform',
    metric='nan_euclidean',
)
imputed_values = num_imputer.fit_transform(data[num_cols])
data[num_cols] = imputed_values

In [None]:
#Check missing values after imputation (each numerical column is expected to show 0% missing).

data[num_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

In [None]:
#Check missing values for categorical columns (percentage missing per column, highest first).

data[cat_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

In [None]:
#Define missing values for categorical columns.
#Prints the columns with any missing values, then builds `missing_cat`: the columns whose
#missing percentage exceeds the cut-off below.

CAT_MISSING_DROP_PCT = 25

cat_missing_pct = (
    data[cat_cols]
    .isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)

cat_with_gaps = cat_missing_pct[cat_missing_pct > 0]
print(cat_with_gaps.index)

missing_cat = cat_with_gaps[cat_with_gaps > CAT_MISSING_DROP_PCT].index.tolist()
print(missing_cat)

In [None]:
#Drop the categorical columns listed in `missing_cat` (those with more than 25% missing values).

data = data.drop(columns=missing_cat)

In [None]:
#Define new categorical columns: the original order, minus the columns that were just dropped.

dropped_cat = set(missing_cat)
cat_cols = [name for name in cat_cols if name not in dropped_cat]
print(cat_cols)

In [None]:
#Replace missing categorical values with 'Unknown'.
#The filled column is assigned back explicitly: calling fillna(..., inplace=True) on data[col]
#is chained assignment, which raises a FutureWarning on recent pandas and leaves `data`
#unchanged once Copy-on-Write is enabled.

for col in cat_cols:
    data[col] = data[col].fillna('Unknown')

In [None]:
#Check missing values after filling (each categorical column is expected to show 0% missing).

data[cat_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

In [None]:
#Save imputed data (written before the scaling and one-hot encoding cells run).

data.to_csv('./DMVO-mRS/dmvo_imputed.csv')

In [None]:
#RobustScale data: apply RobustScaler (default settings) to the numerical columns, fitted on the whole frame.

robust_scaler = RobustScaler()
data[num_cols] = robust_scaler.fit_transform(data[num_cols])

In [None]:
#Normalize data: apply MinMaxScaler (default settings) to the numerical columns, fitted on the whole frame.
#NOTE(review): min-max scaling gives the same result after any per-column shift-and-rescale, so the
#RobustScaler pass in the previous cell likely has no effect on the final values beyond
#floating-point rounding -- confirm whether both steps are intended.

minmax_scaler = MinMaxScaler()
data[num_cols] = minmax_scaler.fit_transform(data[num_cols])

In [None]:
#One hot encoding for categorical values; drop_first=True omits the first level of each column.
#NOTE(review): `cat_cols` was defined with 'mRS Category' in it, so the outcome label is presumably
#dummy-encoded here together with the predictors -- confirm this is what downstream code expects.

data_final = pd.get_dummies(
    data,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
#Save final data (scaled numerical columns plus one-hot encoded categorical columns).

data_final.to_csv('./DMVO-mRS/dmvo_final.csv')