# TITLE PENDING
### Bachelor's thesis by Lukas Schießer

In [27]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats

### Preprocessing

In [28]:
# Show every column when a wide frame is displayed.
pd.options.display.max_columns = None
# Directory that holds the study workbook. The original location stays the default,
# but it can be redirected with the BLOOD_EXAMS_DIR environment variable so the
# notebook also runs on machines where that absolute path does not exist.
data_dir = os.environ.get("BLOOD_EXAMS_DIR", "C:/code/Uni/ba/blood-exams")
# Keep the working-directory change: later relative paths may depend on it.
os.chdir(data_dir)
data = pd.read_excel('covid_study_v2.xlsx')

In [29]:
# One-hot encode GENDER into two boolean columns placed at the front of the frame
# (resulting order: female, male, ...). The guard makes the cell safe to re-run:
# without it a second execution fails because GENDER is gone and the indicator
# columns already exist.
if "GENDER" in data.columns:
    data.insert(0, 'male', data["GENDER"] == "M")
    data.insert(0, 'female', data["GENDER"] == "F")
    data = data.drop(columns="GENDER")
# Non-numeric Lymphocytes entries become NaN instead of raising.
data['Lymphocytes'] = pd.to_numeric(data['Lymphocytes'], errors='coerce')
# Blank out ages below 18 (set to missing). `mask` replaces the flagged entries
# with NaN and upcasts the column if necessary; it also avoids the `np.NaN`
# alias, which no longer exists in NumPy 2.0.
data["AGE"] = data["AGE"].mask(data["AGE"] < 18)
data.head()

Unnamed: 0,female,male,AGE,WBC,Platelets,Neutrophils,Lymphocytes,Monocytes,Eosinophils,Basophils,CRP,AST,ALT,ALP,GGT,LDH,SWAB
0,False,True,56.0,2.9,128.0,1.9,0.8,0.2,0.0,0.0,29.0,36.0,18.0,43.0,21.0,257.0,1
1,False,True,56.0,3.5,151.0,2.1,0.9,0.4,0.0,0.0,16.5,25.0,14.0,50.0,17.0,207.0,1
2,False,True,72.0,4.6,206.0,,,,,,193.7,31.0,22.0,,,,1
3,False,True,72.0,16.5,316.0,14.0,1.2,0.3,0.0,0.0,318.7,96.0,33.0,80.0,42.0,651.0,1
4,False,True,77.0,4.9,198.0,,,,,,,,,,,,1


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns;
# the differing counts show how many values are missing per column.
data.describe()

Unnamed: 0,AGE,WBC,Platelets,Neutrophils,Lymphocytes,Monocytes,Eosinophils,Basophils,CRP,AST,ALT,ALP,GGT,LDH,SWAB
count,277.0,277.0,277.0,209.0,208.0,209.0,209.0,208.0,273.0,277.0,266.0,131.0,136.0,194.0,279.0
mean,61.776173,8.553069,226.53213,6.200478,1.186538,0.605742,0.055024,0.014423,90.889011,54.202166,44.917293,89.89313,82.477941,380.448454,0.634409
std,17.815695,4.855353,101.174178,4.172581,0.806442,0.410049,0.132237,0.039116,94.421406,57.612797,45.503232,89.089864,132.702506,193.98352,0.482461
min,18.0,1.1,20.0,0.5,0.2,0.0,0.0,0.0,0.1,11.0,9.0,34.0,10.0,98.0,0.0
25%,49.0,5.1,163.0,3.5,0.7,0.4,0.0,0.0,21.4,27.0,21.0,57.5,23.75,243.25,0.0
50%,64.0,7.1,205.0,5.1,1.0,0.5,0.0,0.0,54.2,36.0,31.0,71.0,41.0,328.0,1.0
75%,76.0,10.7,271.0,7.5,1.4,0.7,0.1,0.0,129.1,60.0,46.0,90.0,83.0,454.5,1.0
max,98.0,29.2,620.0,26.4,7.2,3.2,1.3,0.3,478.0,550.0,335.0,838.0,839.0,1195.0,1.0


In [40]:
alpha = 0.05
# Shapiro-Wilk normality test, one column at a time. The positional slice skips
# the first two columns and the last one, so only the columns in between are tested.
# Missing values are dropped per column before testing.
for col, values in data.iloc[:, 2:-1].items():
    stat, p = stats.shapiro(values.dropna())
    # Fail to reject normality only when p exceeds the significance level.
    if p > alpha:
        verdict = 'Probably Normal'
    else:
        verdict = 'Probably not Normal'
    print('{}: stat={:0.3f}, p={:0.3f}; {}'.format(col, stat, p, verdict))

AGE: stat=0.976, p=0.000; Probably not Normal
WBC: stat=0.873, p=0.000; Probably not Normal
Platelets: stat=0.930, p=0.000; Probably not Normal
Neutrophils: stat=0.838, p=0.000; Probably not Normal
Lymphocytes: stat=0.785, p=0.000; Probably not Normal
Monocytes: stat=0.811, p=0.000; Probably not Normal
Eosinophils: stat=0.457, p=0.000; Probably not Normal
Basophils: stat=0.395, p=0.000; Probably not Normal
CRP: stat=0.836, p=0.000; Probably not Normal
AST: stat=0.556, p=0.000; Probably not Normal
ALT: stat=0.629, p=0.000; Probably not Normal
ALP: stat=0.420, p=0.000; Probably not Normal
GGT: stat=0.500, p=0.000; Probably not Normal
LDH: stat=0.877, p=0.000; Probably not Normal


In [41]:
# Runs a single Shapiro-Wilk test on the complete cases (rows without any NaN) of
# the whole frame and prints the verdict against `alpha`.
# NOTE(review): Shapiro-Wilk is a univariate test. As far as the SciPy docs say
# (TODO confirm for the installed version), a 2-D input is flattened into one
# sample, so every column of the frame would be pooled into a single distribution.
# If a joint/multivariate normality check was intended, this call does not
# provide it - confirm the intent before relying on this result.
stat, p = stats.shapiro(data.dropna())
print('stat={:0.3f}, p={:0.3f}; {}'.format(stat, p, 'Probably Normal' if p > alpha else 'Probably not Normal'))

stat=0.570, p=0.000; Probably not Normal


### Model preparation

In [None]:
# import all necessary tools
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.ensemble import RandomForestClassifier
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Features are every column except the last one; the target is the SWAB column.
# Casting to float turns the boolean gender indicators into 0.0/1.0 so the imputer
# and the forest receive one homogeneous numeric matrix.
X, y = data[data.columns[:-1]].astype(float), data["SWAB"]
# prepare 5-fold nested cross validation by defining outer loop of nested cv
cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
# inner loop of the nested cross validation (hyperparameter selection); it does not
# depend on the outer fold, so it is defined once instead of once per iteration
cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)
# question 1 (where to train the imputer): the imputer is a pipeline step, so it is
# fitted only on the training part of each split - inner and outer - and merely
# applied to the held-out part. Fitting it on the whole dataset beforehand would
# leak information from the test folds into training.
model = RandomForestClassifier(random_state=1)
pipeline = Pipeline([
    ('imputer', IterativeImputer(random_state=1)),
    ('model', model),
])
# define the search space over which GridSearch should look for the best
# hyperparameters; the 'model__' prefix routes each entry to the forest step
space = {}
space['model__n_estimators'] = [10, 100, 500]
space['model__max_features'] = [2, 4, 6]
# accuracy on each outer test fold
outer_scores = []
# cv_outer.split yields positional indices, hence .iloc for subsetting
for train_index, test_index in cv_outer.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # define GridSearch for nested cv
    search = GridSearchCV(pipeline, space, scoring='accuracy', cv=cv_inner, refit=True)
    result = search.fit(X_train, y_train)
    # get the best performing pipeline, refit on the whole outer training set
    best_model = result.best_estimator_
    yhat = best_model.predict(X_test)
    # evaluate the model on the untouched outer test fold
    acc = accuracy_score(y_test, yhat)
    outer_scores.append(acc)
    print('acc={:0.3f}, inner best={:0.3f}, params={}'.format(acc, result.best_score_, result.best_params_))
# summarise the generalisation estimate over the outer folds
print('Nested CV accuracy: {:0.3f} (+/- {:0.3f})'.format(np.mean(outer_scores), np.std(outer_scores)))

[[1]](https://stats.stackexchange.com/questions/437487/cross-validation-and-multiple-imputation-for-missing-data) seems to indicate that the imputer should be fitted on the training subset of each cross-validation split, not on the whole dataset.