In [8]:
import os
import pandas as pd
import numpy as np
import functools
import math
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, learning_curve, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import RocCurveDisplay, roc_auc_score, f1_score, classification_report

In [9]:
import warnings
warnings.simplefilter("ignore", UserWarning)

In [10]:
data = {dataset_name.replace('.xlsx', ''): pd.read_excel(f'Data/{dataset_name}') for dataset_name in os.listdir('Data') if 'xlsx' in dataset_name} 

print(list(data.keys()))

['test_demo', 'test_habits', 'test_health', 'train_demo', 'train_habits', 'train_health']


In [11]:
data_frames = [data['train_demo'], data['train_habits'], data['train_health']]
df = functools.reduce(lambda  left,right: pd.merge(left,right,on=['PatientID'], how='outer'), data_frames).set_index('PatientID')

data_frames = [data['test_demo'], data['test_habits'], data['test_health']]
df_test = functools.reduce(lambda  left,right: pd.merge(left,right,on=['PatientID'], how='outer'), data_frames).set_index('PatientID')

---
# Baseline

- only numeric features

In [12]:
x, y = df.drop(columns = ['Disease']), df['Disease']

xnumeric = x.select_dtypes(include = np.number)

xtrain, xval, ytrain, yval = train_test_split(xnumeric, y, random_state = 111 ,test_size = 0.3, shuffle = True , stratify = y)

xtrain_index = xtrain.index
xval_index = xval.index

model = LogisticRegression(max_iter = 500, random_state = 1)
model.fit(xtrain,ytrain)

ypred = model.predict(xval)

f1_score(yval, ypred)

0.7547169811320754

---
# Preprocessing

In [13]:
def preprocessing(dataframe):

    # fix Birth_Year errors (1869 > 1969)
    dataframe['Birth_Year'] = [i + 100 if i < 1900 else i for i in dataframe['Birth_Year']]

    # add Age column
    dataframe['Age'] = [2022 - i for i in dataframe['Birth_Year']]

    # add Gender column (1: Male, 0: Female)
    # afterwards drop column Name
    dataframe['Gender']  = [1 if i.split(' ')[0] == 'Mr.' else 0 for i in dataframe['Name']]
    dataframe.drop(columns = ['Name'], inplace = True)

    # add column population density
    dataframe['Region'] = [i.lower() for i in dataframe['Region']]
    dataframe['Region_Density'] = [dataframe['Region'].value_counts(normalize = True)[i] for i in dataframe['Region']]

    # encode Smoking_Habit & Exercise to binary (1: Yes, 0: No)
    dataframe['Smoking_Habit'] = [1 if i == 'Yes' else 0 for i in dataframe['Smoking_Habit']]
    dataframe['Exercise'] = [1 if i == 'Yes' else 0 for i in dataframe['Exercise']]

    # impute missing values in column "Education" with mode
    dataframe['Education'].fillna(dataframe['Education'].mode()[0], inplace=True)

    # split x in numeric and categorical features
    xnumeric, xcategorical = dataframe.select_dtypes(include = np.number), dataframe.select_dtypes(exclude = np.number)

    # One-hot-encoding categorical features
    encoder = OneHotEncoder(handle_unknown='ignore')
    x_cat_encoded = pd.DataFrame(encoder.fit_transform(xcategorical).toarray(), columns = encoder.get_feature_names_out(), index = xcategorical.index) 

    # Scale numeric features
    scaler = RobustScaler()
    x_num_scaled = pd.DataFrame(scaler.fit_transform(xnumeric), index = xnumeric.index ,columns = xnumeric.columns)

    xpreprocessed = pd.concat([x_num_scaled, x_cat_encoded], axis = 1)

    return xpreprocessed

In [14]:
x, y = preprocessing(x), y.loc[x.index]

xtrain, xval, ytrain, yval = x.loc[xtrain_index], x.loc[xval_index], y.loc[xtrain_index], y.loc[xval_index]

model = LogisticRegression(max_iter = 500, random_state = 1)
model.fit(xtrain,ytrain)

ypred = model.predict(xval)

f1_score(yval, ypred)

0.8663967611336032

---
# Try better model

In [15]:
model = RandomForestClassifier(random_state = 2)
model.fit(xtrain,ytrain)

ypred = model.predict(xval)

print(f'F1 score with RandomForest: {f1_score(yval, ypred)}')

F1 score with RandomForest: 0.9635627530364373


---
# Try embedded feature selection

In [18]:
from sklearn.feature_selection import SelectFromModel

In [37]:
forest = RandomForestClassifier(random_state = 2)
forest.fit(xtrain, ytrain)
sfm = SelectFromModel(forest, prefit=True)

In [20]:
xtrain_fs = sfm.transform(xtrain)
xval_fs = sfm.transform(xval)

model = RandomForestClassifier(random_state = 2)
model.fit(xtrain_fs, ytrain)

ypred = model.predict(xval_fs)

print(f'F1 score with SelectFromModel: {f1_score(yval, ypred)}')

F1 score with SelectFromModel: 0.9672131147540983


# Try RFE

In [16]:
model = RandomForestClassifier(random_state = 2)
rfe = RFECV(model, cv = 5, scoring = 'f1')
rfe.fit(xtrain, ytrain)
opt_features = list(rfe.get_feature_names_out(input_features = list(xtrain.columns)))

In [17]:
xtrain_fs = rfe.transform(xtrain)
xval_fs = rfe.transform(xval)

model = RandomForestClassifier(random_state = 2)
model.fit(xtrain_fs, ytrain)

ypred = model.predict(xval_fs)

print(f'F1 score with RFE: {f1_score(yval, ypred)}')

F1 score with RFE: 0.9877551020408164


---
# Hyperparameter tuning

In [34]:
param_grid = {
                'bootstrap': [True, False],
                'max_depth': [50, 60, 70, 80, 90, 100, None],
                'max_features': ['auto', 'sqrt'],
                'n_estimators': [200, 400, 600, 800, 1000]
            }

model = RandomForestClassifier(random_state = 2)
search = RandomizedSearchCV(estimator = model, param_distributions = param_grid, scoring = 'f1', cv = 3, random_state = 3, n_jobs = -1)
search.fit(xtrain_fs, ytrain)

print(str(search.best_params_).replace('{','').replace('}','').replace("'","").replace(':','='))

n_estimators= 600, max_features= sqrt, max_depth= 70, bootstrap= True


In [29]:
model = RandomForestClassifier(**search.best_params_, random_state = 2)
model.fit(xtrain_fs, ytrain)

ypred = model.predict(xval_fs)

print(f'F1 score with RandomizedSearch: {f1_score(yval, ypred)}')

F1 score with RandomizedSearch: 0.97119341563786


# Try GridSearch instead of RandomizedSearch

In [30]:
from sklearn.model_selection import GridSearchCV

In [38]:
forest = RandomForestClassifier(random_state = 2)
search = GridSearchCV(forest, param_grid, n_jobs = -1)
search.fit(xtrain_fs, ytrain)

GridSearchCV(estimator=RandomForestClassifier(random_state=2), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [50, 60, 70, 80, 90, 100, None],
                         'max_features': ['auto', 'sqrt'],
                         'n_estimators': [200, 400, 600, 800, 1000]})

In [None]:
model = RandomForestClassifier(**search.best_params_, random_state = 2)
model.fit(xtrain_fs, ytrain)

ypred = model.predict(xval_fs)

print(f'F1 score with GridSearch: {f1_score(yval, ypred)}')

F1 score with GridSearch: 0.9794238683127573
