# Overview

1. [Exploratory Data Analysis](#Exploratory-Data-Analysis)
2. [Baseline Model](#Baseline-Model)
3. [Preprocessing](#Preprocessing)
4. [Feature Selection](#Feature-Selection---Filter-Methods)
5. [Model Selection](#Model-Selection)

---
### Imports

In [None]:
import os
import pandas as pd
import numpy as np
import functools
import itertools
import math
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn.model_selection import train_test_split, StratifiedKFold, learning_curve, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, RocCurveDisplay

---
### Load Data

In [None]:
# Load every Excel workbook in the data directory into a dictionary keyed by
# the file name without its extension (e.g. 'train_demo.xlsx' -> 'train_demo').
# Only names that END in '.xlsx' are read: a plain substring test would also
# match unrelated files (e.g. 'notes_xlsx.txt'), and Excel's '~$...' lock files
# are skipped because they are not readable workbooks.
DATA_DIR = 'Data'
data = {
    os.path.splitext(dataset_name)[0]: pd.read_excel(os.path.join(DATA_DIR, dataset_name))
    for dataset_name in os.listdir(DATA_DIR)
    if dataset_name.endswith('.xlsx') and not dataset_name.startswith('~$')
}


def merge_on_patient_id(frames):
    """Outer-join all frames on 'PatientID' and use that column as the index."""
    merged = functools.reduce(
        lambda left, right: pd.merge(left, right, on=['PatientID'], how='outer'),
        frames,
    )
    return merged.set_index('PatientID')


# merge train datasets on PatientID
data_frames = [data['train_demo'], data['train_habits'], data['train_health']]
df = merge_on_patient_id(data_frames)

# merge test datasets on PatientID
data_frames = [data['test_demo'], data['test_habits'], data['test_health']]
df_test = merge_on_patient_id(data_frames)

---
### Exploratory Data Analysis

In [None]:
# first look at the data: five randomly sampled rows
df.sample(5)

In [None]:
# list the data type of every column
df.dtypes

In [None]:
# per-column count of missing values and of empty strings, side by side
pd.DataFrame({
    'Nulls': df.isna().sum(),
    'Empty Strings': df.eq('').sum(),
})

In [None]:
# count rows that exactly duplicate an earlier row
df.duplicated().sum()

In [None]:
# descriptive statistics for all columns (numeric and non-numeric),
# transposed so that each row describes one column
df.describe(include="all").T

In [None]:
# apply seaborn's default plot styling to all following figures
sns.set()

# plot pairwise relationships and densities, coloured by the target column
# NOTE(review): two markers are supplied, which presumably matches a binary
# 'Disease' target - confirm
sns.pairplot(df, hue = 'Disease', markers = ['o', 's'])
plt.show()

In [None]:
# select numeric features (exclude the target column)
df_numeric_features = list(df.select_dtypes(include = np.number).columns)
df_numeric_features.remove('Disease')

# one notched boxplot per numeric feature, laid out on a grid with 4 columns
n_cols = 4
n_rows = math.ceil(len(df_numeric_features) / n_cols)
# squeeze=False always yields a 2-D array of axes, whatever the grid size
fig, axes = plt.subplots(n_rows, n_cols, figsize = (15,10), squeeze = False)
flat_axes = axes.flatten()
for axis, feat in zip(flat_axes, df_numeric_features):
    # matplotlib's boxplot does not skip NaN, so drop missing values first;
    # otherwise a single missing entry produces an empty box
    axis.boxplot(df[feat].dropna(), notch = True, patch_artist = True)
    axis.set_title(feat)

# hide the grid cells that received no feature
for axis in flat_axes[len(df_numeric_features):]:
    axis.set_visible(False)

plt.show()

In [None]:
# select categorical features (exclude name column)
df_categorical_features = list(df.select_dtypes(exclude = np.number).columns)
df_categorical_features.remove('Name')

# stacked barplot of categorical features (with regard to target value)
# squeeze=False keeps `axes` an array even when only one feature is plotted
fig, axes = plt.subplots(len(df_categorical_features), figsize = (10,50), squeeze = False)
for axis, feat in zip(axes.flatten(), df_categorical_features):
    # contingency table: one row per category, one column per target value.
    # crosstab counts the rows directly and does not rely on pivot_table's
    # behaviour when no value column is left to aggregate.
    counts = pd.crosstab(df[feat], df['Disease'])
    counts.plot(kind='barh', stacked=True, title=feat, ax=axis)
    axis.set_ylabel('')
    # write the count into the centre of every bar segment
    for container in axis.containers:
        axis.bar_label(container, label_type='center')

plt.show()

---
### Baseline Model

- Logistic regression trained on the numeric features only

In [None]:
# separate the target column from the feature columns
y = df['Disease']
x = df.drop(columns = ['Disease'])

In [None]:
# the baseline only uses the columns that are numeric in the raw data
initial_xnumeric = x.select_dtypes(include = np.number)
initial_xnumeric_cols = initial_xnumeric.columns

# shuffled 80/20 split that preserves the class ratio of the target
xtrain, xval, ytrain, yval = train_test_split(
    initial_xnumeric,
    y,
    test_size = 0.2,
    shuffle = True,
    stratify = y,
    random_state = 0,
)

# remember which rows are in the train split so that later steps are
# evaluated on exactly the same validation rows
train_indices = xtrain.index

In [None]:
# fit the baseline classifier on the train split
model = LogisticRegression(max_iter = 500, random_state = 1)
model.fit(xtrain, ytrain)

# score its predictions on the held-out validation split
ypred = model.predict(xval)
baseline_f1 = f1_score(yval, ypred)

print(f'F1 score: {baseline_f1}')

---
### Preprocessing

In [None]:
def preprocessing(dataframe, reference_year=2022):
    """Clean the raw patient data and derive the engineered features.

    The input frame is never modified; all work happens on a copy, so the
    function can be called repeatedly on the same raw data.

    Steps: repair Birth_Year typos, add Age, Gender, BMI and BMI_Cateogry,
    drop Name, lowercase Region, cap outliers of three health columns at the
    1.5 * IQR fences, binary-encode Smoking_Habit and Exercise, impute missing
    Education with the mode and ordinal-encode the ranked categorical columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw features. Must contain the columns Birth_Year, Name, Weight,
        Height, Region, High_Cholesterol, Blood_Pressure, Physical_Health,
        Smoking_Habit, Exercise, Education, Drinking_Habit, Fruit_Habit,
        Water_Habit, Checkup and Diabetes.
    reference_year : int, default 2022
        Year used to turn Birth_Year into Age.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed frame with the same index as the input.

    Notes
    -----
    The outlier fences and the Education mode are computed from the frame that
    is passed in, i.e. separately for every call.
    """
    # work on a copy so the caller's frame keeps its raw columns
    dataframe = dataframe.copy()

    # fix Birth_Year errors. here we assume that the outliers are typos
    # the numbers 8 & 9 are pretty close to each other on a keyboard. therefore we add 100 to every year that's smaller than 1900
    # (e.g. 1869 --> 1969)
    birth_year = dataframe['Birth_Year']
    dataframe['Birth_Year'] = birth_year.where(~(birth_year < 1900), birth_year + 100)

    # add Age column
    dataframe['Age'] = reference_year - dataframe['Birth_Year']

    # add Gender column (1: Male, 0: Female), derived from the title that
    # precedes the first space of Name; afterwards drop column Name
    title = dataframe['Name'].str.split(' ').str[0]
    dataframe['Gender'] = (title == 'Mr.').astype('int64')
    dataframe = dataframe.drop(columns = ['Name'])

    # add BMI columns (Height is divided by 100 before squaring)
    bmi = dataframe['Weight'] / (dataframe['Height'] / 100) ** 2
    dataframe['BMI'] = bmi
    # NOTE: the column name is misspelled ('BMI_Cateogry'); it is kept as is so
    # the output schema does not change
    # the first matching condition wins; anything else (>= 35 or missing) -> 4
    dataframe['BMI_Cateogry'] = np.select(
        [bmi < 18.5, bmi < 25, bmi < 30, bmi < 35],
        [0, 1, 2, 3],
        default = 4,
    )

    # set all region values to lowercase
    dataframe['Region'] = dataframe['Region'].str.lower()

    # handle High_Cholesterol, Blood_Pressure, Physical_Health
    # flooring and capping outliers at the 1.5 * IQR fences
    for col in ['High_Cholesterol', 'Blood_Pressure', 'Physical_Health']:
        q25, q75 = dataframe[col].quantile(.25), dataframe[col].quantile(.75)
        iqr = q75 - q25
        dataframe[col] = dataframe[col].clip(lower = q25 - 1.5 * iqr, upper = q75 + 1.5 * iqr)

    # encode Smoking_Habit & Exercise to binary (1: Yes, 0: anything else)
    dataframe['Smoking_Habit'] = (dataframe['Smoking_Habit'] == 'Yes').astype('int64')
    dataframe['Exercise'] = (dataframe['Exercise'] == 'Yes').astype('int64')

    # manual encoding of the ranked features so that their order is preserved
    # another advantage is the avoidance of dimensionality increase through One-hot-encoding all categorical features
    # values that are not listed in a mapping become NaN

    # impute missing values in column "Education" with mode
    # (assigned back instead of a chained inplace fillna, which does not
    # reliably update the frame under pandas copy-on-write)
    dataframe['Education'] = dataframe['Education'].fillna(dataframe['Education'].mode()[0])
    # encode Education
    edu_map = {
            'I never attended school / Other'               : 0,
            'Elementary School (1st to 9th grade)'          : 1,
            'High School Incomplete (10th to 11th grade)'   : 2,
            'High School Graduate'                          : 3,
            'University Incomplete (1 to 2 years)'          : 4,
            'University Complete (3 or more years)'         : 5
            }
    dataframe['Education'] = dataframe['Education'].map(edu_map)

    drink_map = {
            'I do not consume any type of alcohol'          : 0,
            'I consider myself a social drinker'            : 1,
            'I usually consume alcohol every day'           : 2
            }
    dataframe['Drinking_Habit'] = dataframe['Drinking_Habit'].map(drink_map)

    fruit_map = {
            'Less than 1. I do not consume fruits every day.'   : 0,
            '1 to 2 pieces of fruit in average'                 : 1,
            '3 to 4 pieces of fruit in average'                 : 2,
            '5 to 6 pieces of fruit in average'                 : 3,
            'More than six pieces of fruit'                     : 4
            }
    dataframe['Fruit_Habit'] = dataframe['Fruit_Habit'].map(fruit_map)

    water_map = {
            'Less than half a liter'                            : 0,
            'More than half a liter but less than one liter'    : 1,
            'Between one liter and two liters'                  : 2
            }
    dataframe['Water_Habit'] = dataframe['Water_Habit'].map(water_map)

    checkup_map = {
            'Not sure'                                          : 0,
            'More than 3 years'                                 : 1,
            'Less than 3 years but more than 1 year'            : 2,
            'Less than three months'                            : 3
            }
    dataframe['Checkup'] = dataframe['Checkup'].map(checkup_map)

    diabetes_map = {
            'I do have diabetes'                                                            : 0,
            'I have/had pregnancy diabetes or borderline diabetes'                          : 1,
            "I don't have diabetes, but I have direct family members who have diabetes."    : 2,
            'Neither I nor my immediate family have diabetes.'                              : 3
            }
    dataframe['Diabetes'] = dataframe['Diabetes'].map(diabetes_map)

    return dataframe

In [None]:
# preprocess a copy of the features so that the raw frame `x` stays unchanged
x_preprocessed = preprocessing(x.copy())

---
### Check for scalers

In [None]:
# candidate scalers to compare: standard scaling, min-max scaling with its
# default range, min-max scaling to (-1, 1) and robust scaling
scalers = [StandardScaler(), MinMaxScaler(), MinMaxScaler(feature_range = (-1,1)), RobustScaler()]

In [None]:
def test_scaler_encoder(X,Y,scaler_options,classifier):
    """Rank scalers by the cross-validated F1 score of ``classifier``.

    For every scaler a pipeline is built that one-hot-encodes the non-numeric
    columns of ``X``, scales the numeric columns with that scaler and feeds
    the result to ``classifier``. The pipeline is scored with a shuffled,
    stratified K-fold split (StratifiedKFold defaults, random_state=0), so
    every scaler is evaluated on the same folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.Series
        Target, aligned with ``X``.
    scaler_options : iterable of scaler instances
        Scalers to compare.
    classifier : estimator
        Classifier template. It is cloned for every fold, so the instance
        passed in is left unfitted.

    Returns
    -------
    pandas.DataFrame
        Columns 'scaler' and 'score' (mean F1 over the folds), sorted by
        score in descending order.
    """
    # local import: `clone` is only needed here and is not among the
    # notebook's top-level imports
    from sklearn.base import clone

    # the splitter and the column groups do not depend on the scaler
    skf = StratifiedKFold(shuffle=True, random_state = 0)
    numeric_features = X.select_dtypes(include = np.number).columns
    categorical_features = X.select_dtypes(exclude = np.number).columns

    ranking = {'scaler':[],'score':[]}

    for scl in scaler_options:

        preprocessor = ColumnTransformer(transformers=[
            ('cat', Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))]), categorical_features),
            ('num', Pipeline(steps=[('scaler', scl)]), numeric_features)
            ])

        model = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("classifier", classifier)
            ])

        # f1-score of every fold for the current scaler
        scores = []
        for train_index, val_index in skf.split(X, Y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = Y.iloc[train_index], Y.iloc[val_index]

            # fit a fresh, unfitted copy per fold so that no state is shared
            # between folds and the caller's estimators are not modified
            fold_model = clone(model)
            fold_model.fit(X_train, y_train)
            scores.append(f1_score(y_val, fold_model.predict(X_val)))

        ranking['scaler'].append(scl)
        # average the unrounded fold scores to avoid accumulating rounding error
        ranking['score'].append(np.mean(scores))

    return pd.DataFrame(ranking).sort_values('score', ascending=False)

In [None]:
# rank the scalers with logistic regression as the classifier
test_scaler_encoder(x_preprocessed, y, scalers, LogisticRegression(max_iter=500, random_state=1))

In [None]:
# rank the scalers with a support vector classifier
test_scaler_encoder(x_preprocessed, y, scalers, SVC(random_state=1))

In [None]:
# rank the scalers with a small neural network (hidden layers of 10 and 2 units)
test_scaler_encoder(x_preprocessed, y, scalers, MLPClassifier(hidden_layer_sizes=(10,2), max_iter=1500, random_state=1))

In [None]:
# rank the scalers with a decision tree
test_scaler_encoder(x_preprocessed, y, scalers, DecisionTreeClassifier(random_state=1))

In [None]:
# rank the scalers with a random forest
test_scaler_encoder(x_preprocessed, y, scalers, RandomForestClassifier(random_state=1))

---
### Train/Validation Split

In [None]:
# re-create the split of the baseline model on the preprocessed data:
# rows whose index was in the baseline train split go to train, the rest to validation
in_train = x_preprocessed.index.isin(train_indices)
xtrain = x_preprocessed.loc[train_indices]
xval = x_preprocessed.loc[~in_train]
ytrain = y.loc[train_indices]
yval = y.loc[~in_train]

---
### Prepare Train/Validation Split (One-Hot Encode, Scale)

In [None]:
def encode_scale(dataframe, reset_fit=False):

    '''
    One-hot-encode the non-numeric columns of ``dataframe`` and robust-scale
    the resulting frame.

    The fitted encoder and scaler are stored in the module-level globals
    ``ohe`` and ``scl`` so that later calls can re-use them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to encode and scale.
    reset_fit : bool, default False
        If true, or if no fitted ``ohe``/``scl`` exist yet, a new encoder and
        scaler are fitted on ``dataframe`` before transforming it. Otherwise
        the existing ones are only applied.

    Returns
    -------
    pandas.DataFrame
        Numeric columns followed by the one-hot columns, all scaled, with the
        index of ``dataframe``.
    '''

    # encoder and scaler are global so they can be re-used by later calls
    global ohe, scl

    # fit when explicitly requested or when nothing has been fitted yet
    fit_needed = bool(reset_fit) or not ('ohe' in globals() and 'scl' in globals())

    xnumeric = dataframe.select_dtypes(include = np.number)
    xcategorical = dataframe.select_dtypes(exclude = np.number)

    # One-hot-encoding
    if fit_needed:
        ohe = OneHotEncoder(handle_unknown='ignore')
        encoded = ohe.fit_transform(xcategorical)
    else:
        encoded = ohe.transform(xcategorical)
    categorical_encoded = pd.DataFrame(encoded.toarray(), columns = ohe.get_feature_names_out(), index = xcategorical.index)
    dataframe_encoded = pd.concat([xnumeric, categorical_encoded], axis = 1)

    # Robust-scaling
    if fit_needed:
        scl = RobustScaler()
        scaled = scl.fit_transform(dataframe_encoded)
    else:
        scaled = scl.transform(dataframe_encoded)
    dataframe_scaled = pd.DataFrame(scaled, columns = dataframe_encoded.columns, index = dataframe_encoded.index)

    print('fit & transform successful...' if fit_needed else 'transform successful...')
    return dataframe_scaled

In [None]:
# fit a new encoder and scaler on the train split and transform it
xtrain_prepro = encode_scale(xtrain, reset_fit=True)

In [None]:
# transform the validation split with the encoder and scaler fitted on the train split
xval_prepro = encode_scale(xval)

---
### Feature Selection - Filter Methods

In [None]:
def cor_heatmap(cor):
    """Draw an annotated heatmap of the correlation matrix ``cor``.

    The function only plots; it returns None.
    """
    plt.figure(figsize=(10,7))
    sns.heatmap(data = cor, annot = True, cmap = plt.cm.Reds, fmt='.2')
    plt.show()

# Spearman correlation between the originally numeric features of the train split.
# The matrix is stored first and then plotted: assigning the result of
# cor_heatmap() would leave cor_spearman set to None.
cor_spearman = xtrain_prepro[initial_xnumeric_cols].corr("spearman")
cor_heatmap(cor_spearman)

In [None]:
def TestIndependence(X,Y,var,alpha=0.05):
    """Print whether feature ``X`` and target ``Y`` are statistically dependent.

    Runs a chi-square test on the contingency table of ``Y`` against ``X``.
    A p-value below ``alpha`` is reported as "important", anything else as
    "not important". ``var`` is the feature name used in the printed message.
    Nothing is returned.
    """
    observed = pd.crosstab(Y, X)
    _, p_value, _, _ = stats.chi2_contingency(observed.values)
    verdict = (
        f"{var} is IMPORTANT for Prediction"
        if p_value < alpha
        else f"{var} is NOT an important predictor. (Discard {var} from model)"
    )
    print(verdict)

In [None]:
# originally non-numeric feature columns, without the two that are not tested
initial_xcategorical_col = [col for col in x.columns if col not in initial_xnumeric_cols]
for excluded in ('Name', 'Region'):
    initial_xcategorical_col.remove(excluded)

# chi-square independence test of every remaining feature against the target
for var in initial_xcategorical_col:
    TestIndependence(xtrain_prepro[var], ytrain, var)

### Feature Selection - Wrapper Methods

In [None]:
# recursive feature elimination with 5-fold cross-validation, scored by F1
model = RandomForestClassifier(random_state = 1)
rfe = RFECV(
    model,
    cv = 5,
    scoring = 'f1',
    n_jobs = -1,
)
rfe.fit(xtrain_prepro, ytrain)

# names of the columns that the selector keeps
train_columns = list(xtrain_prepro.columns)
opt_features = list(rfe.get_feature_names_out(input_features = train_columns))
print(f'Best features: {opt_features}')

In [None]:
# reduce both splits to the features selected by RFECV
xtrain_rfe = rfe.transform(xtrain_prepro)
xval_rfe = rfe.transform(xval_prepro)

---
### Model Selection

In [None]:
def roc_auc(classifiers, xtrain, ytrain, xval, yval):
    '''
    Fit every classifier on the train data and draw its ROC curve for the
    validation data into one shared figure.

    classifiers: mapping of display name to estimator,
                 e.g. {'Logistic Regression': LogisticRegression(),...}
    The estimators are fitted in place. Nothing is returned.
    '''

    _, axis = plt.subplots(1, figsize=(15, 10))

    for label in classifiers:
        estimator = classifiers[label]
        estimator.fit(xtrain, ytrain)
        RocCurveDisplay.from_estimator(estimator, xval, yval, ax=axis, name=label)

    axis.set_title('Receiver Operating Characteristic (ROC)')
    # dashed diagonal from (0, 0) to (1, 1) as a reference line
    axis.plot([0,1], [0,1], linestyle='--')
    plt.show()

# candidate models for the ROC comparison, keyed by the name shown in the legend
# random_state is fixed (as for every other estimator in this notebook) so
# that the comparison is reproducible between runs
classifiers = {
                'Logistic Regression': LogisticRegression(random_state = 1),
                'Support Vector Machine': SVC(random_state = 1),
                'Neural Network': MLPClassifier(max_iter = 1000, random_state = 1),
                'Decision Tree': DecisionTreeClassifier(random_state = 1),
                'Random Forest': RandomForestClassifier(random_state = 1)
                }

In [None]:
# ROC curves of all candidate classifiers: fitted on the RFE-selected train
# features, evaluated on the validation split
roc_auc(classifiers, xtrain_rfe, ytrain, xval_rfe, yval)

### Let's check the learning curve for overfitting

In [None]:
def learning_curves(estimator, X, Y):
    '''
    Plot the mean training and cross-validation F1 score of ``estimator``
    against the training set size (5-fold CV, training fractions from 0.05
    in steps of 0.05, below 1).
    '''

    fractions = np.arange(.05,1,.05)
    sizes, train_scores, validation_scores = learning_curve(
        estimator, X, Y, cv = 5, scoring = 'f1', train_sizes = fractions
    )

    # average over the folds: one value per training set size
    mean_train = train_scores.mean(axis=1)
    mean_validation = validation_scores.mean(axis=1)

    _, axis = plt.subplots(1, figsize=(10,10))
    axis.plot(sizes, mean_train, color='salmon',  label='Training score', marker = 'o')
    axis.plot(sizes, mean_validation, color='olive', label='Cross-validation score', marker = 's')
    axis.set_title('Learning Curve')
    axis.set_xlabel('Training Set Size')
    axis.set_ylabel('F1 Score')
    axis.legend(loc='best')
    plt.show()

learning_curves(RandomForestClassifier(), xtrain_rfe, ytrain)

### The model is slightly overfitting --> hyperparameter tuning may reduce this

---
### Hyperparameter tuning

In [None]:
# search space for the random forest
param_grid = {
                'criterion': ['gini', 'entropy'],
                'bootstrap': [True, False],
                'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                # 'auto' is no longer accepted by current scikit-learn releases; for
                # classifiers it was an alias of 'sqrt', so it only duplicated that
                # candidate. 'log2' is searched as a real alternative instead.
                'max_features': ['sqrt', 'log2'],
                'min_samples_leaf': [1, 2, 4],
                'min_samples_split': [2, 5, 10],
                'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
            }

# randomized search: 100 sampled parameter combinations, 3-fold CV, scored by F1
model = RandomForestClassifier(random_state = 1)
rand_search = RandomizedSearchCV(estimator = model, param_distributions = param_grid, scoring = 'f1', n_iter = 100, cv = 3, verbose=2, random_state=3, n_jobs = -1)
rand_search.fit(xtrain_rfe, ytrain)

# print the best combination as keyword arguments that can be pasted into code
# (repr keeps the quotes of string values)
print(', '.join(f'{name}={value!r}' for name, value in rand_search.best_params_.items()))

In [None]:
# refit a random forest with the best parameters found by the search
best_params = rand_search.best_params_
model = RandomForestClassifier(random_state = 1, **best_params)
model.fit(xtrain_rfe, ytrain)

# score it on the validation split
ypred = model.predict(xval_rfe)
tuned_f1 = f1_score(yval, ypred)

print(f'F1 score: {tuned_f1}')

---
### Submission 

**1** - Train on full dataset

In [None]:
# apply preprocessing, encoding & scaling to the full train dataset
# reset_fit set to True for new fit_transform (including all train observations)
# a copy of `x` is preprocessed (as for x_preprocessed above) so that `x`
# keeps its raw columns and this cell can be re-run
xtrain_prepro_full = encode_scale(preprocessing(x.copy()), reset_fit = True)

In [None]:
# apply the already-fitted RFECV feature selection to the full train dataset
xtrain_rfe_full = rfe.transform(xtrain_prepro_full)

In [None]:
# fit the final model with the best hyperparameters on all train observations
model = RandomForestClassifier(**rand_search.best_params_, random_state = 1)
model.fit(xtrain_rfe_full, y)

**2** - Prepare test dataset and predict

In [None]:
# apply preprocessing function and encode_scale function to test dataset
# (encode_scale only transforms here: it re-uses the encoder and scaler fitted
# on the full train dataset)
# a copy is preprocessed so that df_test keeps its raw columns and this cell
# can be re-run
xtest_prepro = encode_scale(preprocessing(df_test.copy()))

In [None]:
# reduce the test data to the features selected by RFECV
xtest_rfe = rfe.transform(xtest_prepro)

In [None]:
# predict the target for the test data with the final model
ypred = model.predict(xtest_rfe)

In [None]:
# submission table: the PatientID index of the test data next to its prediction
df_submission = pd.DataFrame({
    df_test.index.name: df_test.index,
    'Disease': ypred,
})
df_submission

In [None]:
# write the submission file; index=False leaves out the row index
df_submission.to_csv('Group01_Final.csv', index = False)