# DARWIN (Diagnosis AlzheimeR WIth haNdwriting) Classification

## 1. Libraries

In [39]:
#!pip install graphviz

In [40]:
import time

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import sklearn
from sklearn import set_config
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler,  LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split,  KFold, cross_validate
from sklearn.metrics import  confusion_matrix, roc_auc_score


# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays.
set_config(transform_output = "pandas")

## 2. Data

In [41]:
# Folder that holds the dataset. Set the DARWIN_DATA_DIR environment variable to
# run the notebook on another machine; the original local path is the fallback.
file_dir = os.environ.get(
    'DARWIN_DATA_DIR',
    'C:/Users/mcabo/Desktop/Proyects/Machine Learning/data',
)
file_name = "/DARWIN.csv"

# os.path.join copes with a data directory given with or without a trailing slash.
df = pd.read_csv(os.path.join(file_dir, file_name.lstrip("/")), sep = ",")

## 3. Functions

#### Feature engineering

In [42]:
def detect_outliers(df):
    """Return the numeric columns of ``df`` that contain at least one IQR outlier.

    A value is treated as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` of its own column, where ``Q1``/``Q3`` are the
    25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are ignored.

    Returns
    -------
    list
        Labels of the numeric columns with at least one outlier, in column
        order. Empty when ``df`` has no numeric columns.
    """
    numeric = df.select_dtypes(include='number')
    if numeric.shape[1] == 0:
        # Nothing to analyse; avoids failing on frames without numeric columns.
        return []

    # Only the two quartiles are needed, so compute them directly instead of
    # building the full describe() summary.
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Column-wise comparison: each bound Series is aligned on the column labels.
    has_outlier = (numeric.lt(lower_bound) | numeric.gt(upper_bound)).any()
    return has_outlier.index[has_outlier].tolist()


def numeric_lists_transf(df, num_cols):
    """Split ``num_cols`` into columns with IQR outliers and columns without.

    A column has outliers when any of its values lies below ``Q1 - 1.5 * IQR``
    or above ``Q3 + 1.5 * IQR`` of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the columns to inspect.
    num_cols : iterable
        Labels of the columns to classify. They are expected to be numeric.

    Returns
    -------
    tuple of (list, list)
        ``(log_vars, minmaxscaler_vars)``: the columns with at least one
        outlier, then the columns with none. Each list keeps the order of
        ``num_cols``.
    """
    log_vars = []
    minmaxscaler_vars = []

    # Only the requested columns are summarised, so an empty ``num_cols`` or
    # unrelated non-numeric columns in ``df`` cannot make the call fail.
    for col in num_cols:
        values = df[col]
        q1, q3 = values.quantile([0.25, 0.75])
        iqr = q3 - q1
        below = values < q1 - 1.5 * iqr
        above = values > q3 + 1.5 * iqr

        if (below | above).any():
            log_vars.append(col)
        else:
            minmaxscaler_vars.append(col)

    return log_vars, minmaxscaler_vars

#### Modelling

In [43]:
def predictions_cl(X_test, y_test, X_val, y_val, Algorithm, TARGET):
    """Print a one-line evaluation report for a fitted classifier.

    The report contains ``Algorithm.score`` on the test and validation sets
    (labelled as accuracy), the F1 score on the test set and the ROC AUC on
    the test set, each rounded to 3 decimals.

    Parameters
    ----------
    X_test, y_test : test features and labels (``y_test`` must expose ``.index``).
    X_val, y_val : validation features and labels.
    Algorithm : fitted estimator providing ``predict``, ``score`` and ``predict_proba``.
    TARGET : column name given to the frame of test predictions.

    Returns
    -------
    None
    """
    test_predictions = pd.DataFrame(
        Algorithm.predict(X_test),
        index=y_test.index,
        columns=[TARGET],
    )
    test_accuracy = Algorithm.score(X_test, y_test)
    val_accuracy = Algorithm.score(X_val, y_val)
    f1_value = f1_score(y_test, test_predictions)

    # The second column of predict_proba is used as the score for the ROC AUC.
    test_scores = Algorithm.predict_proba(X_test)[:, 1]
    auc_value = roc_auc_score(y_test, test_scores)

    report = (
        f'Accuracy test: {round(test_accuracy, 3)}, '
        f'Accuracy validation: {round(val_accuracy, 3)}, '
        f'F1 score: {round(f1_value, 3)}, '
        f'AUC: {round(auc_value, 3)}'
    )
    print(report)

def AUC_pred(X, Y, algorithm):
    """Print the ROC AUC of ``algorithm`` on ``X`` against the labels ``Y``.

    The second column of ``algorithm.predict_proba(X)`` is used as the score.
    Returns None.
    """
    scores = algorithm.predict_proba(X)[:, 1]
    print("AUC:", roc_auc_score(Y, scores))

def paint_confusion_matrix(X_test, y_test, algorithm):
    """Draw the confusion matrix of ``algorithm`` on the test set as a heatmap.

    Rows are the true labels and columns the predicted labels. The figure is
    shown immediately; the result of ``plt.show()`` is returned.
    """
    matrix = confusion_matrix(y_test, algorithm.predict(X_test))

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,   # write the count inside every cell
        cmap='Blues',
        fmt='g',
        cbar=False,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    return plt.show()

def paint_roc_curve(X_test, y_test, algorithm):
    """Plot the ROC curve of ``algorithm`` on the test set.

    The second column of ``algorithm.predict_proba(X_test)`` is used as the
    score. The current figure is cleared first, and a dashed grey diagonal is
    drawn as a reference line.
    """
    scores = algorithm.predict_proba(X_test)[:, 1]
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, scores)

    plt.clf()
    plt.plot(false_positive_rate, true_positive_rate)
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.title('ROC curve')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.show()

def models_comparison_roc_curve(X_train, y_train, X_test, y_test, model_1, model_2, model_3, *extra_models):
    """Fit several classifiers and draw their ROC curves on a single figure.

    Parameters
    ----------
    X_train, y_train : training features and labels (``y_train`` is flattened
        with ``np.ravel`` before fitting).
    X_test, y_test : data used to compute the ROC curve and ROC AUC.
    model_1, model_2, model_3 : indexable ``(name, estimator)`` pairs; item 0
        is the label used in the output, item 1 must provide ``fit`` and
        ``predict_proba``.
    *extra_models : optional further ``(name, estimator)`` pairs to compare.

    Notes
    -----
    Every estimator passed in is fitted by this function (side effect on the
    caller's objects). The ROC AUC of each model is printed rounded to 3
    decimals and shown in the legend with 2 decimals.

    Returns
    -------
    The result of ``plt.show()``.
    """
    models = [model_1, model_2, model_3, *extra_models]

    plt.clf()
    for model in models:
        model_name, model_instance = model[0], model[1]
        model_instance.fit(X_train, np.ravel(y_train))

        # The second column of predict_proba is used as the score.
        predictions = model_instance.predict_proba(X_test)[:, 1]
        auc_score = metrics.roc_auc_score(y_test, predictions)
        print('ROC AUC Score for {}: {}'.format(model_name, round(auc_score, 3)))

        fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
        # '{:.2f}' (two decimals); the previous '{:2f}' only set a minimum
        # width of 2 and therefore printed six decimals in the legend.
        plt.plot(fpr, tpr, label='ROC Curve for {} - Area: {:.2f}'.format(model_name, auc_score))

    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc="lower right")
    plt.title('ROC curve')
    return plt.show()

def validation_strategy_cl(dataframe, TARGET, val_size=0.20, test_size=0.20,
                           val_random_state=42, test_random_state=52):
    """Split ``dataframe`` into train, test and validation sets.

    The validation set is carved out first; the remaining rows are then split
    into train and test. With the defaults this gives about 20% validation,
    16% test (20% of the remaining 80%) and 64% train.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing the features and the ``TARGET`` column.
    TARGET : label of the target column.
    val_size : size of the validation hold-out, passed as ``test_size`` to the
        first ``train_test_split`` call.
    test_size : size of the test set taken from the rows left after the
        validation hold-out.
    val_random_state, test_random_state : seeds of the first and second split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_val, y_val)``.
    """
    features = dataframe.drop(TARGET, axis=1)
    labels = dataframe[TARGET]

    X_rest, X_val, y_rest, y_val = train_test_split(
        features, labels, test_size=val_size, random_state=val_random_state
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest, y_rest, test_size=test_size, random_state=test_random_state
    )
    return X_train, X_test, y_train, y_test, X_val, y_val



In [44]:
#df.columns

In [45]:
# Use the 'ID' column as the row index. The guard keeps the cell re-runnable:
# once 'ID' is the index it is no longer a column, and a second set_index
# call would raise KeyError.
if df.index.name != 'ID':
    df = df.set_index('ID')

In [46]:
# Sanity check: the number of numeric columns equals the number of columns
# left after dropping 'class'.
df.select_dtypes(include = 'number').shape[1] == df.drop(columns = 'class').shape[1]

True

## 4. Preprocessing

In [47]:
# Name of the label column in df.
target = 'class'

In [48]:
# Labels of the numeric columns of df; get_pipe passes them to the MinMaxScaler step.
numeric_vars = df.select_dtypes(include = 'number').columns

In [49]:
# All columns of df except the label column.
# NOTE(review): this name shadows the built-in `vars`, and the variable does not
# appear to be used in the rest of the visible notebook — consider renaming or removing.
vars = df.drop('class', axis = 1)

In [50]:
# (rows, columns) of df; at this point the 'class' column is still included.
df.shape

(174, 451)

In [51]:
# Number of rows per label value (class balance).
df[target].value_counts()

class
P    89
H    85
Name: count, dtype: int64

In [52]:
# df_t is bound to the same DataFrame object as df (plain assignment, no copy).
# NOTE(review): any in-place change made through df_t is therefore also visible
# through df; use df.copy() if an independent frame is intended.
df_t = df

In [53]:
# (rows, columns) of df_t.
df_t.shape

(174, 451)

In [54]:
# Replace the string label column 'class' with an integer column 'target_encoded'.
# LabelEncoder numbers the labels in sorted order (here H -> 0, P -> 1).
# df_t is rebuilt from df instead of being modified in place, so df keeps its
# 'class' column and the cell gives the same result when it is run again.
le = LabelEncoder()
df_t = df.drop(columns = 'class')
df_t['target_encoded'] = le.fit_transform(df['class'])

  df_t['target_encoded'] = le.fit_transform(df['class'])


In [55]:
def get_pipe(model):
    """Build a two-step pipeline: min-max scaling followed by ``model``.

    The first step ('ct') is a ColumnTransformer that applies a MinMaxScaler to
    the columns listed in the notebook-level ``numeric_vars``; the second step
    ('model') is the estimator passed in.

    Parameters
    ----------
    model : estimator used as the final pipeline step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    scaling_step = ColumnTransformer(
        transformers=[('scaler', MinMaxScaler(), numeric_vars)]
    )
    return Pipeline(steps=[('ct', scaling_step), ('model', model)])

## 5. Testing algorithms

### 5.1. Decision Tree Classifier

In [56]:
# Encoded labels and feature matrix used by the cross-validation cells below.
y_train = df_t['target_encoded']
X_train = df_t.drop(columns = ['target_encoded'])

In [57]:
# 5-fold cross-validation with shuffled, seeded folds; kfold is reused by the
# other model cells so every algorithm is scored on the same splits.
kfold = KFold(n_splits= 5, shuffle = True, random_state= 42)

# Seed the tree as well: without random_state the fitted tree, and therefore
# the scores, can change from one run to the next.
pipe_dt = get_pipe(DecisionTreeClassifier(random_state = 42))

scores = cross_validate(pipe_dt, X_train, y_train, cv=kfold, scoring=['accuracy', 'roc_auc', 'f1'], return_train_score=False)

# Per-fold values, mean and standard deviation of every entry returned by
# cross_validate (timings and test metrics).
for metric in scores.keys():
    print(f"{metric} scores for each fold: {scores[metric]}")
    print(f"Mean {metric} score: {round(np.mean(scores[metric]), 3)}")
    print(f"Standard desviation of {metric}: {np.std(scores[metric])}\n")

fit_time scores for each fold: [0.09310102 0.08054137 0.07711816 0.14114022 0.07667685]
Mean fit_time score: 0.094
Standard desviation of fit_time: 0.02444924139993055

score_time scores for each fold: [0.08053708 0.07052326 0.05817294 0.08053207 0.0632782 ]
Mean score_time score: 0.071
Standard desviation of score_time: 0.00900486255369045

test_accuracy scores for each fold: [0.88571429 0.8        0.82857143 0.71428571 0.64705882]
Mean test_accuracy score: 0.775
Standard desviation of test_accuracy: 0.0846258124963538

test_roc_auc scores for each fold: [0.89166667 0.80228758 0.83169935 0.73190789 0.65277778]
Mean test_roc_auc score: 0.782
Standard desviation of test_roc_auc: 0.08260262403504963

test_f1 scores for each fold: [0.89473684 0.81081081 0.84210526 0.66666667 0.66666667]
Mean test_f1 score: 0.776
Standard desviation of test_f1: 0.09336755009915133



### 5.2. Random Forest Classifier

In [58]:
# Seed the forest: without random_state the bootstrap samples and feature
# draws, and therefore the scores, change from one run to the next.
pipe_rf = get_pipe(RandomForestClassifier(random_state = 42))

scores = cross_validate(pipe_rf, X_train, y_train, cv=kfold, scoring=['accuracy', 'roc_auc', 'f1'], return_train_score=False)

# Per-fold values, mean and standard deviation of every entry returned by
# cross_validate (timings and test metrics).
for metric in scores.keys():
    print(f"{metric} scores for each fold: {scores[metric]}")
    print(f"Mean {metric} score: {round(np.mean(scores[metric]), 3)}")
    print(f"Standard desviation of {metric}: {np.std(scores[metric])}\n")

fit_time scores for each fold: [0.32223248 0.28838563 0.28790784 0.28713989 0.28749633]
Mean fit_time score: 0.295
Standard desviation of fit_time: 0.013806286471994502

score_time scores for each fold: [0.07613277 0.11902094 0.06396174 0.06175876 0.06412268]
Mean score_time score: 0.077
Standard desviation of score_time: 0.021608463362389935

test_accuracy scores for each fold: [0.88571429 0.97142857 0.82857143 0.82857143 0.88235294]
Mean test_accuracy score: 0.879
Standard desviation of test_accuracy: 0.05231617198882343

test_roc_auc scores for each fold: [0.96333333 0.97712418 0.92810458 0.95230263 0.95138889]
Mean test_roc_auc score: 0.954
Standard desviation of test_roc_auc: 0.01613130612013196

test_f1 scores for each fold: [0.89473684 0.97142857 0.82352941 0.83333333 0.88888889]
Mean test_f1 score: 0.882
Standard desviation of test_f1: 0.052903066912003055



### 5.3. Gradient Boosting Classifier

In [59]:
# Seed the booster so that repeated runs of this cell give the same scores.
pipe_gb = get_pipe(GradientBoostingClassifier(random_state = 42))

scores = cross_validate(pipe_gb, X_train, y_train, cv=kfold, scoring=['accuracy', 'roc_auc', 'f1'], return_train_score=False)

# Per-fold values, mean and standard deviation of every entry returned by
# cross_validate (timings and test metrics).
for metric in scores.keys():
    print(f"{metric} scores for each fold: {scores[metric]}")
    print(f"Mean {metric} score: {round(np.mean(scores[metric]), 3)}")
    print(f"Standard desviation of {metric}: {np.std(scores[metric])}\n")

fit_time scores for each fold: [2.62987137 2.60240006 2.62759256 2.64353824 2.69685936]
Mean fit_time score: 2.64
Standard desviation of fit_time: 0.031355321811714874

score_time scores for each fold: [0.05231237 0.04734278 0.05144453 0.05468965 0.04580307]
Mean score_time score: 0.05
Standard desviation of score_time: 0.003273970900899572

test_accuracy scores for each fold: [0.88571429 0.88571429 0.88571429 0.77142857 0.79411765]
Mean test_accuracy score: 0.845
Standard desviation of test_accuracy: 0.050938512055724965

test_roc_auc scores for each fold: [0.97666667 0.97385621 0.97385621 0.87828947 0.90277778]
Mean test_roc_auc score: 0.941
Standard desviation of test_roc_auc: 0.042011142830772376

test_f1 scores for each fold: [0.89473684 0.88888889 0.88235294 0.75       0.81081081]
Mean test_f1 score: 0.845
Standard desviation of test_f1: 0.05654819263704897



### 5.4 Testing algorithms conclusion

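Random Forest is selected as the classification model: across the 5 folds it has the highest mean accuracy (0.879), ROC AUC (0.954) and F1 score (0.882) of the three algorithms, and the lowest standard deviation in ROC AUC (0.016).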