# Project ALZHEIMER #

<br> <br>The DARWIN dataset includes handwriting data from 174 participants. The classification task consists in distinguishing Alzheimer’s disease patients from healthy people.

Creator: Francesco Fontanella

Source: https://archive.ics.uci.edu/dataset/732/darwin

The DARWIN dataset was created to allow researchers to improve the existing machine-learning methodologies for the prediction of Alzheimer's disease via handwriting analysis.

Citation Requests/Acknowledgements

N. D. Cilia, C. De Stefano, F. Fontanella, A. S. Di Freca, An experimental protocol to support cognitive impairment diagnosis by using handwriting analysis, Procedia Computer Science 141 (2018) 466–471. https://doi.org/10.1016/j.procs.2018.10.141

N. D. Cilia, G. De Gregorio, C. De Stefano, F. Fontanella, A. Marcelli, A. Parziale, Diagnosing Alzheimer’s disease from online handwriting: A novel dataset and performance benchmarking, Engineering Applications of Artificial Intelligence, Vol. 111 (2022) 104822. https://doi.org/10.1016/j.engappai.2022.104822


In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
import sys
import matplotlib

# Report the interpreter and library versions this notebook was run with.
for _label, _version in (
    ('python:', sys.version),
    ('Numpy:', np.__version__),
    ('Pandas:', pd.__version__),
    ('Scikitlearn:', sklearn.__version__),
    ('Seaborn: ', sns.__version__),
    ('matplotlib:', matplotlib.__version__),
):
    print(_label, _version)


In [None]:
# Load the DARWIN table. A forward-slash path is accepted on Windows, Linux
# and macOS, whereas the backslash form only resolves on Windows.
Data = pd.read_csv('Data/data.csv')
Data


# I. EDA #

In [None]:
# Structural summary of the frame first, then descriptive statistics
# (only the last expression is rendered as the cell output).
Data.info()
Data.describe()


<ul><li>Columns 'ID' and 'class' are categorical variables.</li>
<li>The features are on very different numeric scales.</li>
<li>Many features have a large standard deviation, which suggests noisy data.</li>
</ul>


In [None]:
# Number of rows per value of the 'class' column (class balance check).
Data['class'].value_counts()


P: positive to diagnosis <br>
H: Healthy


In [None]:

# Pairwise scatter plots of the columns of Data.
# `height` already sets the panel size, so the deprecated `size` argument is
# not passed.
# NOTE(review): pairplot draws one panel per pair of numeric columns, so on a
# wide frame this cell can be extremely slow — consider restricting it with
# `vars=[...]`.
sns.pairplot(Data, hue=None, palette='bright', kind='scatter', diag_kind='auto',
             markers='^', height=2, aspect=1, dropna=False)


In [None]:
plt.figure(figsize=(40, 40))
#cmap=sns.light_palette((210,90,60), input='husl')
# numeric_only=True leaves out the non-numeric columns (the notebook notes
# that 'ID' and 'class' are categorical); without it DataFrame.corr raises
# on string columns in pandas >= 2.0.
sns.heatmap(Data.corr(numeric_only=True), annot=True, fmt='.2f') #, cmap=cmap
# Save before show(): once the figure has been shown the backend typically
# releases it, so a later savefig would write an empty image.
plt.savefig(r"Heatmap.png")
plt.show()


# II. Pre Processing #

In [None]:
from sklearn import preprocessing

# Encode the 'class' column as a numeric indicator array; the fitted
# binarizer is kept in LB so the encoding can be inspected or reversed.
LB = preprocessing.LabelBinarizer()
LB.fit(Data['class'])
Target = LB.transform(Data['class'])
Target


In [None]:
# Not active: alternative min-max scaling, kept for comparison only.
# The cell derives the feature frame itself because Data2 and col_ are only
# defined in later cells; running it top-to-bottom would otherwise raise a
# NameError.
from sklearn.preprocessing import MinMaxScaler

_minmax_features = Data.drop(['ID', 'class'], axis=1)
Xm = MinMaxScaler().fit_transform(_minmax_features)
X = pd.DataFrame(Xm, columns=list(_minmax_features.columns))
X.describe()


In [None]:
from sklearn.preprocessing import StandardScaler

# Feature frame: everything except the identifier and the label column.
Data2 = Data.drop(columns=['ID', 'class'])
# Standardize each feature column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook — confirm this is acceptable.
Xb = StandardScaler().fit(Data2).transform(Data2)
Xb


In [None]:
# Re-attach the original feature names to the standardized array.
col_ = Data2.columns.tolist()
X = pd.DataFrame(data=Xb, columns=col_)
X.describe()


In [None]:
# Y is the binarized target; Diag holds the same values as a one-column
# frame named 'Diagnostic'.
Y = Target
Diag = pd.DataFrame({'Diagnostic': Y[:, 0]})
X


In [None]:
# Column names of the combined frame (features followed by the target).
ke = col_ + ['Diagnostic']
# Side-by-side join of the feature frame and the target column. `keys` is
# deliberately not passed: it labels the concatenated *objects* (two here),
# not the resulting columns, so a list of column names would be truncated and
# produce a misleading MultiIndex.
Data3 = pd.concat([Data2, Diag], axis=1, join='outer', ignore_index=False)
Data3


In [None]:
plt.figure(figsize=(40, 40))
#cmap=sns.light_palette((210,90,60), input='husl')
sns.heatmap(Data3.corr(), annot=True, fmt='.2f') #, cmap=cmap
# Save before show(): once the figure has been shown the backend typically
# releases it, so a later savefig would write an empty image.
# NOTE(review): same file name as the earlier heatmap of `Data`, so that
# image is overwritten.
plt.savefig(r'Heatmap.png')
plt.show()


### PCA ###

In [None]:
from sklearn.decomposition import PCA

# Fit a 4-component PCA on Data3.
# NOTE(review): Data3 is Data2 (the unscaled features) plus the 'Diagnostic'
# target column, so the decomposition sees raw feature scales and the label —
# confirm this is intended rather than fitting on the standardized X.
pca=PCA(n_components=4)
pca.fit(Data3)


In [None]:
# Inspect the fitted PCA: component vectors, variance per component and the
# fraction of total variance each component explains.
pca.components_, pca.explained_variance_, pca.explained_variance_ratio_


In [None]:
# Project every sample onto the fitted principal components and keep the
# first two for plotting.
PCA_data = pca.transform(Data3)
PC1 = PCA_data[:, 0]
PC2 = PCA_data[:, 1]

# Flatten the binarized target to one integer code per sample so it can be
# used both as a column and as an index into the class names.
T = Target.ravel()

# Class names learned by the binarizer; indexing them with the codes gives
# the class label of each sample (the feature names in Data3.columns are not
# class labels).
label = LB.classes_
labels = list(label[T])

zipped = list(zip(PC1, PC2, T, labels))

pc_df = pd.DataFrame(zipped, columns=['PC1','PC2','Target','Label'])
pc_df


In [None]:
plt.figure(figsize=(12, 7))

# seaborn can only resolve a column name such as 'Label' when a `data` frame
# is supplied, so build a small plotting frame and colour the points by the
# original class of each sample.
_pca_plot_df = pd.DataFrame({
    'PC1': PC1,
    'PC2': PC2,
    'Label': Data['class'].to_numpy(),
})
sns.scatterplot(data=_pca_plot_df, x='PC1', y='PC2', hue='Label')
plt.title("PCA Alzheimer",fontsize=16)
plt.xlabel('First Principal Component',fontsize=12)
plt.ylabel('Second Principal Component',fontsize=12)
#plt.xlim(-3,0)

plt.show()


First component explains 91% of variance 

In [None]:
from sklearn.model_selection import train_test_split

# 60/40 split. A fixed seed makes the split (and every score reported below)
# reproducible; stratify keeps the class ratio the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.4, random_state=25, stratify=Y)


In [None]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score

def print_score(clf, X_train, X_test, Y_train, Y_test, train=True, cv=12):
    """Print accuracy, classification report, confusion matrix and ROC AUC.

    Parameters
    ----------
    clf : estimator
        Already-fitted classifier exposing ``predict``.
    X_train, Y_train : array-like
        Training features and labels.
    X_test, Y_test : array-like
        Held-out features and labels.
    train : bool, default True
        When truthy, report on the training data and add a cross-validated
        accuracy estimate; otherwise report on the test data.
    cv : int, default 12
        Number of folds passed to ``cross_val_score`` in the training report.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Binarize labels and predictions for roc_auc_score.
    # NOTE: the AUC below is computed from hard predictions, not from
    # probabilities or decision scores.
    LB = preprocessing.LabelBinarizer()
    LB.fit(Y_train)

    if train:
        # Train performance
        res = clf.predict(X_train)

        print('Train Results:\n')
        print(f'Accuracy Score: {accuracy_score(Y_train, res):.4f} \n')
        print(f'Classification report:\n {classification_report(Y_train, res)} ')
        print(f'Confusion Matrix Score: {confusion_matrix(Y_train, res)}')
        auc = roc_auc_score(LB.transform(Y_train), LB.transform(res))
        print(f'ROC AUC Score:{auc:.4f}')

        # k-fold cross-validation on the training data only.
        fold_acc = cross_val_score(clf, X_train, Y_train, cv=cv,
                                   scoring='accuracy')
        print(f'cross_val_score Accuracy \t: {np.mean(fold_acc):.4f}')
        print(f'cross_val_score SD \t: {np.std(fold_acc):.4f} ')
    else:
        # Test performance
        res_test = clf.predict(X_test)

        print('Test Results:\n')
        print(f'Accuracy Score: {accuracy_score(Y_test, res_test):.4f} \n')
        print(f'Classification report:\n {classification_report(Y_test, res_test)}')
        print(f'Confusion Matrix Score: {confusion_matrix(Y_test, res_test)}')
        auc = roc_auc_score(LB.transform(Y_test), LB.transform(res_test))
        print(f'ROC AUC Score: {auc:.4f}\n')

        # Per-sample correctness (1.0 = right, 0.0 = wrong). Its mean is the
        # accuracy and its standard deviation the spread of that indicator;
        # taking mean/std of the raw predictions would describe the predicted
        # labels, not the accuracy.
        hits = (np.asarray(res_test).ravel()
                == np.asarray(Y_test).ravel()).astype(float)
        print(f'Average Accuracy \t: {np.mean(hits):.4f}')
        print(f'Accuracy SD \t: {np.std(hits):.4f} ')


# III. Fit Model #

## 1) Decision Tree ##

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with a fixed seed; labels are flattened to 1-D.
DT_clf = DecisionTreeClassifier(random_state=25)
DT_clf.fit(X_train, np.ravel(Y_train))


In [None]:

# print_score writes its report itself and returns None, so it is called
# directly; interpolating the return value would print a stray "None".
print_score(DT_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=True)
print(f'\n ---------------------------------------\n')
print_score(DT_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=False)


### Interpretation : ###
<br>
Random Forest model is slightly better

## 2) Random Forest ##

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees with a fixed seed; labels are flattened to 1-D.
RF_clf = RandomForestClassifier(n_estimators=50, random_state=22)
RF_clf.fit(X_train, np.ravel(Y_train))


In [None]:

# print_score writes its report itself and returns None, so it is called
# directly; interpolating the return value would print a stray "None".
print_score(RF_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=True)
print(f'\n ---------------------------------------\n')
print_score(RF_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=False)


### Interpretation : ###
<br>
- <b>Accuracy Score: fraction of all predictions that are correct </b> pretty high <br>
- <b>Precision: fraction of predicted positives that are truly positive </b>pretty high<br>
- <b>Recall: true positive rate, fraction of actual positive cases that are detected </b> 87% of positive cases predicted successfully, pretty good score <br>
-  <b>f1 Score: harmonic mean of precision and recall </b> 87% which is decent <br>
-  <b>Confusion Matrix: a table of all actual values versus the predicted ones of a classifier</b> False positive cases are 4, which is critical for a disease prediction model <br>
-  <b>ROC AUC Score: measure of separability, the performance in distinguishing the + and - classes </b> 88% good but not enough in this case<br>
-  <b>Cross Validation Accuracy: k-fold cross validation on the training set  </b>88% low for a disease prediction model <br>


## 3) SVM ##

In [None]:
from sklearn import svm

# Linear-kernel SVM; C is the regularization strength passed to SVC.
C = 1.0
SVM_clf = svm.SVC(kernel='linear', C=C)
# Flatten the labels to 1-D, as the other model cells do, instead of passing
# the 2-D target array.
SVM_clf.fit(X_train, Y_train.ravel())


In [None]:

# print_score writes its report itself and returns None, so it is called
# directly; interpolating the return value would print a stray "None".
print_score(SVM_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=True)
print(f'\n ---------------------------------------\n')
print_score(SVM_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=False)


### Interpretation : ###
<br>
- <b>Accuracy Score: fraction of all predictions that are correct </b> same <br>
- <b>Precision: fraction of predicted positives that are truly positive </b> worse <br>
- <b>Recall: true positive rate, fraction of actual positive cases that are detected </b> same <br>
-  <b>f1 Score: harmonic mean of precision and recall </b> worse  <br>
-  <b>Confusion Matrix: a table of all actual values versus the predicted ones of a classifier</b>  same<br>
-  <b>ROC AUC Score: measure of separability, the performance in distinguishing the + and - classes </b> same<br>
-  <b>Cross Validation Accuracy: k-fold cross validation on the training set  </b>  same<br>


# IV. Ensemble ML with Random Forest #

## 1) Bagging (oob_score=False): ##

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble built on the random forest defined above. The forest
# variable is RF_clf; the lowercase name rf_clf is not defined anywhere in
# this notebook.
bag_clf = BaggingClassifier(estimator=RF_clf, n_estimators=100, bootstrap=True,
                            oob_score=False, n_jobs=-1, random_state=25)
bag_clf.fit(X_train, Y_train.ravel())


In [None]:

# print_score writes its report itself and returns None, so it is called
# directly; interpolating the return value would print a stray "None".
print_score(bag_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=True)
print(f'\n ---------------------------------------\n')
print_score(bag_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=False)


### Interpretation : ###
<br>
- <b>Accuracy Score: fraction of all predictions that are correct </b> better <br>
- <b>Precision: fraction of predicted positives that are truly positive </b> 80% for positive <br>
- <b>Recall: true positive rate, fraction of actual positive cases that are detected </b> 97% very high <br>
-  <b>f1 Score: harmonic mean of precision and recall </b> 88%  <br>
-  <b>Confusion Matrix: a table of all actual values versus the predicted ones of a classifier</b>  1 FP which is much better<br>
-  <b>ROC AUC Score: measure of separability, the performance in distinguishing the + and - classes </b> 87%<br>
-  <b>Cross Validation Accuracy: k-fold cross validation on the training set  </b>  87%<br>


## 2) AdaBoost : ##

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over random-forest base learners. Both the booster and its base
# estimator are seeded so the scores discussed below can be reproduced, in
# line with the seeded decision-tree, random-forest and bagging cells.
Ada_RF = AdaBoostClassifier(
    RandomForestClassifier(n_estimators=100, random_state=25),
    n_estimators=100,
    random_state=25,
)
Ada_RF.fit(X_train, Y_train.ravel())


In [None]:

# print_score writes its report itself and returns None, so it is called
# directly; interpolating the return value would print a stray "None".
print_score(Ada_RF, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=True)
print(f'\n ---------------------------------------\n')
print_score(Ada_RF, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=False)


### Interpretation : ###
<br>
- <b>Accuracy Score: fraction of all predictions that are correct </b> 85% <br>
- <b>Precision: fraction of predicted positives that are truly positive </b> 77% for positive <br>
- <b>Recall: true positive rate, fraction of actual positive cases that are detected </b> 100% very high <br>
-  <b>f1 Score: harmonic mean of precision and recall </b> 87%  <br>
-  <b>Confusion Matrix: a table of all actual values versus the predicted ones of a classifier</b>  0 FP which is perfect for this case <br>
-  <b>ROC AUC Score: measure of separability, the performance in distinguishing the + and - classes </b> 86%<br>
-  <b>Cross Validation Accuracy: k-fold cross validation on the training set  </b>  88%<br>

### The best model so far ###

## 3) GBM : ##

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 50 stages; seeded so the reported scores are
# reproducible, in line with the seeded decision-tree and random-forest cells.
gbm_clf = GradientBoostingClassifier(n_estimators=50, random_state=25)
gbm_clf.fit(X_train, Y_train.ravel())


In [None]:

# print_score writes its report itself and returns None, so it is called
# directly; interpolating the return value would print a stray "None".
print_score(gbm_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=True)
print(f'\n ---------------------------------------\n')
print_score(gbm_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=False)


6 FP, completely useless

## 4) XGB: ##

In [None]:
import xgboost as xgb

# XGBoost classifier: 100 trees of depth 5, learning rate 0.3, all cores.
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.3,
    n_jobs=-1,
)
xgb_clf.fit(X_train, np.ravel(Y_train))


In [None]:

# print_score writes its report itself and returns None, so it is called
# directly; interpolating the return value would print a stray "None".
print_score(xgb_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=True)
print(f'\n ---------------------------------------\n')
print_score(xgb_clf, X_train, X_test, Y_train.ravel(), Y_test.ravel(), train=False)


5 FP, same score as the initial models
