# Classification

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib widget

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import common

In [2]:
data = common.loadFile("CleanedData").drop(["RID", "VISCODE"], axis=1)

In [3]:
plt.close()
ax = sns.countplot(x=data['DX'])
plt.title("Data Diagnosis")
plt.ylabel("Count")
plt.xlabel("Diagnosis")
plt.xticks(ticks=range(0,3), labels=['CN', 'MCI', 'Dementia'])

for p in ax.patches:
        ax.annotate(f'\n{p.get_height()}', (p.get_x()+p.get_width()/2, p.get_height()), ha='center', 
                    va='top', color='white', size=18)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Create Training and Test Sets and Apply Scaling

In [4]:
data = [data, common.loadFile("PCAData"), common.loadFile("ICAData")]
label = ["Full Data", "PCA Data", "ICA Data"]

In [5]:
X = []
y = []
X_train = []
X_test = []
y_train = []
y_test = []
scaler = []

for i in range(len(data)):
    X.append(data[i].drop("DX", axis=1).to_numpy().astype('int'))
    y.append(data[i].loc[:,['DX']].to_numpy().astype('int').flatten())
    

    xtrain, xtest, ytrain, ytest= train_test_split(X[i], y[i], random_state=0)
    X_train.append(xtrain)
    X_test.append(xtest)
    y_train.append(ytrain)
    y_test.append(ytest)
    
    scaler.append(MinMaxScaler())
    X_train.append(scaler[i].fit_transform(xtrain))
    X_test.append(scaler[i].transform(xtest))

## Models

### Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier
for i in range(len(data)):
    print(f"Accuracy of Decision Tree classifier for {label[i]}")
    clf = DecisionTreeClassifier().fit(X_train[i], y_train[i])
    print('Training set: {:.2f}'
         .format(clf.score(X_train[i], y_train[i])))
    print('Test set: {:.2f}'
         .format(clf.score(X_test[i], y_test[i])))

Accuracy of Decision Tree classifier for Full Data
Training set: 1.00
Test set: 0.77
Accuracy of Decision Tree classifier for PCA Data
Training set: 1.00
Test set: 0.78
Accuracy of Decision Tree classifier for ICA Data
Training set: 0.98
Test set: 0.72


### K-Nearest Neighbors

In [7]:
from sklearn.neighbors import KNeighborsClassifier

for i in range(len(data)):
    knn = KNeighborsClassifier().fit(X_train[i], y_train[i])

    print(f"Accuracy of K-NN classifier for {label[i]}")
    print('Training set: {:.2f}'
         .format(knn.score(X_train[i], y_train[i])))
    print('Test set: {:.2f}'
         .format(knn.score(X_test[i], y_test[i])))

Accuracy of K-NN classifier for Full Data
Training set: 0.83
Test set: 0.76
Accuracy of K-NN classifier for PCA Data
Training set: 0.77
Test set: 0.64
Accuracy of K-NN classifier for ICA Data
Training set: 0.85
Test set: 0.78


### Linear Discriminant Analysis

In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
for i in range(len(data)):
    lda = LinearDiscriminantAnalysis().fit(X_train[i], y_train[i])
    print(f"Accuracy of LDA classifier for {label[i]}")
    print('Training set: {:.2f}'
         .format(lda.score(X_train[i], y_train[i])))
    print('Test set: {:.2f}'
         .format(lda.score(X_test[i], y_test[i])))

Accuracy of LDA classifier for Full Data
Training set: 0.87
Test set: 0.82
Accuracy of LDA classifier for PCA Data
Training set: 0.87
Test set: 0.82
Accuracy of LDA classifier for ICA Data
Training set: 0.79
Test set: 0.78


### Gaussian Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB

for i in range(len(data)):
    gnb = GaussianNB().fit(X_train[i], y_train[i])
    
    print(f"Accuracy of GNB classifier for {label[i]}")
    print('Training set: {:.2f}'
         .format(gnb.score(X_train[i], y_train[i])))
    print('Test set: {:.2f}'
         .format(gnb.score(X_test[i], y_test[i])))

Accuracy of GNB classifier for Full Data
Training set: 0.62
Test set: 0.58
Accuracy of GNB classifier for PCA Data
Training set: 0.59
Test set: 0.56
Accuracy of GNB classifier for ICA Data
Training set: 0.68
Test set: 0.64


### Support Vector Machine

In [10]:
from sklearn.svm import SVC
for i in range(len(data)):
    svm = SVC(kernel='rbf').fit(X_train[i], y_train[i])

    print(f"Accuracy of SVM classifier for {label[i]}")
    print('Training set: {:.2f}'
         .format(svm.score(X_train[i], y_train[i])))
    print('Test set: {:.2f}'
         .format(svm.score(X_test[i], y_test[i])))

Accuracy of SVM classifier for Full Data
Training set: 0.80
Test set: 0.77
Accuracy of SVM classifier for PCA Data
Training set: 0.84
Test set: 0.78
Accuracy of SVM classifier for ICA Data
Training set: 0.84
Test set: 0.82
