# Classification

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib widget

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
import common

In [None]:
# Load the "CleanedData" file and discard the RID and VISCODE columns
# (presumably record/visit identifiers rather than features — TODO confirm).
data = common.loadFile("CleanedData").drop(columns=["RID", "VISCODE"])

In [None]:
# Bar chart of how many rows carry each value of the DX column.
plt.figure()
ax = sns.countplot(x=data['DX'])
ax.set_title("Data Diagnosis")
ax.set_ylabel("Count")
ax.set_xlabel("Diagnosis")
# Tick positions 0-2 are relabelled with diagnosis names
# (assumes DX is encoded 0=CN, 1=MCI, 2=Dementia — TODO confirm).
ax.set_xticks(range(0, 3))
ax.set_xticklabels(['CN', 'MCI', 'Dementia'])

# Write each bar's height just below its top edge; the leading newline plus
# va='top' pushes the white text down into the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'\n{bar_height}', (bar_centre, bar_height),
                ha='center', va='top', color='white', size=18)

## Create Training and Test Sets and Apply Scaling

In [None]:
# Collect the three feature sets that every model below is evaluated on.
# `label[i]` is the display name of `data[i]`.
#
# The list is built from the existing `data` object, so guard against
# re-running this cell: wrapping an already-wrapped list would nest it one level
# deeper and break the later `data[i].drop(...)` calls.
if not isinstance(data, list):
    data = [data, common.loadFile("PCAData"), common.loadFile("ICAData")]
label = ["Full Data", "PCA Data", "ICA Data"]

In [None]:
# Build one train/test split per dataset in `data`, then min-max scale it.
#
# All lists below are index-aligned with `data`/`label`:
#   X[i], y[i]             – full feature matrix / DX target vector
#   X_train[i], X_test[i]  – scaled feature splits
#   y_train[i], y_test[i]  – matching target splits
#   scaler[i]              – MinMaxScaler fitted on dataset i's training split
X = []
y = []
X_train = []
X_test = []
y_train = []
y_test = []
scaler = []

for i in range(len(data)):
    X.append(data[i].drop("DX", axis=1).to_numpy().astype('float'))
    y.append(data[i].loc[:, ['DX']].to_numpy().astype('float').flatten())

    xtrain, xtest, ytrain, ytest = train_test_split(X[i], y[i], test_size=0.3, random_state=0)
    y_train.append(ytrain)
    y_test.append(ytest)

    # Append only the scaled splits. Storing the raw splits as well would give
    # X_train/X_test two entries per dataset, so X_train[i] would no longer
    # line up with y_train[i] in the model cells.
    # The scaler is fitted on the training split only and then applied to the
    # test split, so no test-set statistics leak into training.
    scaler.append(MinMaxScaler())
    X_train.append(scaler[i].fit_transform(xtrain))
    X_test.append(scaler[i].transform(xtest))

    # Sanity checks: features and targets of each split cover the same rows.
    assert X_train[i].shape[0] == y_train[i].shape[0]
    assert X_test[i].shape[0] == y_test[i].shape[0]

# Exactly one entry per dataset in every list.
assert len(X_train) == len(X_test) == len(y_train) == len(y_test) == len(data)

## Models

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit one decision tree per dataset and print its accuracy on the training and
# test splits.
for i in range(len(data)):
    print(f"Accuracy of Decision Tree classifier for {label[i]}")
    # Seed the tree: scikit-learn permutes features at each split, so without a
    # fixed random_state the scores can differ between runs even though the
    # train/test split itself is seeded.
    dt = DecisionTreeClassifier(random_state=0).fit(X_train[i], y_train[i])
    print('Training set: {:.2f}'
         .format(dt.score(X_train[i], y_train[i])))
    print('Test set: {:.2f}'
         .format(dt.score(X_test[i], y_test[i])))

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a default k-nearest-neighbours classifier on each dataset's training
# split and print its accuracy on the training and test splits.
for i, dataset_name in enumerate(label):
    knn = KNeighborsClassifier().fit(X_train[i], y_train[i])

    print(f"Accuracy of K-NN classifier for {dataset_name}")
    train_acc = knn.score(X_train[i], y_train[i])
    print(f'Training set: {train_acc:.2f}')
    test_acc = knn.score(X_test[i], y_test[i])
    print(f'Test set: {test_acc:.2f}')

### Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit a default LDA classifier on each dataset's training split and print its
# accuracy on the training and test splits.
for i, dataset_name in enumerate(label):
    lda = LinearDiscriminantAnalysis().fit(X_train[i], y_train[i])
    print(f"Accuracy of LDA classifier for {dataset_name}")
    train_acc = lda.score(X_train[i], y_train[i])
    print(f'Training set: {train_acc:.2f}')
    test_acc = lda.score(X_test[i], y_test[i])
    print(f'Test set: {test_acc:.2f}')

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a default Gaussian naive Bayes classifier on each dataset's training
# split and print its accuracy on the training and test splits.
for i, dataset_name in enumerate(label):
    gnb = GaussianNB().fit(X_train[i], y_train[i])

    print(f"Accuracy of GNB classifier for {dataset_name}")
    train_acc = gnb.score(X_train[i], y_train[i])
    print(f'Training set: {train_acc:.2f}')
    test_acc = gnb.score(X_test[i], y_test[i])
    print(f'Test set: {test_acc:.2f}')

### Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Fit an RBF-kernel support vector classifier on each dataset's training split
# and print its accuracy on the training and test splits.
for i, dataset_name in enumerate(label):
    svm = SVC(kernel='rbf').fit(X_train[i], y_train[i])

    print(f"Accuracy of SVM classifier for {dataset_name}")
    train_acc = svm.score(X_train[i], y_train[i])
    print(f'Training set: {train_acc:.2f}')
    test_acc = svm.score(X_test[i], y_test[i])
    print(f'Test set: {test_acc:.2f}')