In [1]:
import pandas as pd 
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib widget

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import common

# Loading Datasets
data = common.loadFile("CleanedData").drop(["RID", "VISCODE"], axis=1)
                                           
#Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

#Analysis
from sklearn.decomposition import FastICA

# Testing
import warnings

# Component per Classification

The goal here is to know how many components we should use. 

In [2]:
plt.close()
ax = sns.countplot(x=data['DX'])
plt.title("Data Diagnosis")
plt.ylabel("Count")
plt.xlabel("Diagnosis")
plt.xticks(ticks=range(0,3), labels=['CN', 'MCI', 'Dementia'])

for p in ax.patches:
        ax.annotate(f'\n{p.get_height()}', (p.get_x()+p.get_width()/2, p.get_height()), ha='center', 
                    va='top', color='white', size=18)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## ICA

First a function to calculate ICA. This function is also used in the file PCA.ipynb

In [3]:
def calcICA(numberComponents: int) -> pd.DataFrame:
    # Creating ICA object with max interactions, the default is 200
    ICA = FastICA(n_components=numberComponents, max_iter=1000)
    IndependentComponentValues=ICA.fit_transform(data.drop(["DX"], axis=1), data.loc[:,['DX']].values)

    def getColumnNames() -> list:
        res = []
        for i in range(1, len(IndependentComponentValues.T)+1):
            res.append('IC'+str(i))
        return res

    del ICA
    #Creating the dataframe
    return pd.DataFrame(data=IndependentComponentValues, columns=getColumnNames())



In [4]:
def classifierBattery(ICA: pd.DataFrame) -> dict:
    classifier = {}
    
    X = ICA.to_numpy().astype('float')
    y = data.loc[:,['DX']].to_numpy().astype('float').flatten()
    
    xtrain, xtest, ytrain, ytest= train_test_split(X, y, random_state=0)
    X_train = xtrain
    X_test = xtest
    y_train = ytrain
    y_test = ytest

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(xtrain)
    X_test = scaler.transform(xtest)
    
    # Decision Tree
    clf = DecisionTreeClassifier().fit(X_train, y_train)
    classifier["DecisionTree"] = {"Train" : clf.score(X_train, y_train), "Test" : clf.score(X_test, y_test)}
    
    # K-Nearest Neighbors                      
    knn = KNeighborsClassifier().fit(X_train, y_train)
    classifier["KNN"] = {"Train" : knn.score(X_train, y_train), "Test" : knn.score(X_test, y_test)}
    
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
    classifier["LDA"] = {"Train" : lda.score(X_train, y_train), "Test" : lda.score(X_test, y_test)}
    
    # Gaussian Naive Bayes
    gnb = GaussianNB().fit(X_train, y_train)
    classifier["GNB"] = {"Train" : gnb.score(X_train, y_train), "Test" : gnb.score(X_test, y_test)}
    
    #Support Vector Machine
    svm = SVC(kernel='rbf').fit(X_train, y_train)
    classifier["SVM"] = {"Train" : svm.score(X_train, y_train), "Test" : svm.score(X_test, y_test)}
    
    return classifier

Here we go!

In [5]:
classifiers = pd.DataFrame(columns=["Number Components", "Algorithm", "Value", "Phase"])
counter = 0
size = 18 #len(data.T)
for i in range(size):
    try:
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("error")
            ica = calcICA(i+1)
            res = (classifierBattery(ica))
    except:
        res = ({"DecisionTree": {"Train" : 0, "Test": 0}, "KNN":{"Train" : 0, "Test": 0}, "LDA":{"Train" : 0, "Test": 0}, "GNB":{"Train" : 0, "Test": 0}, "SVM":{"Train" : 0, "Test": 0}})
    
    for x in res:
        classifiers = classifiers.append({"Number Components" : i+1, "Algorithm" : x, "Value" : res[x]["Train"], "Phase" : "Train"}, ignore_index=True)
        classifiers = classifiers.append({"Number Components" : i+1, "Algorithm" : x, "Value" : res[x]["Test"], "Phase" : "Test"}, ignore_index=True)
    
        
    if i % math.ceil(size*0.1) == 0:
        print(counter*10, '%')
        counter += 1



0 %
10 %
20 %
30 %
40 %
50 %
60 %
70 %
80 %


In [6]:
plt.tight_layout()
sns.relplot(x="Number Components", y="Value", hue="Phase", kind="line", data=classifiers.loc[classifiers["Algorithm"] == "DecisionTree"], height=9, aspect=1.5).set(title="Decision Tree").tight_layout(w_pad=5)
sns.relplot(x="Number Components", y="Value", hue="Phase", kind="line", data=classifiers.loc[classifiers["Algorithm"] == "KNN"], height=9, aspect=1.5).set(title="K-Nearest Neighbors").tight_layout(w_pad=5)
sns.relplot(x="Number Components", y="Value", hue="Phase", kind="line", data=classifiers.loc[classifiers["Algorithm"] == "LDA"], height=9, aspect=1.5).set(title="Linear Discriminant Analysis").tight_layout(w_pad=5)
sns.relplot(x="Number Components", y="Value", hue="Phase", kind="line", data=classifiers.loc[classifiers["Algorithm"] == "GNB"], height=9, aspect=1.5).set(title="Gaussian Naive Bayes").tight_layout(w_pad=5)
sns.relplot(x="Number Components", y="Value", hue="Phase", kind="line", data=classifiers.loc[classifiers["Algorithm"] == "SVM"], height=9, aspect=1.5).set(title="Support Vector Machine").tight_layout(w_pad=5)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [7]:
bestTest = pd.concat([pd.DataFrame(y.loc[y.loc[y["Phase"] == "Test"]['Value'].idxmax()]).T for y in [classifiers.loc[classifiers["Algorithm"] == x] for x in classifiers["Algorithm"].unique()]])

In [8]:
best = bestTest.append(pd.concat([classifiers.loc[(classifiers["Phase"] == "Train") & (classifiers["Algorithm"] == x) & (classifiers["Number Components"] == y)] for x,y in zip(bestTest["Algorithm"], bestTest["Number Components"])])).sort_values("Algorithm")

In [9]:

plt.figure()
sns.barplot(x="Algorithm", y="Number Components", hue="Phase", data=best)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:xlabel='Algorithm', ylabel='Number Components'>