In [1]:
Training = 0.3 # Fraction of the data meant for training (0.3 = 30%); also interpolated into the plot titles

In [2]:
import pandas as pd 
import numpy as np
import math

# Graphs
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib widget

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import confusion_matrix

import common

# Some pandas calls made in this notebook are deprecated and will eventually stop working.
# Until they are replaced, every warning is silenced:
import warnings
warnings.filterwarnings(action="ignore")

# Load both datasets and discard the columns that are not fed to the classifiers
data = common.loadFile("CleanedData").drop(columns=["RID", "VISCODE", "PTEDUCAT", "PTGENDER", "AGE"])
reconstructedData = common.loadFile("filledData").drop(columns=["PTEDUCAT", "PTGENDER", "AGE"])

In [3]:
# Classifier classes under comparison; each one is instantiated with its default
# hyper-parameters at the point of use (`algorithm()`).
algorithms = [DecisionTreeClassifier, KNeighborsClassifier, LinearDiscriminantAnalysis, GaussianNB, SVC, LinearSVC, RandomForestClassifier]
# Display names, matched to `algorithms` by position (used as plot categories and dict keys).
labels = ["Decision Tree", "K-Nearest Neighbors", "Linear Discriminant Analysis", "Gaussian Naïve Bayes", "Support Vector Classification", "Linear SVC", "Random Forest"]

# The two lists are always indexed together, so a length mismatch would silently mislabel results.
assert len(algorithms) == len(labels), "algorithms and labels must have the same length"

In [4]:
# Shared fit-then-score helper used for every model
def getModelScore(model, xTrain, xTest, yTrain, yTest):
    """Fit `model` on the training split and return its score on the test split.

    Parameters:
        model: estimator exposing `fit(X, y)` and `score(X, y)`.
        xTrain, yTrain: features and targets passed to `fit`.
        xTest, yTest: features and targets passed to `score`.

    Returns:
        Whatever `model.score(xTest, yTest)` returns (mean accuracy for the
        scikit-learn classifiers imported by this notebook).
    """
    model.fit(xTrain, yTrain)
    testScore = model.score(xTest, yTest)
    return testScore

In [5]:
# Feature matrices: every column except the DX target, cast to float and passed through
# sklearn's normalize() (default settings).
X = normalize(data.drop("DX", axis=1).to_numpy().astype('float'))
# NOTE(review): RX keeps the 'numFilledLabels' column as a feature, whereas the later
# evaluation cells drop it before predicting - confirm this is intended.
RX = normalize(reconstructedData.drop(["DX"], axis=1).to_numpy().astype('float'))

# Targets as flat float vectors
y = data.loc[:,['DX']].to_numpy().astype('float').flatten()
Ry = reconstructedData.loc[:,['DX']].to_numpy().astype('float').flatten()

# `Training` is the share of rows used for *training* (see its definition and the plot
# titles), so it is passed as train_size. Passing it as test_size trained on the
# complementary 70% instead.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=Training)
Rxtrain, Rxtest, Rytrain, Rytest = train_test_split(RX, Ry, train_size=Training)

In [6]:
# Clean data: hold-out score in percent, then the raw per-fold scores of a 10-fold cross-validation
simpleScore = [100 * getModelScore(classifier(), xtrain, xtest, ytrain, ytest) for classifier in algorithms]
crossValidationScore = [cross_val_score(classifier(), X, y, cv=10) for classifier in algorithms]
# Same two measurements on the filled (reconstructed) data
RSimpleScore = [100 * getModelScore(classifier(), Rxtrain, Rxtest, Rytrain, Rytest) for classifier in algorithms]
RCrossValidationScore = [cross_val_score(classifier(), RX, Ry, cv=10) for classifier in algorithms]

In [7]:
# Clean data: hold-out score vs. mean cross-validation score for every algorithm.
plt.figure(figsize=[18, 10])

# Collect one record per bar and build the frame in a single call. DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and growing a frame row by row
# copies it on every iteration.
records = []
for label, holdOut, cvFolds in zip(labels, simpleScore, crossValidationScore):
    records.append({"Algorithm": label, "Method": "Simple Validation", "Score": round(holdOut, 2)})
    # cross_val_score yields one score per fold in [0, 1]; average them and convert to percent
    records.append({"Algorithm": label, "Method": "Cross Validation", "Score": round(np.mean(cvFolds) * 100, 2)})
df = pd.DataFrame(records, columns=['Algorithm', 'Method', 'Score'])

ax = sns.barplot(x="Algorithm", y="Score", hue="Method", data=df)
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f%%")

ax.set(xlabel='Algorithms', ylabel='Scores (%)', title=f"Clean Data Score with {int(Training * 100)}% of Training")
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [8]:
# Filled (reconstructed) data: hold-out score vs. mean cross-validation score for every algorithm.
plt.figure(figsize=[18, 10])

# Collect one record per bar and build the frame in a single call. DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and growing a frame row by row
# copies it on every iteration.
records = []
for label, holdOut, cvFolds in zip(labels, RSimpleScore, RCrossValidationScore):
    records.append({"Algorithm": label, "Method": "Simple Validation", "Score": round(holdOut, 2)})
    # cross_val_score yields one score per fold in [0, 1]; average them and convert to percent
    records.append({"Algorithm": label, "Method": "Cross Validation", "Score": round(np.mean(cvFolds) * 100, 2)})
df = pd.DataFrame(records, columns=['Algorithm', 'Method', 'Score'])

ax = sns.barplot(x="Algorithm", y="Score", hue="Method", data=df)
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f%%")

ax.set(xlabel='Algorithms', ylabel='Scores (%)', title=f"Filled Data Score with {int(Training * 100)}% of Training")
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Validating clean data with reconstructed data

In [9]:
# Training side: every row of the complete dataset.
completeFeatures = data.drop("DX", axis=1)
mixedXtrain = normalize(completeFeatures.to_numpy().astype('float'))
mixedYtrain = data["DX"].to_numpy().astype('float')

# Testing side: the reconstructed data, without the target and the filled-label counter.
reconstructedFeatures = reconstructedData.drop(["DX", 'numFilledLabels'], axis=1)
mixedXtest = normalize(reconstructedFeatures.to_numpy().astype('float'))
mixedYtest = reconstructedData["DX"].to_numpy().astype('float')

In [10]:
# Score in percent of each algorithm trained on the complete data and tested on the reconstructed data
mixedSimpleScore = [
    100 * getModelScore(classifier(), mixedXtrain, mixedXtest, mixedYtrain, mixedYtest)
    for classifier in algorithms
]

In [11]:
# Three bars per algorithm: tested on reconstructed data, hold-out on complete data,
# and mean cross-validation on complete data.
plt.figure(figsize=[18, 8])

# Collect one record per bar and build the frame in a single call. DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and growing a frame row by row
# copies it on every iteration.
records = []
for label, mixedScore, holdOut, cvFolds in zip(labels, mixedSimpleScore, simpleScore, crossValidationScore):
    records.append({"Algorithm": label, "Method": "Reconstructed Validation", "Score": round(mixedScore, 2)})
    records.append({"Algorithm": label, "Method": "Complete Data Validation", "Score": round(holdOut, 2)})
    # cross_val_score yields one score per fold in [0, 1]; average them and convert to percent
    records.append({"Algorithm": label, "Method": "Complete Data Cross Validation", "Score": round(np.mean(cvFolds) * 100, 2)})
df = pd.DataFrame(records, columns=['Algorithm', 'Method', 'Score'])

ax = sns.barplot(x="Algorithm", y="Score", hue="Method", data=df, edgecolor='white', linewidth=1)
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f%%", size=9)

# Anchor the legend above the axes so it does not cover the bars
ax.legend(loc='upper left', fontsize=9, bbox_to_anchor=(0, 1.1))

# NOTE(review): the title still reads "Clean Data Score" although the chart also shows
# scores obtained on the reconstructed data - confirm the wording.
ax.set(xlabel='Algorithms', ylabel='Scores (%)', title=f"Clean Data Score with {int(Training * 100)}% of Training", ylim=(0,100))
ax.yaxis.set_major_locator(ticker.MultipleLocator(10))
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
# Reconstructed rows ordered by their 'numFilledLabels' value
performanceData = reconstructedData.sort_values("numFilledLabels")

In [13]:
# Score every algorithm on the rows whose numFilledLabels equals a given count.
#
# The training set (mixedXtrain / mixedYtrain, built in an earlier cell) is identical for
# every count, so each classifier is fitted once here instead of being refitted inside
# the loop. Every subset is therefore also judged by the same fitted model.
fittedModels = [j().fit(mixedXtrain, mixedYtrain) for j in algorithms]

performanceResults = {}
for i in range(int(performanceData['numFilledLabels'].max()) + 1):
    testingSet = performanceData.loc[performanceData.numFilledLabels == i]
    if len(testingSet) == 0:
        continue  # no row has exactly i filled labels

    testingX = normalize(testingSet.drop(["DX", "numFilledLabels"], axis=1).to_numpy().astype("float"))
    testingY = testingSet.loc[:, ["DX"]].to_numpy().astype("float").flatten()

    # One score (in percent) per algorithm, followed by the number of rows in the subset
    newIteraction = [model.score(testingX, testingY) * 100 for model in fittedModels]
    newIteraction.append(int(len(testingSet)))
    performanceResults[i] = newIteraction

In [14]:
# Column names of the result tables: the count first, one column per algorithm, the subset size last
performanceLabels = ["Index", *labels, "Data Set Size"]

In [15]:
# Empty results table whose columns are the names in performanceLabels
resultDF = pd.DataFrame(columns=performanceLabels)

In [16]:
# One row per entry of performanceResults: [count, <score per algorithm>..., subset size].
#
# The table is built with a single constructor call because:
#   * DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
#   * appending to the existing frame added duplicate rows every time the cell was re-run;
#   * the former `performanceResults[i] == np.nan` guard compared a list with NaN, which
#     is always False, so it never skipped anything.
resultDF = pd.DataFrame(
    [[count, *scores] for count, scores in performanceResults.items()],
    columns=performanceLabels,
)
    

In [17]:
# Scatter of every algorithm's score against the count stored in 'Index'
plt.figure(figsize=[18, 8])
scoreColumns = resultDF.drop("Data Set Size", axis=1)
longScores = pd.melt(scoreColumns, ['Index'])  # long format: one row per (Index, algorithm) pair
ax = sns.scatterplot(data=longScores, x='Index', y='value', hue='variable')
ax.set_xlabel('Number of Missing Labels')
ax.set_ylabel('Scores (%)')
ax.set_title("Rebuild Performance")
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [18]:
# Number of rows evaluated for each count stored in 'Index'
plt.figure(figsize=[18, 8])
ax = sns.barplot(data=resultDF, x='Index', y='Data Set Size')
ax.set_xlabel('Number of Missing Labels')
ax.set_ylabel('Size of Dataset')
ax.set_title("Dataset Rebuild Size")
integerTicks = ticker.MaxNLocator(integer=True)
ax.xaxis.set_major_locator(integerTicks)
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [19]:
# Score every algorithm on the rows whose numFilledLabels is at least a given threshold.
#
# The training set (mixedXtrain / mixedYtrain, built in an earlier cell) is identical for
# every threshold, so each classifier is fitted once here instead of being refitted inside
# the loop. Every subset is therefore also judged by the same fitted model.
fittedModels = [j().fit(mixedXtrain, mixedYtrain) for j in algorithms]

incrementalPerformanceResults = {}
for i in range(int(performanceData['numFilledLabels'].max()) + 1):
    testingSet = performanceData.loc[performanceData.numFilledLabels >= i]
    if len(testingSet) == 0:
        continue  # no row has i or more filled labels

    testingX = normalize(testingSet.drop(["DX", "numFilledLabels"], axis=1).to_numpy().astype("float"))
    testingY = testingSet.loc[:, ["DX"]].to_numpy().astype("float").flatten()

    # One score (in percent) per algorithm, followed by the number of rows in the subset
    newIteraction = [model.score(testingX, testingY) * 100 for model in fittedModels]
    newIteraction.append(int(len(testingSet)))
    incrementalPerformanceResults[i] = newIteraction

In [20]:
# Empty incremental results table whose columns are the names in performanceLabels
incrementalResultDF = pd.DataFrame(columns=performanceLabels)

In [21]:
# One row per entry of incrementalPerformanceResults:
# [threshold, <score per algorithm>..., subset size].
#
# The table is built with a single constructor call because:
#   * DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
#   * appending to the existing frame added duplicate rows every time the cell was re-run;
#   * the former `incrementalPerformanceResults[i] == np.nan` guard compared a list with
#     NaN, which is always False, so it never skipped anything.
incrementalResultDF = pd.DataFrame(
    [[threshold, *scores] for threshold, scores in incrementalPerformanceResults.items()],
    columns=performanceLabels,
)
    

In [22]:
# Scatter of every algorithm's score against the threshold stored in 'Index'.
plt.figure(figsize=[18, 8])
# Long format: one row per (Index, algorithm) pair
incrementalScores = pd.melt(incrementalResultDF.drop("Data Set Size", axis=1), ['Index'])
ax = sns.scatterplot(x='Index', y='value', hue='variable', data=incrementalScores)
# Each point at Index n was scored on the rows with numFilledLabels >= n (inclusive),
# so the axis label says ">=n" rather than ">n".
ax.set(xlabel='Number of Missing Labels (>=n)', ylabel='Scores (%)', title="Incremental Rebuild Performance")
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [23]:
# Number of rows evaluated for each threshold stored in 'Index'.
plt.figure(figsize=[18, 8])
ax = sns.barplot(x='Index', y='Data Set Size', data=incrementalResultDF)
# Each bar at Index n counts the rows with numFilledLabels >= n (inclusive),
# so the axis label says ">=n" rather than ">n".
ax.set(xlabel='Number of Missing Labels (>=n)', ylabel='Size of Dataset', title="Dataset Incremental Rebuild Size")
for container in ax.containers:
    ax.bar_label(container)

ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [47]:
# Setup the labels
# NOTE(review): assumes the fitted classes_ (sorted DX codes) correspond to
# CN, MCI, Dementia in this order, and that there are exactly three of them - confirm.
dxLabels = ['CN', 'MCI', 'Dementia']

# Create models: one fitted instance per algorithm, trained on the complete dataset
models = [x().fit(mixedXtrain, mixedYtrain) for x in algorithms]

# graphs[i] maps every algorithm label to its confusion matrix on the rows with
# numFilledLabels >= i, plus a "size" entry holding the number of such rows.
graphs = []

# NOTE(review): the 40 thresholds are hard-coded; this presumes at least one row has
# numFilledLabels >= 39 - confirm against the data.
for i in range(40):
    testingSet = performanceData.loc[performanceData.numFilledLabels >= i]
    # The models were fitted on normalize()d float arrays, so the rows to predict must go
    # through the same transformation. Predicting on the raw DataFrame fed the models
    # features on a different scale from the one they were trained on.
    currentTest = normalize(testingSet.drop(["DX", "numFilledLabels"], axis=1).to_numpy().astype("float"))
    # Same float cast as the training targets, so the values line up with classes_
    trueLabels = testingSet["DX"].to_numpy().astype("float")
    interaction = {}
    for label, model in zip(labels, models):
        cm = confusion_matrix(trueLabels, model.predict(currentTest), labels=model.classes_)
        interaction[label] = pd.DataFrame(cm, dxLabels, dxLabels)
    interaction["size"] = len(testingSet)
    graphs.append(interaction)
    

In [35]:
# Number of entries collected in `graphs` (one per threshold)
len(graphs)

40

In [59]:
# Algorithm indices: 0 - Decision Tree, 1 - K-Nearest Neighbors, 2 - Linear Discriminant Analysis, 3 - Gaussian Naïve Bayes, 4 - Support Vector Classification, 5 - Linear SVC, 6 - Random Forest
index = 0 # Insert the minimum number of missing values (0 to 39)
algorithm = 2 # Insert the algorithm you wish to plot

plt.figure(figsize=(8,5))

ax = sns.heatmap(graphs[index][labels[algorithm]], annot=True, cmap=plt.cm.Blues, fmt='g')
size = graphs[index]["size"]
# graphs[index] was computed on the rows with numFilledLabels >= index (inclusive),
# so the title says ">=" rather than ">".
ax.set(xlabel='Predicted Labels', ylabel='True Labels', title=f"{labels[algorithm]} Algorithm for dataset >={index} with size {size}")


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[Text(0.5, 25.722222222222214, 'Predicted Labels'),
 Text(70.72222222222221, 0.5, 'True Labels'),
 Text(0.5, 1.0, 'Linear Discriminant Analysis Algorithm for dataset >0 with size 5025')]

In [50]:
# Number of rows in the subset behind the currently selected confusion matrix
graphs[index]["size"]

5025