In [1]:
# Fraction of the dataset meant for training (0.3 == 30%). The plot titles
# further down print this value as the training percentage.
Training = 0.3 # Using 30% of the data to train

In [2]:
import pandas as pd 
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib widget

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC, SVR, LinearSVR
from sklearn.ensemble import RandomForestClassifier

import common

# Read both datasets through the project's `common` helper.
# The cleaned set has its "RID" and "VISCODE" columns removed right after loading.
data = (
    common.loadFile("CleanedData")
    .drop(columns=["RID", "VISCODE"])
)
reconstructedData = common.loadFile("filledData")

In [3]:
# (display label, estimator class) pairs are kept side by side so that a label
# can never drift away from the estimator it describes.
# NOTE(review): SVR and LinearSVR are regressors; their `.score` is the R^2
# coefficient, not classification accuracy, so their bars are not directly
# comparable with the classifiers' bars. Confirm they belong in this comparison.
_MODELS = [
    ("Decision Tree", DecisionTreeClassifier),
    ("K-Nearest Neighbors", KNeighborsClassifier),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis),
    ("Gaussian Naïve Bayes", GaussianNB),
    ("Support Vector Classification", SVC),
    ("Linear SVC", LinearSVC),
    ("Support Vector Regression", SVR),
    ("Linear SVR", LinearSVR),
    ("Random Forest", RandomForestClassifier),
]

# Parallel lists consumed by the scoring and plotting cells (same order as before).
algorithms = [estimatorClass for _modelLabel, estimatorClass in _MODELS]
labels = [modelLabel for modelLabel, _estimatorClass in _MODELS]

In [4]:
# Shared fit-then-score routine used for every model
def getModelScore(model, xTrain, xTest, yTrain, yTest):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
    xTrain, yTrain : features / targets handed to ``model.fit``
    xTest, yTest : features / targets handed to ``model.score``

    Returns
    -------
    Whatever ``model.score(xTest, yTest)`` returns.
    """
    model.fit(xTrain, yTrain)
    heldOutScore = model.score(xTest, yTest)
    return heldOutScore

In [5]:
# Feature matrices: every column except the "DX" target, cast to float.
# NOTE(review): sklearn's `normalize` defaults to norm="l2", axis=1, i.e. it
# rescales each *sample* (row) to unit length rather than scaling each feature
# column -- confirm this is the intended preprocessing.
X = normalize(data.drop("DX", axis=1).to_numpy().astype('float'))
RX = normalize(reconstructedData.drop("DX", axis=1).to_numpy().astype('float'))

# Target vectors: the "DX" column as a flat float array.
y = data["DX"].to_numpy().astype('float')
Ry = reconstructedData["DX"].to_numpy().astype('float')

# `Training` is the share of rows used for *fitting*, so it must be passed as
# train_size. Passing it as test_size would train on the remaining 70% and
# contradict the "30% of Training" plot titles.
# A fixed seed keeps the split identical across Restart & Run All.
SPLIT_SEED = 42
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=Training, random_state=SPLIT_SEED
)
Rxtrain, Rxtest, Rytrain, Rytest = train_test_split(
    RX, Ry, train_size=Training, random_state=SPLIT_SEED
)

In [6]:
def _holdoutPercentages(estimatorClasses, xTr, xTe, yTr, yTe):
    """Hold-out score (times 100) of a freshly built instance of every class."""
    return [getModelScore(cls(), xTr, xTe, yTr, yTe) * 100 for cls in estimatorClasses]


def _tenFoldScores(estimatorClasses, features, target):
    """Per-fold scores from 10-fold cross validation for every class."""
    return [cross_val_score(cls(), features, target, cv=10) for cls in estimatorClasses]


# Cleaned dataset first, then the filled ("reconstructed") dataset.
simpleScore = _holdoutPercentages(algorithms, xtrain, xtest, ytrain, ytest)
crossValidationScore = _tenFoldScores(algorithms, X, y)
RSimpleScore = _holdoutPercentages(algorithms, Rxtrain, Rxtest, Rytrain, Rytest)
RCrossValidationScore = _tenFoldScores(algorithms, RX, Ry)

In [10]:
# Grouped bar chart: hold-out vs. 10-fold cross-validation score per algorithm
# for the cleaned dataset.
fig, ax = plt.subplots(figsize=(18, 10))

# Collect every row first and build the frame once: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
records = []
for label, holdout, foldScores in zip(labels, simpleScore, crossValidationScore):
    records.append({"Algorithm": label, "Method": "Simple Validation", "Score": round(holdout, 2)})
    # Mean fold score of the *clean* data, as a percentage. This must read
    # crossValidationScore; RCrossValidationScore belongs to the filled dataset.
    records.append({"Algorithm": label, "Method": "Cross Validation", "Score": round(float(np.mean(foldScores)) * 100, 2)})
df = pd.DataFrame(records, columns=['Algorithm', 'Method', 'Score'])

sns.barplot(x="Algorithm", y="Score", hue="Method", data=df, ax=ax)
# Print the numeric value on top of every bar.
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f%%")

ax.set(xlabel='Algorithms', ylabel='Scores (%)', title=f"Clean Data Score with {int(Training * 100)}% of Training")
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [9]:
# Grouped bar chart: hold-out vs. 10-fold cross-validation score per algorithm
# for the filled ("reconstructed") dataset.
fig, ax = plt.subplots(figsize=(18, 10))

# Collect every row first and build the frame once: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
records = []
for label, holdout, foldScores in zip(labels, RSimpleScore, RCrossValidationScore):
    records.append({"Algorithm": label, "Method": "Simple Validation", "Score": round(holdout, 2)})
    # Mean of the per-fold scores, expressed as a percentage.
    records.append({"Algorithm": label, "Method": "Cross Validation", "Score": round(float(np.mean(foldScores)) * 100, 2)})
df = pd.DataFrame(records, columns=['Algorithm', 'Method', 'Score'])

sns.barplot(x="Algorithm", y="Score", hue="Method", data=df, ax=ax)
# Print the numeric value on top of every bar.
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f%%")

ax.set(xlabel='Algorithms', ylabel='Scores (%)', title=f"Filled Data Score with {int(Training * 100)}% of Training")
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …