# ICA Classification - Reconstructed Data

In [27]:
import pandas as pd
import numpy as np

# Graphs
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import FastICA

# Preprocessing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC, SVR, LinearSVR
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import confusion_matrix

import common


# The way Pandas is being used in this file is deprecated. It will eventually stop working. The next lines hide all warnings:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Complete-case dataset: drop the VISCODE/RID columns, then discard every row
# that still has a missing value.
# NOTE(review): assumes common.loadFile returns a pandas DataFrame — confirm.
cleanedData = common.loadFile("CleanedData")
data = cleanedData.drop(columns=["VISCODE", "RID"]).dropna()

# Dataset loaded from "filledData"; later cells read its DX and
# numFilledLabels columns.
reconstructedData = common.loadFile("filledData")

In [3]:
# Bar chart of how many rows of the complete-case data fall in each diagnosis.
plt.figure(figsize=[5, 5])
ax = sns.countplot(x=data['DX'])
plt.title("Data Diagnosis")
plt.xlabel("Diagnosis")
plt.ylabel("Count")
# Tick positions 0..2 are relabelled with the diagnosis names.
plt.xticks(ticks=range(0,3), labels=['CN', 'MCI', 'Dementia'])

# Write each bar's height inside the bar, just below its top edge.
for p in ax.patches:
    barHeight = p.get_height()
    barCenter = p.get_x() + p.get_width() / 2
    ax.annotate(f'\n{barHeight}', (barCenter, barHeight), ha='center',
                va='top', color='white', size=18)
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [4]:
# Data
# Feature matrices as float arrays.
# DataX: every column of the complete-case data except the DX label.
DataX = data.drop(columns="DX").to_numpy().astype(float)
# RDataX: reconstructed data without the DX label and the numFilledLabels column.
RDataX = reconstructedData.drop(columns=["DX", "numFilledLabels"]).to_numpy().astype(float)

In [5]:
def getComponentNames(components: np.ndarray) -> list:
    """Return one label per component: 'IC1', 'IC2', ...

    The number of labels equals ``len(components.T)``, i.e. the number of
    columns of a 2-D array.
    """
    total = len(components.T)
    return [f"IC{position}" for position in range(1, total + 1)]

In [6]:
components = 10
Training = 0.3

# Creating ICA object (fixed random_state so the decomposition is repeatable)
ICA = FastICA(n_components=components, random_state=0)

# Fit the unmixing on the complete-case data only, then project the
# reconstructed data with that same fitted transform.
X_transformed = ICA.fit_transform(DataX)
XR_transformed = ICA.transform(RDataX)

Reduced_Data = pd.DataFrame(data=X_transformed, columns=getComponentNames(X_transformed))
Reduced_Reconstructed_Data = pd.DataFrame(data=XR_transformed, columns=getComponentNames(XR_transformed))

# Rebuild dataset
# The component frames above carry a fresh 0..n-1 index, while `data` was
# filtered with dropna() and may therefore have gaps in its index.
# pd.concat(axis=1) aligns on index labels, so the label column is re-indexed
# positionally first; otherwise rows could be paired with the wrong DX value
# or padded with NaN.
ICA_Data = pd.concat([Reduced_Data, data[['DX']].reset_index(drop=True)], axis=1)
ICA_Data_Reconstructed = pd.concat(
    [Reduced_Reconstructed_Data, reconstructedData[['DX']].reset_index(drop=True)],
    axis=1,
)

In [44]:
# Classifier classes to evaluate (instantiated with default arguments where
# they are used) and their display names. The two lists are parallel:
# labels[k] is the name shown on plots for algorithms[k].
algorithms = [
    DecisionTreeClassifier,
    KNeighborsClassifier,
    LinearDiscriminantAnalysis,
    GaussianNB,
    SVC,
    LinearSVC,
    RandomForestClassifier,
]
labels = [
    "Decision Tree",
    "K-Nearest Neighbors",
    "Linear Discriminant Analysis",
    "Gaussian Naïve Bayes",  # spelling fixed (was "Guassian"); shown on plots
    "Support Vector Classification",
    "Linear SVC",
    "Random Forest",
]

In [8]:
# Optimizing testing for every model
def getModelScore(model, xTrain, xTest, yTrain, yTest):
    model.fit(xTrain, yTrain)
    return model.score(xTest, yTest)

In [9]:
# Training pair: ICA components of the complete-case data, passed through
# sklearn's `normalize` with its default settings, plus the DX labels as a
# flat float array.
icaFeatures = ICA_Data.drop(columns="DX")
X = normalize(icaFeatures.to_numpy().astype(float))
y = ICA_Data['DX'].to_numpy().astype(float)

# Evaluation pair: the same preparation applied to the reconstructed data.
icaFeaturesReconstructed = ICA_Data_Reconstructed.drop(columns="DX")
XR = normalize(icaFeaturesReconstructed.to_numpy().astype(float))
yR = ICA_Data_Reconstructed['DX'].to_numpy().astype(float)

In [10]:
# Fit a fresh instance of each classifier on the complete data (X, y) and
# score it on the reconstructed data (XR, yR); scores are stored as percentages.
simpleScore = [
    getModelScore(classifier(), X, XR, y, yR) * 100
    for classifier in algorithms
]

In [11]:
plt.figure(figsize=[18, 10])

# Build the results frame in a single step from a list of records.
# DataFrame.append (used here previously, once per row) was removed in
# pandas 2.0 and copied the whole frame on every call.
df = pd.DataFrame(
    [
        {"Algorithm": name, "Method": "Simple Validation", "Score": round(score, 2)}
        for name, score in zip(labels, simpleScore)
    ],
    columns=['Algorithm', 'Method', 'Score'],
)

ax = sns.barplot(x="Algorithm", y="Score", data=df)
# Print each bar's score, as a percentage, on top of the bar.
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f%%")

ax.set(xlabel='Algorithms', ylabel='Scores (%)', title="Reconstructed Data on ICA complete data")
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [30]:
# Score every classifier on progressively smaller subsets of the reconstructed
# data: for threshold i, only rows with numFilledLabels >= i are tested.
performanceData = reconstructedData.sort_values(by='numFilledLabels')
incrementalPerformanceResults = {}
maxFilledLabels = int(performanceData['numFilledLabels'].max())
for i in range(maxFilledLabels + 1):
    testingSet = performanceData.loc[performanceData.numFilledLabels >= i]
    if len(testingSet) == 0:
        # Nothing to evaluate at this threshold.
        continue

    # Project the subset with the already-fitted ICA, then normalize it the
    # same way the training features were.
    testingFeatures = testingSet.drop(["DX", "numFilledLabels"], axis=1)
    XR_transformed = ICA.transform(testingFeatures.to_numpy().astype("float"))

    testingX = normalize(XR_transformed)
    testingY = testingSet.loc[:, ["DX"]].to_numpy().astype("float").flatten()

    # Notice: The training set (X, y) was defined before the previous graph.
    # Each entry holds one percentage score per algorithm followed by the
    # subset size as the last element.
    newIteraction = [getModelScore(j(), X, testingX, y, testingY) * 100 for j in algorithms]
    newIteraction.append(int(len(testingSet)))
    incrementalPerformanceResults[i] = newIteraction

In [18]:
# Column names for the incremental results table: the threshold index first,
# then one column per algorithm label, then the size of the tested subset.
performanceLabels = ["Index", *labels, "Data Set Size"]

In [19]:
# Empty results table with one column per entry of performanceLabels.
incrementalResultDF = pd.DataFrame(columns=performanceLabels)

In [20]:
# Turn the {threshold: [scores..., subset size]} mapping into a table with one
# row per threshold, pairing each value with its column name.
#
# The frame is built in one step because DataFrame.append was removed in
# pandas 2.0; this also makes the cell idempotent, since re-running it no
# longer appends duplicate rows. The previous `== np.nan` guard was dropped:
# a comparison with NaN is always False, so it never skipped anything.
incrementalResultDF = pd.DataFrame(
    [
        dict(zip(performanceLabels, [threshold, *results]))
        for threshold, results in incrementalPerformanceResults.items()
    ],
    columns=performanceLabels,
)
    

In [21]:
# Scatter of every algorithm's score against the missing-label threshold.
# The table is reshaped to long form (one row per threshold/algorithm pair)
# after dropping the subset-size column, so the algorithm can drive the hue.
scoresLong = pd.melt(incrementalResultDF.drop(columns="Data Set Size"), ['Index'])

plt.figure(figsize=[18, 8])
ax = sns.scatterplot(data=scoresLong, x='Index', y='value', hue='variable')
ax.set(title="Incremental Rebuild Performance using ICA", xlabel='Number of Missing Labels (>n)', ylabel='Scores (%)')
plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [24]:
# Bar chart of how many rows were tested at each missing-label threshold.
plt.figure(figsize=[18, 8])
ax = sns.barplot(data=incrementalResultDF, x='Index', y='Data Set Size')
ax.set(title="Dataset Incremental Rebuild Size", xlabel='Number of Missing Labels (>n)', ylabel='Size of Dataset')

# Show the value of every bar above it.
for container in ax.containers:
    ax.bar_label(container)

# Restrict the x axis to integer tick positions.
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

plt.tight_layout(pad=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [45]:
# Setup the labels
dxLabels = ['CN', 'MCI', 'Dementia']

# Create models
models = [x().fit(X, y) for x in algorithms]
    
graphs = []

for i in range(40):
    testingSet = performanceData.loc[performanceData.numFilledLabels >= i]
    currentTest = testingSet.drop(["DX", "numFilledLabels"], axis=1)
    
    XR_transformed = ICA.transform(testingSet.drop(["DX", "numFilledLabels"], axis=1).to_numpy().astype("float"))   
    testingX = normalize(XR_transformed)
    
    interaction = {}
    
    for j in range(len(models)):
        cm = confusion_matrix(testingSet.DX, models[j].predict(testingX), labels=models[j].classes_)
        df = pd.DataFrame(cm, dxLabels, dxLabels)
        interaction[labels[j]] = df
    interaction["size"] = len(testingSet)
    graphs.append(interaction)
    

In [46]:
# Number of thresholds for which confusion matrices were collected.
len(graphs)

40

In [48]:
# Algorithm numbers follow the order of `labels`:
# 0 - Decision Tree, 1 - K-Nearest Neighbors, 2 - Linear Discriminant Analysis,
# 3 - Gaussian Naïve Bayes, 4 - Support Vector Classification, 5 - Linear SVC,
# 6 - Random Forest
index = 39  # Insert the number of missing values (0 to 39)
algorithm = 2  # Insert the algorithm you wish to plot

# Look up the stored confusion matrix and subset size for the chosen threshold.
selectedGraphs = graphs[index]
size = selectedGraphs["size"]

plt.figure(figsize=(8,5))
ax = sns.heatmap(selectedGraphs[labels[algorithm]], annot=True, cmap=plt.cm.Blues, fmt='g')
ax.set(xlabel='Predicted Labels', ylabel='True Labels', title=f"{labels[algorithm]} Algorithm for dataset >{index} with size {size}")


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[Text(0.5, 25.722222222222214, 'Predicted Labels'),
 Text(70.72222222222221, 0.5, 'True Labels'),
 Text(0.5, 1.0, 'Linear Discriminant Analysis Algorithm for dataset >39 with size 204')]