In [60]:
import pandas as pd
import networkx as nx
import scipy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.metrics import f1_score, SCORERS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from utils.tools import evaluate_results_nc
from utils.data import load_PPMI_data
import data_utils as du

# One seed for every stochastic estimator below, so a fresh
# "Restart Kernel -> Run All" reproduces the same scores.
SEED = 43

data_dir = du.find_data_dir('app')
dataloader_file = du.get_file_path(data_dir, 'class based structure', 'dataloaders', 'dataloader.p')
# NOTE(security): this presumably unpickles the file; unpickling can execute
# arbitrary code, so only point it at a trusted, locally produced dataloader.
dataloader = du.read_from_pickle(dataloader_file)
adjlists, edge_metapath_indices_list, features_list, adjM, type_mask, labels, train_val_test_idx = load_PPMI_data()

# Sorted positional indices of each split as provided by load_PPMI_data().
train_idx = np.sort(train_val_test_idx['train_idx'])
test_idx = np.sort(train_val_test_idx['test_idx'])
val_idx = np.sort(train_val_test_idx['val_idx'])

# Display names; must stay in the same order as `classifiers` below because
# the two lists are zipped together wherever they are used.
names = ["Nearest Neighbors 4", "Linear SVM", "Linear SVM MAGNN",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes"]

# Estimators with an internal source of randomness receive random_state=SEED;
# the remaining ones are left exactly as configured before.
classifiers = [
    KNeighborsClassifier(4),
    SVC(kernel="linear", C=0.025),
    LinearSVC(dual=False),
    DecisionTreeClassifier(max_depth=5, random_state=SEED),
    RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1, random_state=SEED),
    MLPClassifier(alpha=1, max_iter=1000, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB()
]

# NOTE(review): features and labels are restricted to the `test_idx` rows, so
# every baseline below is trained and scored on that subset only -- presumably
# to match the evaluation set used for MAGNN; confirm this is intended.
X = dataloader.X.iloc[test_idx].values
y = np.array(dataloader.y.iloc[test_idx]).astype(int)

In [61]:
# Show the target class balance: per the table rendered below, the number of
# samples and the percentage share for each class, plus the total.
dataloader.target_distribution_stats()

Unnamed: 0,Amount,Percentage
-1,91,39.9%
0,70,30.7%
1,67,29.4%
Total,228,100.0%


### Functions

In [62]:
def clf_test(X, y, clf, test_sizes=(0.2, 0.4, 0.6, 0.8), repeat=10):
    """Score ``clf`` over repeated random train/test splits.

    For every fraction in ``test_sizes`` the data is split ``repeat`` times
    (split seeds ``182318``, ``182319``, ...), ``clf`` is re-fitted on the
    training part and macro/micro F1 are measured on the held-out part.

    Returns:
        ``(macro, micro)`` -- two lists with one ``(mean, std)`` tuple per
        entry of ``test_sizes``, in the same order.
    """
    split_seeds = [182318 + offset for offset in range(repeat)]

    def _mean_and_std(scores):
        # Collapse the per-repeat scores into the reported (mean, std) pair.
        return (np.mean(scores), np.std(scores))

    result_macro_f1_list, result_micro_f1_list = [], []
    for held_out_fraction in test_sizes:
        macro_scores, micro_scores = [], []
        for seed in split_seeds:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=held_out_fraction, shuffle=True, random_state=seed)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            macro_scores.append(f1_score(y_test, y_pred, average='macro'))
            micro_scores.append(f1_score(y_test, y_pred, average='micro'))
        result_macro_f1_list.append(_mean_and_std(macro_scores))
        result_micro_f1_list.append(_mean_and_std(micro_scores))
    return result_macro_f1_list, result_micro_f1_list

def format_mean_std(mean_std):
    """Render a ``(mean, std)`` pair of fractions as a percentage string.

    Both values are scaled by 100 and printed with one decimal place, joined
    by ``~``; for example ``(0.5, 0.25)`` becomes ``'50.0%~25.0%'``.
    """
    mean_fraction, std_fraction = mean_std
    mean_perc, std_perc = mean_fraction * 100, std_fraction * 100
    return f'{mean_perc:.1f}%~{std_perc:.1f}%'

def create_confusion_matrix(y_true, y_predicted, classes=(-1, 0, 1)):
    """Build a labelled confusion matrix as an integer DataFrame.

    Rows are indexed by ``('Predicted', class)`` and columns by
    ``('True/Actual', class)``; each cell counts the samples with that
    predicted/true combination.

    Args:
        y_true: iterable of ground-truth labels.
        y_predicted: iterable of predicted labels, aligned with ``y_true``.
        classes: labels to tabulate, in display order. Defaults to the three
            classes ``(-1, 0, 1)`` used throughout this notebook.

    Returns:
        ``pandas.DataFrame`` of shape ``(len(classes), len(classes))`` holding
        integer counts.

    Raises:
        KeyError: if a label occurs that is not listed in ``classes``.
    """
    true_actual_str = 'True/Actual'
    predicted_str = 'Predicted'
    classes = list(classes)

    # Tally in a plain integer array and wrap it in a DataFrame once at the
    # end. Incrementing through chained indexing (``df[col][row] += 1``)
    # updates a temporary object under pandas Copy-on-Write and would leave
    # the matrix silently filled with zeros.
    position = {label: offset for offset, label in enumerate(classes)}
    counts = np.zeros((len(classes), len(classes)), dtype=int)
    for truth, prediction in zip(y_true, y_predicted):
        counts[position[prediction], position[truth]] += 1

    return pd.DataFrame(
        counts,
        index=pd.MultiIndex.from_product([[predicted_str], classes]),
        columns=pd.MultiIndex.from_product([[true_actual_str], classes]),
    )

def get_classifier(target, names, classifiers):
    """Return the classifier paired with the display name ``target``.

    ``names`` and ``classifiers`` are walked in parallel; the first entry
    whose name equals ``target`` wins. ``None`` is returned when no name
    matches.
    """
    matching = (
        candidate
        for label, candidate in zip(names, classifiers)
        if label == target
    )
    return next(matching, None)

### Making random guesses

In [63]:
# Chance-level baseline: score 1000 uniformly random label vectors against the
# held-out labels and report mean~std of macro/micro F1.

# Derive the held-out labels here, with the same split parameters as the
# 'Create sample confusion matrix' section (test_size=.5, random_state=43),
# so this cell does not rely on a variable assigned by a later cell.
y_test = train_test_split(X, y, test_size=.5, random_state=43)[3]

# Dedicated seeded generator so the reported numbers are reproducible.
random_guess_rng = np.random.RandomState(43)

macro_f1_list = []
micro_f1_list = []
for i in range(1000):
    # `high` is exclusive, so guesses are drawn uniformly from {-1, 0, 1}.
    y_random_pred = random_guess_rng.randint(low=-1, high=2, size=len(y_test))
    macro_f1_list.append(f1_score(y_test, y_random_pred, average='macro'))
    micro_f1_list.append(f1_score(y_test, y_random_pred, average='micro'))
print('Macro: :', format_mean_std((np.mean(macro_f1_list), np.std(macro_f1_list))))
print('Micro: :', format_mean_std((np.mean(micro_f1_list), np.std(micro_f1_list))))

Macro: : 33.1%~7.0%
Micro: : 33.6%~6.9%


### Predicting most common class

In [64]:
# Majority-class baseline: predict the most frequent label for every sample.

# Derive the held-out labels here, with the same split parameters as the
# 'Create sample confusion matrix' section (test_size=.5, random_state=43),
# so this cell does not rely on a variable assigned by a later cell.
y_test = train_test_split(X, y, test_size=.5, random_state=43)[3]

# Look the majority label up instead of hard-coding it; ties resolve to the
# smallest label because np.unique returns its values sorted.
class_values, class_counts = np.unique(y_test, return_counts=True)
most_common_class = class_values[np.argmax(class_counts)]

y_common_class_pred = np.repeat(most_common_class, len(y_test))
macro_common = f1_score(y_test, y_common_class_pred, average='macro')*100
micro_common = f1_score(y_test, y_common_class_pred, average='micro')*100

print('Macro: :', f'{macro_common:.1f}%')
print('Micro: :', f'{micro_common:.1f}%')

Macro: : 19.5%
Micro: : 41.3%


### Create sample confusion matrix

In [72]:
# Fit one named baseline on a single 50/50 split and inspect where its
# predictions land relative to the true labels.
clf = get_classifier('Decision Tree', names, classifiers)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.5, random_state=43)

print('shapes X, y', X.shape, y.shape)

# fit() returns the fitted estimator, so training and prediction can be chained.
predicted = clf.fit(X_train, y_train).predict(X_test)

# F1 scores expressed as percentages, macro first and micro second.
macro_sample, micro_sample = (
    f1_score(y_test, predicted, average=averaging) * 100
    for averaging in ('macro', 'micro')
)
print('Macro: :', f'{macro_sample:.1f}%')
print('Micro: :', f'{micro_sample:.1f}%')

# Last expression of the cell: rendered as the confusion-matrix table.
create_confusion_matrix(y_test, predicted)

shapes X, y (92, 253) (92,)
Macro: : 45.8%
Micro: : 47.8%


Unnamed: 0_level_0,Unnamed: 1_level_0,True/Actual,True/Actual,True/Actual
Unnamed: 0_level_1,Unnamed: 1_level_1,-1,0,1
Predicted,-1,12,6,3
Predicted,0,6,6,6
Predicted,1,1,2,4


### Evaluate the baseline classifiers over repeated train/test splits for comparison

In [75]:
%%time
# Column labels are *training* fractions, while clf_test is parameterised by
# the *test* fraction; derive one from the other so the pairing is explicit.
train_sizes = [0.8, 0.6, 0.4, 0.2]
test_sizes = [round(1 - train_size, 2) for train_size in train_sizes]

# Collect one row of formatted "mean%~std%" strings per classifier and build
# each DataFrame once at the end. DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies it on every iteration.
macro_rows = []
micro_rows = []
for name, clf in zip(names, classifiers):
    result_macro_f1_list, result_micro_f1_list = clf_test(X, y, clf, test_sizes=test_sizes)
    macro_rows.append([format_mean_std(mean_std) for mean_std in result_macro_f1_list])
    micro_rows.append([format_mean_std(mean_std) for mean_std in result_micro_f1_list])

df_tests_macro = pd.DataFrame(macro_rows, index=names, columns=train_sizes)
df_tests_micro = pd.DataFrame(micro_rows, index=names, columns=train_sizes)

KeyboardInterrupt: 

In [262]:
# Macro-F1 per baseline, formatted as "mean%~std%" over the repeated splits;
# columns are the training fractions.
df_tests_macro

Unnamed: 0,0.8,0.6,0.4,0.2
Nearest Neighbors 4,31.7%~7.5%,31.0%~7.3%,32.2%~4.4%,32.0%~5.0%
Linear SVM,25.8%~8.0%,27.9%~6.3%,30.2%~4.1%,27.4%~3.4%
Linear SVM MAGNN,34.2%~5.4%,37.1%~4.9%,36.8%~4.6%,35.0%~3.0%
Decision Tree,35.4%~9.9%,38.5%~6.3%,35.3%~5.2%,31.8%~4.6%
Random Forest,31.6%~9.5%,32.3%~5.5%,32.8%~4.0%,33.2%~4.0%
Neural Net,35.3%~7.8%,29.6%~9.2%,31.8%~9.1%,34.2%~3.2%
AdaBoost,32.8%~9.0%,32.1%~6.4%,30.0%~3.9%,31.8%~5.8%
Naive Bayes,30.0%~8.0%,32.0%~6.3%,33.2%~2.8%,34.6%~4.5%


In [263]:
# Micro-F1 per baseline, formatted as "mean%~std%" over the repeated splits;
# columns are the training fractions.
df_tests_micro

Unnamed: 0,0.8,0.6,0.4,0.2
Nearest Neighbors 4,32.9%~7.1%,32.2%~7.1%,34.1%~4.3%,33.2%~5.1%
Linear SVM,27.1%~8.2%,30.2%~4.1%,34.5%~4.1%,32.3%~2.5%
Linear SVM MAGNN,35.7%~5.1%,37.6%~4.8%,37.3%~4.4%,35.6%~3.1%
Decision Tree,37.1%~9.1%,39.5%~6.1%,36.4%~4.8%,33.5%~3.9%
Random Forest,32.9%~9.3%,34.4%~4.8%,35.5%~3.6%,35.5%~4.0%
Neural Net,40.0%~8.0%,35.1%~4.6%,35.4%~5.5%,35.9%~2.2%
AdaBoost,33.6%~9.1%,32.9%~5.8%,30.7%~4.1%,35.9%~3.4%
Naive Bayes,32.1%~8.5%,32.0%~5.9%,33.7%~3.0%,35.1%~4.4%
