In [175]:
import networkx as nx
import numpy as np
import pandas as pd

import data_utils as du
from utils.data import load_PPMI_data

# Locate and load the pre-built dataloader object.
data_dir = du.find_data_dir('app')
dataloader_file = du.get_file_path(data_dir, 'class based structure', 'dataloaders', 'dataloader.p')
# NOTE(security): presumably this unpickles the file; unpickling can execute
# arbitrary code, so only load dataloader files from a trusted source.
dataloader = du.read_from_pickle(dataloader_file)

# Graph / feature artefacts returned by the project loader.
(adjlists, edge_metapath_indices_list, features_list, adjM, type_mask, labels,
 train_val_test_idx) = load_PPMI_data()

# Sorted copies of the index arrays for each split (np.sort does not modify its input).
train_idx = np.sort(train_val_test_idx['train_idx'])
test_idx = np.sort(train_val_test_idx['test_idx'])
val_idx = np.sort(train_val_test_idx['val_idx'])

In [176]:
# Display the target-label distribution (the rendered output lists amount and percentage per class).
dataloader.target_distribution_stats()

Unnamed: 0,Amount,Percentage
-1,91,39.9%
0,70,30.7%
1,67,29.4%
Total,228,100.0%


### Experimentation with ML models

In [177]:
import scipy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.metrics import f1_score, SCORERS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [178]:
h = .02  # step size in the mesh

# One seed shared by the data split and every stochastic estimator, so that
# re-running this cell reproduces the same scores.
RANDOM_STATE = 43

names = ["Nearest Neighbors 2", "Nearest Neighbors 4", "Linear SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(2),
    KNeighborsClassifier(4),
    SVC(kernel="linear", C=0.025),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=RANDOM_STATE),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB()
]

# `names` and `classifiers` are parallel lists; zip() would silently drop the
# tail of the longer one, so fail loudly if they ever drift apart.
assert len(names) == len(classifiers), "names and classifiers must have the same length"

# NOTE(review): only the rows selected by test_idx are used as the data set
# here -- confirm this restriction is intended.
X, y = dataloader.X.iloc[test_idx].values, np.array(dataloader.y.iloc[test_idx]).astype(int)

# X = StandardScaler(with_mean=False).fit_transform(X)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.5, random_state=RANDOM_STATE)

scores = []

# Fit every classifier on the training half and score it on the held-out half.
for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    f1_macro = f1_score(y_test, predicted, average='macro')
    f1_micro = f1_score(y_test, predicted, average='micro')
    scores.append((name, f1_macro, f1_micro))

# Note: `df` is a pandas Styler (percent-formatted view), not a plain DataFrame.
df = (
    pd.DataFrame(scores, columns=['Name', 'f1_macro', 'f1_micro'])
    .set_index('Name', drop=True)
    .style.format({'f1_macro': "{:.1%}", 'f1_micro': "{:.1%}"})
)
df

Nearest Neighbors 2
Nearest Neighbors 4
Linear SVM
Gaussian Process
Decision Tree
Random Forest
Neural Net
AdaBoost
Naive Bayes


Unnamed: 0_level_0,f1_macro,f1_micro
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Nearest Neighbors 2,31.1%,36.2%
Nearest Neighbors 4,34.7%,34.8%
Linear SVM,29.5%,31.9%
Gaussian Process,26.9%,34.8%
Decision Tree,42.0%,44.9%
Random Forest,32.4%,33.3%
Neural Net,29.0%,33.3%
AdaBoost,36.2%,36.2%
Naive Bayes,31.5%,31.9%


In [182]:
def clf_test(X, y, clf, test_sizes=(0.2, 0.4, 0.6, 0.8), repeat=10):
    """Score a classifier over repeated random train/test splits.

    For every fraction in ``test_sizes`` the data is split ``repeat`` times
    (seeds ``182318 .. 182318 + repeat - 1``), a fresh copy of ``clf`` is
    fitted on the training part and macro-/micro-F1 are computed on the test
    part.

    Parameters
    ----------
    X, y : features and labels, passed straight to ``train_test_split``.
    clf : estimator exposing ``fit`` and ``predict``. It is never fitted
        itself; each repetition trains a clone, so the caller's object keeps
        whatever fitted state it had before the call.
    test_sizes : iterable of test fractions to evaluate.
    repeat : number of random splits per test fraction.

    Returns
    -------
    (result_macro_f1_list, result_micro_f1_list)
        Two lists with one ``(mean, std)`` tuple per entry of ``test_sizes``.
    """
    # Local import: `clone` is not among the notebook's top-level imports.
    from sklearn.base import clone

    random_states = [182318 + i for i in range(repeat)]
    result_macro_f1_list = []
    result_micro_f1_list = []
    for test_size in test_sizes:
        macro_f1_list = []
        micro_f1_list = []
        for random_state in random_states:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, shuffle=True, random_state=random_state)
            # Fit a clone so the estimator handed in by the caller is not
            # silently retrained on a different split. safe=False falls back
            # to a deep copy for objects that are not scikit-learn estimators.
            model = clone(clf, safe=False)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            macro_f1_list.append(f1_score(y_test, y_pred, average='macro'))
            micro_f1_list.append(f1_score(y_test, y_pred, average='micro'))
        result_macro_f1_list.append((np.mean(macro_f1_list), np.std(macro_f1_list)))
        result_micro_f1_list.append((np.mean(micro_f1_list), np.std(micro_f1_list)))
    return result_macro_f1_list, result_micro_f1_list

In [218]:
def format_mean_std(mean_std):
    """Render a ``(mean, std)`` pair of fractions as ``'<mean> % ~ <std> %'``.

    Both values are scaled to percentages and shown with one decimal place.
    """
    mean_perc, std_perc = (100 * value for value in mean_std)
    return f'{mean_perc:.1f} % ~ {std_perc:.1f} %'

### Run proper statistical tests against the baseline models for a fair comparison

In [234]:
# Column labels: training fractions, in the same order as clf_test's default
# test_sizes (0.2, 0.4, 0.6, 0.8), i.e. train fraction = 1 - test fraction.
train_sizes = [0.8, 0.6, 0.4, 0.2]

# Collect one row of formatted "mean ~ std" macro-F1 strings per classifier,
# then build the frame once. DataFrame.append was removed in pandas 2.0, and
# growing a frame row by row is quadratic anyway.
formatted_rows = []
for name, clf in zip(names, classifiers):
    result_macro_f1_list, result_micro_f1_list = clf_test(X, y, clf)
    formatted_rows.append([format_mean_std(mean_std) for mean_std in result_macro_f1_list])

df_tests = pd.DataFrame(formatted_rows, index=names, columns=train_sizes)

In [235]:
df_tests

Unnamed: 0,0.8,0.6,0.4,0.2
Nearest Neighbors 2,27.3 % ~ 5.7 %,27.0 % ~ 3.9 %,30.1 % ~ 4.9 %,30.6 % ~ 3.0 %
Nearest Neighbors 4,31.7 % ~ 7.5 %,31.0 % ~ 7.3 %,32.2 % ~ 4.4 %,32.0 % ~ 5.0 %
Linear SVM,25.8 % ~ 8.0 %,27.9 % ~ 6.3 %,30.2 % ~ 4.1 %,27.4 % ~ 3.4 %
Gaussian Process,17.5 % ~ 8.0 %,20.3 % ~ 4.9 %,24.2 % ~ 4.6 %,25.4 % ~ 5.0 %
Decision Tree,36.3 % ~ 9.0 %,36.7 % ~ 8.8 %,35.2 % ~ 5.9 %,31.4 % ~ 4.7 %
Random Forest,29.5 % ~ 12.6 %,31.3 % ~ 7.6 %,35.2 % ~ 5.6 %,31.8 % ~ 2.5 %
Neural Net,26.6 % ~ 11.3 %,34.6 % ~ 7.7 %,34.8 % ~ 4.6 %,32.2 % ~ 5.2 %
AdaBoost,32.8 % ~ 9.0 %,31.9 % ~ 6.4 %,30.1 % ~ 4.1 %,33.9 % ~ 6.5 %
Naive Bayes,30.0 % ~ 8.0 %,32.0 % ~ 6.3 %,33.2 % ~ 2.8 %,34.6 % ~ 4.5 %


In [172]:
def create_confusion_matrix(y_true, y_predicted, classes=(-1, 0, 1)):
    """Count (true label, predicted label) pairs into a labelled table.

    Parameters
    ----------
    y_true, y_predicted : iterables of labels, paired element-wise (pairing
        stops at the shorter one).
    classes : ordered labels to tabulate; defaults to ``(-1, 0, 1)``.

    Returns
    -------
    pandas.DataFrame
        Integer counts. Rows are indexed by ``('Predicted', label)`` and
        columns by ``('True/Actual', label)``.

    Raises
    ------
    KeyError
        If a label occurs that is not listed in ``classes``.
    """
    classes = list(classes)
    true_actual_str = 'True/Actual'
    predicted_str = 'Predicted'

    columns = pd.MultiIndex.from_product([[true_actual_str], classes])
    index = pd.MultiIndex.from_product([[predicted_str], classes])

    # Accumulate in a plain array instead of chained DataFrame indexing
    # (df[col][row] += 1), which does not write back under pandas Copy-on-Write.
    position = {label: i for i, label in enumerate(classes)}
    counts = np.zeros((len(classes), len(classes)), dtype=int)
    for truth, prediction in zip(y_true, y_predicted):
        counts[position[prediction], position[truth]] += 1

    return pd.DataFrame(counts, index=index, columns=columns)

def get_classifier(target, names, classifiers):
    """Return the classifier paired with the first name equal to ``target``.

    ``names`` and ``classifiers`` are walked in parallel; ``None`` is returned
    when no name matches.
    """
    matches = (candidate for label, candidate in zip(names, classifiers) if label == target)
    return next(matches, None)

In [173]:
# Look up the estimator registered under the name 'Neural Net'.
# NOTE(review): no fit happens in this cell -- the result depends on whichever
# earlier cell last fitted this estimator object; confirm that is the intended model.
clf = get_classifier('Neural Net', names, classifiers)
# Predict on the held-out X_test and tabulate predictions against y_test.
predicted = clf.predict(X_test)
create_confusion_matrix(y_test, predicted)

Unnamed: 0_level_0,Unnamed: 1_level_0,True/Actual,True/Actual,True/Actual
Unnamed: 0_level_1,Unnamed: 1_level_1,-1,0,1
Predicted,-1,0,0,0
Predicted,0,4,4,3
Predicted,1,8,8,8


In [174]:
from utils.tools import evaluate_results_nc

# Number of classes passed to the evaluator (presumably the three labels -1, 0, 1 -- TODO confirm).
out_dim = 3
# Project helper; judging by its printed output it reports SVM macro/micro-F1
# and K-means NMI/ARI -- verify against utils.tools.
svm_macro_f1_list, svm_micro_f1_list, nmi_mean, nmi_std, ari_mean, ari_std = evaluate_results_nc(
    X, y, num_classes=out_dim) 

SVM test
Macro-F1: 0.391525~0.104323 (0.8), 0.379194~0.091439 (0.6), 0.366456~0.056654 (0.4), 0.296443~0.048852 (0.2)
Micro-F1: 0.428571~0.101015 (0.8), 0.392857~0.091752 (0.6), 0.385714~0.054085 (0.4), 0.328571~0.057143 (0.2)
K-means test
NMI: 0.025854~0.001376
ARI: -0.013417~0.002186
