In [175]:
import data_utils as du
import pandas as pd
import networkx as nx
from utils.data import load_PPMI_data

data_dir = du.find_data_dir('app')
dataloader_file = du.get_file_path(data_dir, 'class based structure', 'dataloaders', 'dataloader.p')
dataloader = du.read_from_pickle(dataloader_file)
adjlists, edge_metapath_indices_list, features_list, adjM, type_mask, labels, train_val_test_idx = load_PPMI_data()

train_idx = train_val_test_idx['train_idx']
train_idx = np.sort(train_idx)
test_idx = train_val_test_idx['test_idx']
test_idx = np.sort(test_idx)
val_idx = train_val_test_idx['val_idx']
val_idx = np.sort(val_idx)

In [10]:
dataloader.target_distribution_stats()

Unnamed: 0,Amount,Percentage
-1,91,39.9%
0,70,30.7%
1,67,29.4%
Total,228,100.0%


### Experimentation with ML models

In [12]:
import scipy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.metrics import f1_score, SCORERS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [13]:
h = .02  # step size in the mesh

names = ["Nearest Neighbors 4", "Linear SVM", "Linear SVM MAGNN",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(4),
    SVC(kernel="linear", C=0.025),
    LinearSVC(dual=False),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB()
]

X, y = dataloader.X.iloc[test_idx].values, np.array(dataloader.y.iloc[test_idx]).astype(int)

# X = StandardScaler(with_mean=False).fit_transform(X)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.5, random_state=43)

scores = []

# iterate over classifiers
# for name, clf in zip(names, classifiers):
#     print(name)
#     clf.fit(X_train, y_train)
#     predicted = clf.predict(X_test)
#     f1_macro = f1_score(y_test, predicted, average='macro')
#     f1_micro = f1_score(y_test, predicted, average='micro')
#     scores.append((name, f1_macro, f1_micro))
    
# df = pd.DataFrame(scores, columns=['Name', 'f1_macro', 'f1_micro']).set_index('Name', drop=True).style.format({'f1_macro': "{:.1%}", 'f1_micro': "{:.1%}"})
# df

In [248]:
def clf_test(X, y, clf, test_sizes=(0.2, 0.4, 0.6, 0.8), repeat=10):
    random_states = [182318 + i for i in range(repeat)]
    result_macro_f1_list = []
    result_micro_f1_list = []
    for test_size in test_sizes:
        macro_f1_list = []
        micro_f1_list = []
        for i in range(repeat):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, shuffle=True, random_state=random_states[i])
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            macro_f1 = f1_score(y_test, y_pred, average='macro')
            micro_f1 = f1_score(y_test, y_pred, average='micro')
            macro_f1_list.append(macro_f1)
            micro_f1_list.append(micro_f1)
        result_macro_f1_list.append((np.mean(macro_f1_list), np.std(macro_f1_list)))
        result_micro_f1_list.append((np.mean(micro_f1_list), np.std(micro_f1_list)))
    return result_macro_f1_list, result_micro_f1_list

In [249]:
def format_mean_std(mean_std):
    mean, std = mean_std
    mean_perc = mean*100
    std_perc = std*100
    return f'{mean_perc:.1f}%~{std_perc:.1f}%'

In [250]:
X.shape, y.shape 

((137, 255), (137,))

### Do correct statistical tests with Baselines for good comparison

In [259]:
train_sizes = [0.8, 0.6, 0.4, 0.2]
df_tests_macro = pd.DataFrame([], columns=train_sizes)
df_tests_micro = pd.DataFrame([], columns=train_sizes)

for name, clf in list(zip(names, classifiers)):
    result_macro_f1_list, result_micro_f1_list = clf_test(X, y, clf)
    df_tests_macro = df_tests_macro.append(pd.Series(result_macro_f1_list, index=train_sizes, name='TEST').apply(format_mean_std), ignore_index=True)
    df_tests_micro = df_tests_micro.append(pd.Series(result_micro_f1_list, index=train_sizes, name='TEST').apply(format_mean_std), ignore_index=True)
df_tests_macro.index = names
df_tests_micro.index = names

In [262]:
df_tests_macro

Unnamed: 0,0.8,0.6,0.4,0.2
Nearest Neighbors 4,31.7%~7.5%,31.0%~7.3%,32.2%~4.4%,32.0%~5.0%
Linear SVM,25.8%~8.0%,27.9%~6.3%,30.2%~4.1%,27.4%~3.4%
Linear SVM MAGNN,34.2%~5.4%,37.1%~4.9%,36.8%~4.6%,35.0%~3.0%
Decision Tree,35.4%~9.9%,38.5%~6.3%,35.3%~5.2%,31.8%~4.6%
Random Forest,31.6%~9.5%,32.3%~5.5%,32.8%~4.0%,33.2%~4.0%
Neural Net,35.3%~7.8%,29.6%~9.2%,31.8%~9.1%,34.2%~3.2%
AdaBoost,32.8%~9.0%,32.1%~6.4%,30.0%~3.9%,31.8%~5.8%
Naive Bayes,30.0%~8.0%,32.0%~6.3%,33.2%~2.8%,34.6%~4.5%


In [263]:
df_tests_micro

Unnamed: 0,0.8,0.6,0.4,0.2
Nearest Neighbors 4,32.9%~7.1%,32.2%~7.1%,34.1%~4.3%,33.2%~5.1%
Linear SVM,27.1%~8.2%,30.2%~4.1%,34.5%~4.1%,32.3%~2.5%
Linear SVM MAGNN,35.7%~5.1%,37.6%~4.8%,37.3%~4.4%,35.6%~3.1%
Decision Tree,37.1%~9.1%,39.5%~6.1%,36.4%~4.8%,33.5%~3.9%
Random Forest,32.9%~9.3%,34.4%~4.8%,35.5%~3.6%,35.5%~4.0%
Neural Net,40.0%~8.0%,35.1%~4.6%,35.4%~5.5%,35.9%~2.2%
AdaBoost,33.6%~9.1%,32.9%~5.8%,30.7%~4.1%,35.9%~3.4%
Naive Bayes,32.1%~8.5%,32.0%~5.9%,33.7%~3.0%,35.1%~4.4%


In [172]:
def create_confusion_matrix(y_true, y_predicted):
    classes = [-1, 0, 1]
    true_actual_str = 'True/Actual'
    predicted_str = 'Predicted'

    MultiIndex_Columns = pd.MultiIndex.from_tuples(list(zip([true_actual_str]*3, classes)))
    MultiIndex_Index = pd.MultiIndex.from_tuples(list(zip([predicted_str]*3, classes)))

    df_true_predicted = pd.DataFrame(np.zeros((3,3)), columns = MultiIndex_Columns, index=MultiIndex_Index)
    for truth, prediction in list(zip(y_true, y_predicted)):
        df_true_predicted[true_actual_str, truth][predicted_str, prediction] += 1
    df_true_predicted = df_true_predicted.astype(int)
    return df_true_predicted    

def get_classifier(target, names, classifiers):
    for name, classifier in list(zip(names, classifiers)):
        if name==target:
            return classifier

In [264]:
clf = get_classifier('Decision Tree', names, classifiers)
predicted = clf.predict(X_test)
create_confusion_matrix(y_test, predicted)

Unnamed: 0_level_0,Unnamed: 1_level_0,True/Actual,True/Actual,True/Actual
Unnamed: 0_level_1,Unnamed: 1_level_1,-1,0,1
Predicted,-1,16,7,6
Predicted,0,3,5,2
Predicted,1,9,8,13


In [255]:
from utils.tools import evaluate_results_nc

out_dim = 3
svm_macro_f1_list, svm_micro_f1_list, nmi_mean, nmi_std, ari_mean, ari_std = evaluate_results_nc(
    X, y, num_classes=out_dim) 

SVM test
Macro-F1: 0.342111~0.053907 (0.8), 0.371087~0.049454 (0.6), 0.368230~0.045984 (0.4), 0.350353~0.029978 (0.2)
Micro-F1: 0.357143~0.050508 (0.8), 0.376364~0.048139 (0.6), 0.373494~0.044104 (0.4), 0.356364~0.030641 (0.2)
K-means test
NMI: 0.003305~0.000000
ARI: -0.010166~0.000000


In [31]:
macro_f1_list = []
micro_f1_list = []
for i in range(1000):
    y_random_pred = np.random.randint(low=-1, high=2, size=len(y_test))
    macro_f1_list.append(f1_score(y_test, y_random_pred, average='macro'))
    micro_f1_list.append(f1_score(y_test, y_random_pred, average='micro'))
print(np.mean(macro_f1_list), np.std(macro_f1_list))
print(np.mean(micro_f1_list), np.std(micro_f1_list))
macro_f1_list

0.32791646468572044 0.06869385425159476
0.33369565217391306 0.06898364234478431


[0.4308571696300982,
 0.39716952725082805,
 0.3244809047089358,
 0.35738095238095235,
 0.2619047619047619,
 0.4047619047619048,
 0.271604938271605,
 0.34726576105886453,
 0.2703619909502262,
 0.2871805613741098,
 0.27432126696832576,
 0.35221674876847286,
 0.3280022446689113,
 0.29859943977591036,
 0.36095238095238097,
 0.28062678062678065,
 0.14294750158127767,
 0.36315505891723476,
 0.30484151863462206,
 0.4062397372742201,
 0.34640522875817,
 0.3482758620689655,
 0.26612002376708255,
 0.2613154960981048,
 0.252262443438914,
 0.32169312169312175,
 0.31322751322751324,
 0.30513070077864296,
 0.2630684727458921,
 0.3466583240637746,
 0.32863355780022446,
 0.3647992530345472,
 0.41206529109754914,
 0.2503303303303303,
 0.3059757236227824,
 0.27742027742027736,
 0.3002380952380952,
 0.3052048260381594,
 0.37045791620318874,
 0.37415824915824913,
 0.2837401795735129,
 0.2232422640031336,
 0.3689747868001483,
 0.26153846153846155,
 0.41407168037602826,
 0.2670442325614739,
 0.3065476190476