In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import stumpy
import time
import ast
import json
import seaborn as sns

from sklearn.model_selection import train_test_split

from utils import Utils
utils = Utils()

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score as accuracy

from sklearn.model_selection import cross_validate

#### Methods

In [None]:
def run_evaluation_cv(X, y, fs=True):
    """Cross-validate every estimator in the notebook-level ``models`` list.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix. Columns are indexed positionally when ``fs`` is
        true.
    y : array-like
        Target labels forwarded to ``cross_validate``.
    fs : bool, default True
        When true, ``run_feature_selection2(RFE, model)`` is called for each
        model and only the returned column indices of ``X`` are evaluated.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``F1_train``,
        ``Acc_train``, ``F1_val`` and ``Acc_val`` (means over the 5 folds).

    Notes
    -----
    Reads the notebook globals ``models``, ``RFE``, ``run_feature_selection2``
    and, when ``fs`` is true, ``training_set`` and ``target``.
    ``run_feature_selection2`` itself works on the notebook-level ``X``/``y``,
    not on the arguments passed here.
    NOTE(review): the selector is fit before cross-validation, so validation
    scores are presumably optimistic -- confirm this is acceptable.
    """
    f1_train_list = []
    f1_test_list = []
    acc_train_list = []
    acc_test_list = []

    for model in models:
        if fs:
            features = run_feature_selection2(RFE, model)
            X_selected = X[:, features]
            # The indices refer to columns of X, i.e. to the frame *without*
            # the label column, so the label must be removed before the
            # positional lookup of the column names.
            feature_names = training_set.columns.drop(target)
            print(sorted(feature_names[features]))
        else:
            X_selected = X

        results = cross_validate(model, X_selected, y, cv=5, scoring=['accuracy', 'f1'], return_train_score=True)

        f1_train_list.append(results['train_f1'].mean())
        acc_train_list.append(results['train_accuracy'].mean())
        f1_test_list.append(results['test_f1'].mean())
        acc_test_list.append(results['test_accuracy'].mean())

    return pd.DataFrame({
        "model": models,
        "F1_train": f1_train_list,
        "Acc_train": acc_train_list,
        "F1_val": f1_test_list,
        "Acc_val": acc_test_list,
    })

In [None]:
def evaluate_by_gender(y_true, y_pred, gender_labels):
    """Compute accuracy separately for the two gender groups.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned position-by-position with ``y_true``.
    gender_labels : array-like
        Group code of every sample. Entries equal to 0 form the first group
        and entries equal to 1 the second; other values are ignored.

    Returns
    -------
    tuple
        ``(gender_0_accuracy, gender_1_accuracy)`` as returned by the
        notebook-level ``accuracy`` metric.
    """
    # Convert all three inputs so the masks below index positionally. The
    # predictions previously stayed as passed in, so a list or a pandas
    # Series could not be indexed like the (converted) ground truth.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    gender_labels = np.asarray(gender_labels)

    is_gender_0 = gender_labels == 0
    is_gender_1 = gender_labels == 1

    gender_0_accuracy = accuracy(y_true[is_gender_0], y_pred[is_gender_0])
    gender_1_accuracy = accuracy(y_true[is_gender_1], y_pred[is_gender_1])

    return gender_0_accuracy, gender_1_accuracy
    
    
#trained_model, f1_train, f1_test, acc_train, acc_test = fit_evaluate_model(models[0], X_train, X_test_final, y_train, y_test_final, print_evaluation=False)

In [None]:
def fit_evaluate_model(model, X_train, X_test, y_train, y_test, train_gender, validation_gender, model_name=None, print_evaluation=True):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place.
    X_train, y_train : array-like
        Split used for fitting and for the training scores.
    X_test, y_test : array-like
        Split used for the second set of scores.
    train_gender, validation_gender : array-like
        Gender code per sample of the respective split, forwarded to
        ``evaluate_by_gender``.
    model_name : optional
        Accepted but not used by this function.
    print_evaluation : bool, default True
        When true, a classification report is printed and a confusion-matrix
        heatmap is shown for each split.

    Returns
    -------
    tuple
        ``(model, f1_train, f1_test, acc_train, acc_test, gender_train_acc,
        gender_test_acc)``; the last two are the tuples returned by
        ``evaluate_by_gender``.
    """
    def _show_report(labels, predictions):
        # Text report first, then the confusion matrix as an annotated heatmap.
        print(classification_report(labels,predictions))
        matrix = confusion_matrix(labels,predictions)
        sns.heatmap(matrix, annot=True, fmt='d')
        plt.show()

    model.fit(X_train, y_train)

    # --- training split ---
    train_pred = model.predict(X_train)
    f1_train = f1_score(y_train, train_pred)
    acc_train = accuracy(y_train, train_pred)
    if print_evaluation:
        _show_report(y_train, train_pred)
    gender_train_acc = evaluate_by_gender(y_train, train_pred, train_gender)

    # --- held-out split ---
    test_pred = model.predict(X_test)
    f1_test = f1_score(y_test, test_pred)
    acc_test = accuracy(y_test, test_pred)
    if print_evaluation:
        _show_report(y_test, test_pred)
    gender_test_acc = evaluate_by_gender(y_test, test_pred, validation_gender)

    return model, f1_train, f1_test, acc_train, acc_test, gender_train_acc, gender_test_acc

In [None]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Hyperparameters obtained after hyperparameter tuning.
# The decision tree and the MLP use randomness while fitting, so they are
# seeded with the same random_state as the random forest; otherwise a fresh
# "Restart & Run All" would not reproduce the reported scores.
models = [
    SVC(kernel='linear', C=0.05),
    LinearSVC(penalty="l1", dual=False),
    DecisionTreeClassifier(max_depth=3, criterion="gini", random_state=2),
    RandomForestClassifier(n_estimators=5, max_depth=3, min_samples_leaf=2, min_samples_split=4, random_state=2),
    LogisticRegression(C=0.7),
    MLPClassifier(hidden_layer_sizes=(5, 10, 5), random_state=2),
]

In [None]:
from sklearn.feature_selection import RFE

def run_feature_selection2(selector_class, model, estimator=None):
    """Pick the feature-subset size with the best cross-validated F1 score.

    For every subset size from 3 up to (but excluding) the number of columns
    of the notebook-level ``X``, a selector is fit on ``X``/``y`` and ``model``
    is scored with 5-fold ``cross_validate`` on the selected columns. The
    subset with the highest mean validation F1 is kept. Both F1 curves are
    plotted and a short summary is printed.

    Parameters
    ----------
    selector_class : callable
        Called as ``selector_class(est, n_features_to_select=i, step=1)``; the
        result must provide ``fit`` and ``get_support`` (e.g. ``RFE``).
    model : estimator
        Estimator scored by ``cross_validate``; also handed to the selector
        when ``estimator`` is None.
    estimator : estimator, optional
        Estimator handed to the selector instead of ``model``.

    Returns
    -------
    numpy.ndarray
        Column indices of ``X`` belonging to the best subset. If no subset
        reaches a validation F1 above 0 (or ``X`` has too few columns for the
        search to run), all column indices are returned.

    Notes
    -----
    Reads the notebook globals ``X`` and ``y`` and sets
    ``plt.rcParams["figure.figsize"]`` to 10x5.
    NOTE(review): the selector sees all of ``X``/``y`` before the data is
    split into folds, so the validation F1 is presumably optimistic -- confirm.
    """
    min_features = 3
    n_features = X.shape[1]
    selector_estimator = model if estimator is None else estimator

    subset_sizes = []
    f1s = []
    f1s_train = []
    best_chosen = None
    f1_max = 0.0
    f1_train_ = None

    for i in range(min_features, n_features):
        selector = selector_class(selector_estimator, n_features_to_select=i, step=1)
        selector = selector.fit(X, y)
        chosen = selector.get_support()
        X_selected = X[:, chosen]

        results = cross_validate(model, X_selected, y, cv=5, scoring=['accuracy', 'f1'], return_train_score=True)
        f1_train = results['train_f1'].mean()
        f1 = results['test_f1'].mean()

        subset_sizes.append(i)
        f1s_train.append(f1_train)
        f1s.append(f1)

        if f1 > f1_max:
            f1_max = f1
            f1_train_ = f1_train
            best_chosen = chosen

        # Stop early once the model clearly memorises the training folds.
        if f1_train > 0.95 and f1_train - f1 > 0.4:
            print("Overfitting..")
            break

    if best_chosen is None:
        # Nothing beat the initial F1 of 0.0. An empty selection would make
        # the caller cross-validate on zero columns, so keep every feature.
        print("No feature subset selected; keeping all features")
        best_chosen = np.ones(n_features, dtype=bool)

    plt.rcParams["figure.figsize"] = [10, 5]

    # Plot against the subset sizes actually evaluated (they start at
    # min_features, not at 1) so the x-axis matches its label.
    plt.plot(subset_sizes, f1s, label="Validation set")
    plt.plot(subset_sizes, f1s_train, label="Training set")
    plt.xlabel("Number of features")
    plt.ylabel("F1 score")
    plt.legend()
    plt.show()

    print("--------------")
    chosen_indexes = np.arange(n_features)[best_chosen]
    print(chosen_indexes)
    print(len(chosen_indexes))
    print("Training:", f1_train_)
    print("Testing:", f1_max)

    return chosen_indexes

In [None]:
def read_datasets(band, path="./feature_matrices/"):
    """Load the train, validation and test CSV files of one band.

    Parameters
    ----------
    band : str
        Band name inserted into the file names, e.g. ``"alpha"``.
    path : str, default "./feature_matrices/"
        Directory holding ``motifs_<band>_train.csv``,
        ``motifs_<band>_val.csv`` and ``motifs_<band>_test.csv``. A trailing
        path separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_set, validation_set, test_set)``.
    """
    import os  # local import so this cell does not depend on the import cell

    def _load(split):
        # os.path.join inserts the separator only when ``path`` lacks one, so
        # "./feature_matrices" and "./feature_matrices/" both work.
        return pd.read_csv(os.path.join(path, "motifs_{}_{}.csv".format(band, split)))

    training_set = _load("train")
    validation_set = _load("val")
    test_set = _load("test")
    return training_set, validation_set, test_set

#### Main

In [None]:
# Best-scoring model row of every band. Created once, before the loop, so the
# rows accumulate across bands instead of being reset on each iteration.
best_f1s = []

for band in ["alpha", "beta", "theta"]:

    training_set, validation_set, testing_set = read_datasets(band, path="./feature_matrices/")
    target = "label"
    y_train = training_set[target]
    X_train = np.array(training_set.drop(target, axis=1))
    y_val = validation_set[target]
    X_val = np.array(validation_set.drop(target, axis=1))
    X_test = np.array(testing_set.drop(target, axis=1))
    y_test = testing_set[target]

    # Pool the three splits; the models are compared by 5-fold
    # cross-validation over the pooled data. X, y and training_set are also
    # read as globals by run_feature_selection2 / run_evaluation_cv.
    X = np.concatenate((X_train, X_val, X_test), axis=0)
    y = np.concatenate((y_train, y_val, y_test))

    print("CV:")
    result_df = run_evaluation_cv(X, y)
    # idxmax returns an index *label*, so the row is fetched with .loc.
    ind_max = result_df['F1_val'].idxmax()

    display(result_df)
    best_row = result_df.loc[ind_max]
    best_f1s.append(best_row)
    print(best_row)


    print("===============================================================")