In [1]:
import itertools
import os.path
import shutil
import time

import numpy as np
import pandas as pd
from datetime import datetime

from joblib import dump
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Maps the label strings found in train_labels.csv to integer class indices.
# The index order must stay in sync with the category array in
# prediction_2_csv, which converts indices back to these strings.
# NOTE(review): presumably the four arousal/valence quadrants — confirm.
label_dict = {'HAHV': 0, 'HALV': 1, 'LALV': 2, 'LAHV': 3}


def parse_id_signals(path, id):
    """
    Read the stored signal arrays of one sample.

    For each signal name in ('BVP', 'EDA', 'EEG_ftrs') the file
    ``<path>/<id>_<signal>.npy`` is loaded with ``np.load``.

    :param path: directory that contains the ``.npy`` files
    :param id: sample identifier used as the file-name prefix
    :return: dict mapping each signal name to its loaded array
    """
    signal_names = ('BVP', 'EDA', 'EEG_ftrs')
    return {
        name: np.load(os.path.join(path, '{}_{}.npy'.format(id, name)))
        for name in signal_names
    }


def load_data(path, merge_val_test=True):
    """
    Loading data for using with traditional machine learning methods

    For each split in ('train', 'val', 'test') this reads
    ``<path>/<split>_ids.txt``, ``<path>/<split>_personality.csv`` and, for
    every id, the signal files under ``<path>/<split>/``. Training labels come
    from ``<path>/train_labels.csv`` and are mapped to integers through
    ``label_dict``; the other splits get the placeholder label ``-1``.

    :param path: root directory of the preprocessed dataset
    :param merge_val_test: when True (the default, and the previously
        hard-coded behaviour) the 'val' arrays are concatenated in front of
        the 'test' arrays and the 'val' entry is removed from the result
    :return: dict mapping split name to a dict with the keys 'id', 'BVP',
        'EDA', 'EEG_ftrs', 'personality' and 'label', each a numpy array with
        one entry per id
    :raises AssertionError: if the ids in a CSV file (sorted by 'Id') do not
        match the ids file of the same split
    """

    data_loaders = {}
    for dt in ['train', 'val', 'test']:
        # loadtxt yields a 0-d array for a one-line file; keep it indexable.
        list_ids = np.atleast_1d(
            np.loadtxt(os.path.join(path, '{}_ids.txt'.format(dt)), dtype=str))

        if dt == 'train':
            list_labels = pd.read_csv(os.path.join(path, 'train_labels.csv')).sort_values('Id')
            # array_equal also rejects differing lengths, which an
            # element-wise `!=` could hide through broadcasting.
            assert np.array_equal(list_labels['Id'].values, list_ids), \
                'Ids in train_labels.csv do not match train_ids.txt'
            # Labels are taken from the second CSV column, by position.
            labels_data = list_labels.values[:, 1]
        else:
            # No ground truth for val/test: use -1 as placeholder.
            labels_data = np.full(len(list_ids), -1.0)

        personality = pd.read_csv(
            os.path.join(path, '{}_personality.csv'.format(dt))).sort_values('Id')
        assert np.array_equal(personality['Id'].values, list_ids), \
            'Ids in {}_personality.csv do not match {}_ids.txt'.format(dt, dt)
        personality_data = personality.values

        split_dir = os.path.join(path, dt)
        collected = {'id': []}
        for idx, cur_id in enumerate(list_ids):
            ret = parse_id_signals(split_dir, cur_id)
            # Column 0 is the Id; the remaining columns are the features.
            ret['personality'] = personality_data[idx, 1:]
            if dt == 'train':
                ret['label'] = label_dict[labels_data[idx]]
            else:
                ret['label'] = labels_data[idx]

            collected['id'].append(cur_id)
            for fky, value in ret.items():
                collected.setdefault(fky, []).append(value)

        data_loaders[dt] = {fky: np.array(values) for fky, values in collected.items()}

    if merge_val_test:
        val_split = data_loaders.pop('val')
        test_split = data_loaders['test']
        for ky in test_split:
            test_split[ky] = np.concatenate([val_split[ky], test_split[ky]])

    return data_loaders


def gridsearch_clf(x_train, y_train, model_name='svm', random_state=None):
    """
    Run a cross-validated grid search for one of the supported classifiers.

    Supported values of ``model_name``:

    * ``'svm'``      - StandardScaler -> PCA -> RBF SVC pipeline
    * ``'rf'``       - RandomForestClassifier
    * ``'adaboost'`` - RandomUnderSampler -> AdaBoostClassifier on decision trees

    Part of each parameter grid is drawn at random (PCA sizes, tree counts,
    seeds). Duplicate draws are removed so that identical settings are not
    fitted several times.

    :param x_train: 2-D feature matrix; ``x_train.shape[1]`` is used to size
        the PCA search range
    :param y_train: training labels
    :param model_name: one of 'svm', 'rf', 'adaboost'
    :param random_state: optional seed for the randomly drawn grid values.
        ``None`` (default) keeps the previous behaviour of drawing from the
        global ``np.random`` state.
    :return: the fitted ``GridSearchCV`` object (refit on micro-F1)
    :raises ValueError: if ``model_name`` is not supported
    """
    if model_name not in ('svm', 'rf', 'adaboost'):
        raise ValueError('Unknown estimator {}'.format(model_name))

    print('-------------------\nGridsearch CV for {}'.format(model_name))
    stt = time.time()
    # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=9999)
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=999999)

    # Use the global numpy RNG unless the caller asked for a reproducible grid.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def _draw(low, high, size):
        # Unique values only: repeated candidates would just repeat the same fits.
        return np.unique(rng.randint(low, high, size=size))

    if model_name == 'svm':
        estimator = make_pipeline(StandardScaler(), PCA(),
                                  SVC(decision_function_shape='ovr', class_weight='balanced', random_state=9999, probability=True))
        # parameters = {'svc__kernel': ['linear', 'rbf'], 'svc__C': [1, 10, 100], 'svc__gamma': [1e-3, 1e-4, 1e-2]}
        num_feats = x_train.shape[1]
        # Clamp the range so small feature counts neither ask for 0
        # components nor produce an empty randint interval.
        min_comp = max(1, num_feats // 5)
        max_comp = max(min_comp + 1, num_feats * 4 // 5)
        parameters = [{'svc__kernel': ['rbf'], 'svc__C': np.logspace(-3, 5, num=10, endpoint=True, base=2.0),
                       'svc__gamma': np.logspace(-7, 3, num=10, endpoint=True, base=2.0),
                       'pca__n_components': _draw(min_comp, max_comp, 10),
                       'pca__whiten': [True, False], 'pca__random_state': [9999]},
                      ]
    elif model_name == 'rf':
        estimator = RandomForestClassifier(n_jobs=4, oob_score=True, warm_start=False)
        # estimator = make_pipeline_with_sampler(RandomUnderSampler(), RandomForestClassifier(n_jobs=4))

        parameters = {'n_estimators': _draw(30, 100, 30),
                      'max_features': ['sqrt', 'log2'],
                      'criterion': ['gini', 'entropy'],
                      'random_state': _draw(0, 100, 10),
                      'class_weight': ['balanced', 'balanced_subsample']}

    else:  # model_name == 'adaboost'
        DTC = DecisionTreeClassifier(class_weight='balanced', )
        # estimator = AdaBoostClassifier(base_estimator=DTC)
        # NOTE(review): scikit-learn 1.2 renamed `base_estimator` to
        # `estimator`; newer versions need the keyword and the
        # `base_estimator__*` grid keys below updated together.
        estimator = make_pipeline_with_sampler(RandomUnderSampler(), AdaBoostClassifier(base_estimator=DTC))

        parameters = {'randomundersampler__random_state': _draw(0, 100, 10),
                      'adaboostclassifier__n_estimators': _draw(20, 50, 5),
                      'adaboostclassifier__learning_rate': np.arange(1e-2, 2e-1, step=0.02),
                      'adaboostclassifier__random_state': _draw(0, 100, 10),
                      'adaboostclassifier__base_estimator__random_state': _draw(0, 100, 7),
                      "adaboostclassifier__base_estimator__criterion": ["gini", "entropy"],
                      "adaboostclassifier__base_estimator__splitter": ["best", "random"], }

    clf = GridSearchCV(estimator=estimator, param_grid=parameters,
                       scoring={'f1': 'f1_micro', 'acc': 'accuracy'}, n_jobs=-1, cv=cv,
                       refit='f1')

    clf.fit(x_train, y_train)
    print('Best score using {}: {}'.format(model_name, clf.best_score_))
    print('Total time: {:5f} second'.format(time.time()-stt))
    return clf


def select_feat_type(data_loaders, feat_list=('EEG_ftrs',)):
    """
    Build the feature matrix for a chosen subset of feature groups.

    For the 'train' and 'test' splits the arrays stored under each name in
    ``feat_list`` are concatenated along the last axis.

    :param data_loaders: dict with 'train' and 'test' entries, each a dict
        holding one array per feature name plus 'id' (and 'label' for train)
    :param feat_list: feature names to concatenate, in order. A single name
        may be passed as a plain string. The default is 'EEG_ftrs', the key
        under which the EEG features are actually stored (the former default
        'eeg_ftrs' did not match any key and raised KeyError).
    :return: dict with 'train' -> {'feat', 'label', 'id'} and
        'test' -> {'feat', 'id'}
    :raises KeyError: if a requested feature name is missing from a split
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(feat_list, str):
        feat_list = (feat_list,)

    ret_data = {}
    for dt in ('train', 'test'):
        split = data_loaders[dt]
        selected = {'feat': np.concatenate([split[feat] for feat in feat_list], axis=-1)}
        if dt == 'train':
            selected['label'] = split['label']
        selected['id'] = split['id']
        ret_data[dt] = selected

    return ret_data


def prediction_2_csv(prediction, id, save_name):
    """
    Write predicted class indices to a two-column CSV file.

    :param prediction: integer class indices (0..3) used to index the
        category names 'HAHV', 'HALV', 'LALV', 'LAHV'
    :param id: sample identifiers, one per prediction
    :param save_name: path of the CSV file to write (columns 'Id', 'Predicted')
    """
    category_names = np.array(['HAHV', 'HALV', 'LALV', 'LAHV'])
    predicted_names = category_names[prediction]
    # Pair each id with its predicted category name, one row per sample.
    rows = np.stack([id, predicted_names], axis=-1)
    frame = pd.DataFrame(rows, columns=['Id', 'Predicted'])
    frame.to_csv(save_name, index=False)

In [2]:
# --- Run configuration -------------------------------------------------------
LOG_ROOT = '/home/nghia/PR_LAB/KERC_Chall/kerc2021/KERC21Baseline/KERC21Baseline/train_logs'
DATA_ROOT = '/home/nghia/PR_LAB/KERC_Chall/kerc2021/KERC21Dataset/KERC21Dataset/preprocessed'
# Models whose best cross-validated micro-F1 is below this value are left out
# of the probability-averaging ensemble.
MIN_CV_SCORE = 0.55


def _fit_and_record(model_name, tag, x_train, y_train, x_test, test_ids, postfix, save_path):
    """Grid-search one model, then save the estimator, its test predictions and a log entry.

    :param model_name: name passed to gridsearch_clf and used in file names ('svm', 'rf')
    :param tag: label written to logs.txt for this model ('SVM', 'RF')
    :param postfix: feature-subset name appended to the output file names
    :return: tuple (fitted GridSearchCV, predicted test labels)
    """
    clf = gridsearch_clf(x_train, y_train, model_name=model_name)
    dump(clf, os.path.join(save_path, 'clf_{}_{}.joblib'.format(model_name, postfix)))
    y_pred = clf.predict(x_test)
    prediction_2_csv(y_pred, test_ids,
                     os.path.join(save_path, 'test_pred_{}_{}.csv'.format(model_name, postfix)))

    with open(os.path.join(save_path, 'logs.txt'), 'a+') as fd:
        fd.write('-----------------------------------------------\n')
        fd.write('{} {}: {}\n'.format(tag, postfix, clf.best_score_))
        fd.write(str(clf.best_params_))
        # Terminate every entry so the next separator starts on its own line.
        fd.write('\n')
    print(postfix, '\n', clf.best_params_)
    return clf, y_pred


now = datetime.now()
save_folder = now.strftime("%m-%d-%Y_%H-%M-%S")
# save_folder = 'tmp'
save_path = os.path.join(LOG_ROOT, save_folder)

# save_path already ends with save_folder; do not join it a second time.
print('Save folder: ', save_path)

# makedirs creates LOG_ROOT as well when it is missing.
os.makedirs(save_path, exist_ok=True)
# Keep a snapshot of the script next to the results when it is available
# (it is not when this code is run purely as a notebook).
if os.path.isfile('trad.py'):
    shutil.copy2('trad.py', save_path)

dataset = load_data(DATA_ROOT)

full_feat_list = ['EEG_ftrs', 'BVP', 'EDA', 'personality']

list_train_probas = []
list_test_probas = []
test_id = None

# Train on every non-empty combination of the feature groups.
for num_comb_feat in range(1, len(full_feat_list) + 1):
    for subset in itertools.combinations(full_feat_list, num_comb_feat):
        dataset_sel_feat = select_feat_type(dataset, feat_list=subset)
        x_train, y_train = dataset_sel_feat['train']['feat'], dataset_sel_feat['train']['label']
        x_test = dataset_sel_feat['test']['feat']
        test_id = dataset_sel_feat['test']['id']
        postfix = '_'.join(subset)

        if not (num_comb_feat == 1 and 'personality' in subset):
            # Use SVM in all the case except when only personality
            clf_svm, y_test_svm = _fit_and_record('svm', 'SVM', x_train, y_train, x_test,
                                                  test_id, postfix, save_path)
            if clf_svm.best_score_ >= MIN_CV_SCORE:
                list_train_probas.append(clf_svm.predict_proba(x_train))
                list_test_probas.append(clf_svm.predict_proba(x_test))

        if 'EEG_ftrs' in subset:
            # Use random forest whenever EEG_ftrs in list of features to be used
            clf_rf, y_test_rf = _fit_and_record('rf', 'RF', x_train, y_train, x_test,
                                                test_id, postfix, save_path)
            if clf_rf.best_score_ >= MIN_CV_SCORE:
                list_train_probas.append(clf_rf.predict_proba(x_train))
                list_test_probas.append(clf_rf.predict_proba(x_test))

Save folder:  /home/nghia/PR_LAB/KERC_Chall/kerc2021/KERC21Baseline/KERC21Baseline/train_logs/11-02-2021_00-05-22/11-02-2021_00-05-22
-------------------
Gridsearch CV for svm
Best score using svm: 0.6036231884057971
Total time: 248.340766 second
EEG_ftrs 
 {'pca__n_components': 229, 'pca__random_state': 9999, 'pca__whiten': False, 'svc__C': 9.332232316608927, 'svc__gamma': 0.0078125, 'svc__kernel': 'rbf'}
-------------------
Gridsearch CV for rf
Best score using rf: 0.5710144927536231
Total time: 431.304775 second
EEG_ftrs 
 {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 99, 'random_state': 13}
-------------------
Gridsearch CV for svm
Best score using svm: 0.44999999999999996
Total time: 100.509004 second
BVP 
 {'pca__n_components': 11, 'pca__random_state': 9999, 'pca__whiten': True, 'svc__C': 2.7215800003487542, 'svc__gamma': 0.7937005259840999, 'svc__kernel': 'rbf'}
-------------------
Gridsearch CV for svm
Best score using sv

In [3]:
# Debug check: shows the last value of the outer loop variable from the training cell.
num_comb_feat


4

In [8]:
# Scratch cell for inspecting variables left over from the training loop.
# Only the last bare expression of a cell is displayed, so just `x_test` is
# shown here; the `x_train` line has no visible effect.
#subset
x_train
#y_train
x_test
#test_id
#postfix

In [5]:
# Soft-voting ensemble on the training set: average the class probabilities
# of every retained model and take the most probable class per sample.
print('Number of models to be averaging: ', len(list_train_probas))
# Rebinds the name to one (n_models, n_samples, n_classes) array.
list_train_probas = np.stack(list_train_probas)
avg_train_probas = list_train_probas.mean(axis=0)
avg_train_preds = avg_train_probas.argmax(axis=1)
train_f1 = f1_score(y_true=y_train, y_pred=avg_train_preds, average='micro')
print('Avg train F1-score: ', train_f1)

Number of models to be averaging:  16
Avg train F1-score:  1.0


In [7]:
# Displays the per-model training probabilities (a stacked array once the
# averaging cell has run). NOTE(review): consider `.shape` or a small slice
# to keep the saved output short.
list_train_probas

array([[[0.07073006, 0.10122745, 0.76477537, 0.06326713],
        [0.02269246, 0.80163304, 0.12269835, 0.05297615],
        [0.0417527 , 0.81032509, 0.10983912, 0.03808309],
        ...,
        [0.02477591, 0.79581555, 0.11864011, 0.06076843],
        [0.03248794, 0.12136428, 0.78518188, 0.0609659 ],
        [0.01057916, 0.80624254, 0.12066934, 0.06250896]],

       [[0.02020202, 0.2020202 , 0.74747475, 0.03030303],
        [0.        , 0.80808081, 0.17171717, 0.02020202],
        [0.03030303, 0.77777778, 0.14141414, 0.05050505],
        ...,
        [0.        , 0.81818182, 0.16161616, 0.02020202],
        [0.01010101, 0.11111111, 0.78787879, 0.09090909],
        [0.01010101, 0.75757576, 0.17171717, 0.06060606]],

       [[0.05330305, 0.10536141, 0.78401518, 0.05732036],
        [0.01873793, 0.8148416 , 0.11717211, 0.04924836],
        [0.05208481, 0.82014341, 0.09929172, 0.02848006],
        ...,
        [0.02124357, 0.81108496, 0.115092  , 0.05257948],
        [0.03200564, 0.108613

In [9]:
# Soft-voting ensemble on the test set: average the class probabilities of
# every retained model, pick the most probable class and export it as CSV.
list_test_probas = np.stack(list_test_probas)
avg_test_probas = list_test_probas.mean(axis=0)
avg_test_preds = avg_test_probas.argmax(axis=1)
avg_pred_file = os.path.join(save_path, 'test_pred_avg_{}.csv'.format('all_models'))
prediction_2_csv(avg_test_preds, test_id, avg_pred_file)
