In [2]:
import glob
import time

import numpy as np
import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.model_selection import GridSearchCV, train_test_split, KFold

import keras
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

In [3]:
# Paths of the intermediate prediction files, one list per data split.
training_src, test_src, validation_src = [], [], []

In [14]:
# Start from empty lists so that re-running this cell does not append the
# same files a second time.
training_src, validation_src, test_src = [], [], []

# glob gives no ordering guarantee, but the files are later paired by list
# index (training_src[i] with test_src[i] / validation_src[i]); sorting makes
# that pairing deterministic.
# NOTE(review): assumes matching train/val/test files sort into the same
# relative order — confirm against the actual file names.
file_list = sorted(glob.glob("data/intermediate/*.txt"))
for file in file_list:
    # The split name is the token that follows "_pred_" in the file name.
    f_type = file.split("_pred_")[-1].split("_")[0]
    if f_type == "train":
        training_src.append(file)
    elif f_type == "val":
        validation_src.append(file)
    elif f_type == "test":
        test_src.append(file)

In [5]:
def read_data(src):
    """Return the full text of the file at ``src`` with every newline removed."""
    with open(src, 'r') as handle:
        raw_text = handle.read()
    return raw_text.replace('\n', '')

In [6]:
def treat_data(data_string):
    """Split a bracketed, comma-separated string into lists of string fields.

    Closing brackets and spaces are removed, the text is cut at every "[",
    and each piece is split on ",". Every piece except the last loses its
    final field (the empty string left by the comma that separated it from
    the next record). Pieces that end up empty are discarded.
    """
    flattened = data_string.replace("]", "").replace(" ", "")
    chunks = [piece.split(",") for piece in flattened.split("[")]
    # Only the last chunk has no trailing separator to strip.
    rows = [fields[:-1] for fields in chunks[:-1]] + [chunks[-1]]
    return [row for row in rows if row]

In [7]:
def split_data(data):
    """Convert rows of string fields into ``(inputs, targets)`` NumPy arrays.

    For every row, all fields but the last become one input vector of floats
    and the last field becomes the float target.
    """
    # Single pass over ``data``; inputs are converted before the target of the same row.
    converted = [([float(field) for field in row[:-1]], float(row[-1])) for row in data]
    inp = np.array([features for features, _ in converted])
    target = np.array([label for _, label in converted])
    return inp, target

In [8]:
def get_data_from_source (src):
    """Read the file at ``src``, parse it, and return ``(inputs, targets)`` arrays."""
    return split_data(treat_data(read_data(src)))

In [37]:
def evaluate(model, data):
    """Compute classification metrics for ``model`` on one data split.

    Parameters
    ----------
    model : object with a ``predict`` method
    data : sequence
        ``data[0]`` is passed to ``model.predict``; ``data[1]`` holds the true labels.

    Returns
    -------
    dict
        Keys: "accuracy", "confusion matrix", "precision", "recall",
        "f1-score" (macro-averaged) and "cohen's kappa".
    """
    y_pred = np.asarray(model.predict(data[0]))
    if y_pred.ndim > 1:
        # One vector of per-class scores per sample: take the best-scoring class.
        predictions = [np.argmax(value) for value in y_pred]
    else:
        # One value per sample: round it to the nearest integer label.
        predictions = [round(value) for value in y_pred]
    # The branch is chosen from the array rank instead of a bare ``except``,
    # so genuine errors raised while converting predictions are not hidden.

    # evaluate predictions
    d = {}
    d["accuracy"] = accuracy_score(data[1], predictions)
    d["confusion matrix"] = confusion_matrix(data[1], predictions)
    d["precision"] = precision_score(data[1], predictions, average='macro')
    d["recall"] = recall_score(data[1], predictions, average='macro')
    d["f1-score"] = f1_score(data[1], predictions, average='macro')
    # d["roc-auc"] = roc_auc_score(data[1], predictions, )
    d["cohen's kappa"] = cohen_kappa_score(data[1], predictions)

    return d

In [36]:
def get_measures(measure, data_type="data", measure_name="Accuracy"):
    """Format one metric as ``"<measure_name> in <data_type>: <value>"`` with two decimals."""
    template = "%s in %s: %.2f"
    fields = (measure_name, data_type, measure)
    return template % fields

In [35]:
def print_measures(evaluation, t):
    """Print every entry of ``evaluation``, labelling scalar metrics with ``t``.

    The "confusion matrix" entry is printed as its name followed by the raw
    value; every other entry is printed as one line built by ``get_measures``.
    """
    for name in evaluation.keys():
        if name == "confusion matrix":
            print(name)
            print(evaluation[name])
            continue
        print (get_measures(evaluation[name], t, name))

In [11]:
def get_data_from_sources (training_src, test_src, validation_src="", random_state=None):
    """Load the train/validation/test splits for one dataset.

    Parameters
    ----------
    training_src, test_src : str
        Paths handed to ``get_data_from_source``.
    validation_src : str, optional
        Path of the validation file. When it is empty, or the file cannot be
        loaded, the validation split is carved out of the test data with
        ``train_test_split``.
    random_state : optional
        Forwarded to ``train_test_split`` so the fallback split can be made
        reproducible. The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(train, val, test)``, each an ``(inputs, targets)`` pair.
    """
    train = get_data_from_source(training_src)
    test = get_data_from_source(test_src)

    val = None
    if validation_src:
        try:
            val = get_data_from_source(validation_src)
        except Exception as err:
            # Keep the best-effort fallback, but say why it was taken instead
            # of silently hiding the failure.
            print("Could not load validation data from %r (%s); splitting the test data instead"
                  % (validation_src, err))

    if val is None:
        inp_test, inp_val, target_test, target_val = train_test_split(*test, random_state=random_state)
        test = (inp_test, target_test)
        val = (inp_val, target_val)
    return train, val, test

In [15]:
# Load every dataset: one (inputs, targets) pair per split and per training file.
train_data, test_data, val_data = [], [], []

ind = 0
for ind in range(len(training_src)):
    # An empty validation path is passed when there is no validation file for this index.
    val_src = validation_src[ind] if ind < len(validation_src) else ""
    train, val, test = get_data_from_sources(training_src[ind], test_src[ind], val_src)
    train_data.append(train)
    test_data.append(test)
    val_data.append(val)

In [16]:
# Number of classes: the largest target value of the first training set, plus one.
first_train_targets = train_data[0][1]
num_classes = int(max(first_train_targets) + 1)

In [17]:
# Compiled Keras models are appended to this list by the following cells.
mlp_models = list()

In [18]:
# MLP with one 1024-unit ELU hidden layer, dropout 0.5 and a softmax output.
model = Sequential([
    Dense(1024, input_shape=(2048,)),
    Activation('elu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adam(lr=0.0001),
    metrics=['accuracy'],
)

mlp_models.append(model)

In [19]:
# MLP with one 512-unit ReLU hidden layer, dropout 0.5 and a softmax output.
model = Sequential([
    Dense(512, input_shape=(2048,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adam(lr=0.0001),
    metrics=['accuracy'],
)

mlp_models.append(model)

In [20]:
# Single softmax layer applied directly to the 2048-dimensional input.
model = Sequential([
    Dense(num_classes, input_shape=(2048,)),
    Activation('softmax'),
])

model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adam(lr=0.0001),
    metrics=['accuracy'],
)

mlp_models.append(model)

In [21]:
# Number of epochs used for every Keras ``fit`` call in grid_search_model.
epochs = 50

In [23]:
# Grid for the SVM benchmark: linear kernel over four values of C.
svm_candidates = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    # {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

In [24]:
# Grid for the random forest benchmark.
rfe_candidates = [
    dict(n_estimators=[10, 100, 1000], criterion=['gini', "entropy"]),
]

In [25]:
# Grid for the AdaBoost benchmark.
adb_candidates = [
    dict(
        n_estimators=[10, 100, 1000],
        learning_rate=[1.0, 0.5, 0.1],
        algorithm=["SAMME.R", "SAMME"],
    ),
]

In [26]:
# Grid for the gradient boosting benchmark.
gbc_candidates = [
    dict(
        n_estimators=[10, 100, 1000],
        learning_rate=[0.05, 0.1, 0.5],
        criterion=["friedman_mse", "mae"],
    ),
]

In [27]:
# Grid for the logistic regression benchmark.
# "l5" is not a penalty LogisticRegression accepts, so it is replaced by "l1".
# The solver is pinned to liblinear because it supports both l1 and l2.
# NOTE(review): "l1" is the presumed intent behind "l5" — confirm.
lr_candidates= [
    {'penalty': ["l1", "l2"], 'solver': ['liblinear']}
]

In [28]:
# Grid for the decision tree benchmark.
# "l1"/"l2" are not split criteria DecisionTreeClassifier accepts; use the same
# two criteria as the random forest grid.
dtc_candidates= [
    {'criterion': ["gini", "entropy"], 'splitter':['best']}
]

In [29]:
# Grid for the XGBoost benchmark (a single configuration).
xgbc_candidates = [
    dict(max_depth=[10], learning_rate=[0.1], n_estimators=[1000]),  # objective, booster
]

In [31]:
# Hyper-parameter grids keyed by the same names used in benchmark_models.
parameter_candidates = {
    "svm": svm_candidates,
    "rfe": rfe_candidates,
    "adb": adb_candidates,
    "gbc": gbc_candidates,
    "lr": lr_candidates,
    "dtc": dtc_candidates,
    "xgbc": xgbc_candidates,
}

In [32]:
# One estimator per benchmark, each created with its default arguments.
rfe = RandomForestClassifier()
adb = AdaBoostClassifier()
gbc = GradientBoostingClassifier()
lr = LogisticRegression()
dtc = DecisionTreeClassifier()#
xgbc = XGBClassifier()
svc = svm.SVC()
# Genetic Programming-based

benchmark_models = {"rfe":rfe, "xgbc":xgbc, "adb":adb, "gbc":gbc, "lr":lr, "dtc":dtc, "svm":svc}
# benchmark_models = {"adb":adb, "gbc":gbc, "lr":lr, "dtc":dtc, "svm":svc}
# The compiled Keras models are registered as "mlp_0", "mlp_1", ...
for x, mlp_model in enumerate(mlp_models):
    benchmark_models["mlp_{}".format(x)] = mlp_model

In [None]:
def _grid_search_estimator(model, param_grid, train, folds, n_jobs):
    """Grid-search ``model`` over ``param_grid``; return (best_estimator_, cv_results_)."""
    clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=n_jobs, cv=folds)
    # Extra keyword arguments to fit() are forwarded to the estimator, so
    # ``n_jobs`` must not be passed here; it belongs to the constructor only.
    clf.fit(*train)
    return clf.best_estimator_, clf.cv_results_


def _kfold_train_keras(model, train, folds):
    """Fit a compiled Keras model fold by fold; return (model, list of evaluate() results)."""
    inputs, targets = train[0], train[1]
    # One-hot width is taken from the model so every fold matches the output
    # layer even when a fold does not contain the highest class.
    n_outputs = model.output_shape[-1]
    fold_results = []
    for train_index, test_index in KFold(n_splits=folds).split(inputs):
        X_train, X_test = inputs[train_index], inputs[test_index]
        y_train = to_categorical(targets[train_index], n_outputs)
        y_test = to_categorical(targets[test_index], n_outputs)
        model.fit(X_train, y_train, epochs = epochs, validation_data = (X_test, y_test))
        # The models are compiled with categorical cross-entropy, so evaluate()
        # needs the one-hot targets, not the raw label vector.
        fold_results.append(model.evaluate(X_train, y=y_train))
    return model, fold_results


def grid_search_model(benchmark_models, parameter_candidates, train, val, test, folds = 4, n_jobs=-1):
    """Fit every benchmark model on ``train`` and print its metrics on all three splits.

    Models whose key appears in ``parameter_candidates`` are tuned with
    ``GridSearchCV``; all other models are treated as compiled Keras models and
    trained with a ``KFold`` loop, using the notebook-level ``epochs`` value.

    Parameters
    ----------
    benchmark_models : dict
        Model name -> estimator / Keras model. The dict itself is not modified.
    parameter_candidates : dict
        Model name -> parameter grid for ``GridSearchCV``.
    train, val, test : tuple
        ``(inputs, targets)`` pairs.
    folds : int
        Number of cross-validation folds.
    n_jobs : int
        Parallelism handed to ``GridSearchCV``. Pass 1 where joblib cannot
        start worker processes.

    Returns
    -------
    tuple
        ``(models, cv_results)``. ``models`` is a new dict with the same keys as
        ``benchmark_models``, where each successfully fitted entry is replaced by
        its fitted model. ``cv_results`` maps model name to ``cv_results_``
        (grid search) or to the per-fold ``evaluate()`` results (Keras).

    Notes
    -----
    A Keras model is the same object on every call and in every fold, so its
    weights keep training across folds and across datasets.
    """
    # Work on a copy: returning the caller's dict would make the results of
    # successive calls alias one another.
    fitted_models = dict(benchmark_models)
    cv_results = {}
    for model_key, model in benchmark_models.items():
        print ((" Model: " + str(model_key)+ " ").center(30, '#'))
        if model_key in parameter_candidates:
            try:
                model, cv_results[model_key] = _grid_search_estimator(
                    model, parameter_candidates[model_key], train, folds, n_jobs)
            except ValueError as err:
                # Same best-effort skip as before, but the cause is reported.
                print("Problem with input shape: {!r}".format(err))
                continue
        else:
            try:
                model, cv_results[model_key] = _kfold_train_keras(model, train, folds)
            except Exception as err:
                print("Problem with input shape: {!r}".format(err))
                continue
        train_eval = evaluate(model, train)
        val_eval = evaluate(model, val)
        test_eval = evaluate(model, test)
        print_measures(train_eval, "Train")
        print_measures(val_eval, "Validation")
        print_measures(test_eval, "Test")

        fitted_models[model_key] = model
    return fitted_models, cv_results

In [40]:
# Run the benchmark on every dataset and collect what grid_search_model returns.
final_models, cv_results = [], []
for ind in range(len(train_data)):
    m, cv = grid_search_model(
        benchmark_models,
        parameter_candidates,
        train_data[ind],
        val_data[ind],
        test_data[ind],
    )
    final_models.append(m)
    cv_results.append(cv)

######### Model: rfe #########


ImportError: [joblib] Attempting to do parallel computing without protecting your import on a system that does not support forking. To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'". Please see the joblib documentation on Parallel for more information