In [290]:
import pandas as pd
import numpy as np
import random as random
from datetime import datetime
import tensorflow as tf
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn import tree, ensemble
from sklearn import svm, linear_model, neighbors
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

pd.set_option('display.max_columns', 60)

# Load data

In [283]:
data = pd.read_csv("hcv_descriptors.csv", sep='\t')
print("Data size: {}".format(data.shape))
print("Class distribution:")
print(data['ActivityTag'].value_counts())

Data size: (441, 51)
Class distribution:
1    242
0    199
Name: ActivityTag, dtype: int64


In [4]:
targets = data['ActivityTag']
features = data.iloc[:, 11:51].copy()
x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size=0.25, random_state=123)

# Helper functions

In [310]:
def get_model(model_type, args, random_seed=123):
    if (model_type == "LR"): # Logistic Regression
        model = linear_model.LogisticRegression(C=args[0], solver=args[1][0], penalty=args[1][1], dual=False, tol=0.0001, max_iter=10000, random_state=random_seed)
    elif (model_type == "KNN"):
        model = neighbors.KNeighborsClassifier(n_neighbors=args[0], metric=args[1], weights=args[2])
    elif (model_type == "SVM"): # Support Vector Machine
        model = svm.SVC(kernel=args[0], C=args[1], probability=True)
    elif (model_type == "DT"): # Decision Tree
        model = tree.DecisionTreeClassifier(max_features=args[0], criterion=args[1], min_samples_leaf=args[2], random_state=random_seed)
    elif (model_type == "RF"): # Random Forest
        model = ensemble.RandomForestClassifier(n_estimators=args[0], criterion=args[1], min_samples_leaf=args[2], random_state=random_seed)
    elif (model_type == "GBC"): # Gradient Boosting Classifier
        model = ensemble.GradientBoostingClassifier(n_estimators=args[0], learning_rate=args[1], min_samples_leaf=args[2], loss='deviance',  subsample=1.0, random_state=random_seed)
    elif (model_type == "NN"): # Neural Network
        tf.random.set_seed(random_seed)
        model = Sequential()
        model.add(Dense(args[0], input_dim=40, activation='relu'))
        for j in range(args[2]):
            model.add(Dense(args[1], activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    else:
        print("Model type not supported")
    return model


def stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, model_type, metric = ["F1"], random_seed=123):
    print("Model type: {}".format(model_type))
    top_df = pd.DataFrame([[0, 0, 0, 0]], columns=["TN", "FP", "FN", "TP"])

    for i in range(0, len(parameter_list)):
        print("Expanding parameter: {}".format(parameter_list[i]))
        all_results = []
        model_dict = {}
        for row in top_df.itertuples():
            for x in parameter_dict[parameter_list[i]]:
                args = []
                for j in range(1, i+1):
                    args.append(row[4+j])
                args.append(x)
                args = args + default_args[len(args):]
                print(args)

                pairs = list(zip(parameter_list, args))
                desc = "|".join([x + ":" + str(y) for (x,y) in pairs])
                
                model = get_model(model_type, args, random_seed)
                model_dict[desc] = model
                if model_type in ["LR", "KNN", "SVM", "DT", "RF", "GBC"]:
                    model.fit(x_train.values, y_train)
                    predictions = model.predict(x_test.values)
                    cm = metrics.confusion_matrix(y_test, predictions)
                elif (model_type == "NN"):
                    model.fit(x_train, y_train, epochs=30, batch_size=10, verbose=0)
                    predictions = model.predict(x_test)
                    cm = metrics.confusion_matrix(y_test, np.where(predictions < 0.5, 0, 1))
                else:
                    print("Model type not supported")
                    
                results = cm.ravel().tolist()
                for j in range(0, i+1):
                    results.append(args[j])
                results.append(desc)
                all_results.append(results)

        cols = ["TN", "FP", "FN", "TP"] + parameter_list[:i+1] + ["desc"]
        df = pd.DataFrame(all_results, columns=cols)
        df["correct"] = df["TP"] + df["TN"]
        df["errors"] = df["FP"] + df["FN"]
        df["F1"] = df["TP"] / (df["TP"] + 0.5 * (df["FP"] + df["FN"])) 
        df = df.sort_values(by=[metric[0]], ascending=False)
        top_df = df.head(3)
        print(top_df[["TN", "FP", "FN", "TP"] + parameter_list[:i+1] + [metric[0]]])
        
    df2 = top_df[["desc"] + metric]
    df2 = df2.assign(type=model_type)
    df2 = df2[["type", "desc"] + metric]
    model_desc_filtered = {}
    for desc in df2["desc"]:
        model_desc_filtered[desc] = model_dict[desc]
    return df2, model_desc_filtered


def random_parameter_exploration(parameter_list, parameter_dict, model_type, metric = ["F1"], random_seed=123):
    print("Model type: {}".format(model_type))
    random.seed(123)
    top_df = pd.DataFrame([[0, 0, 0, 0]], columns=["TN", "FP", "FN", "TP"])
    all_results = []
    model_dict = {}
    
    list_of_args = []
    for i in range(0, 20):
        args = []
        for param in parameter_list:
            options = parameter_dict[param]
            args.append(random.choice(options))
        list_of_args.append(args)

    for args in list_of_args:
        print(args)
        pairs = list(zip(parameter_list, args))
        desc = "|".join([x + ":" + str(y) for (x,y) in pairs])

        model = get_model(model_type, args, random_seed)
        model_dict[desc] = model
        if model_type in ["LR", "KNN", "SVM", "DT", "RF", "GBC"]:
            model.fit(x_train.values, y_train)
            predictions = model.predict(x_test.values)
            cm = metrics.confusion_matrix(y_test, predictions)
        elif (model_type == "NN"):
            model.fit(x_train, y_train, epochs=30, batch_size=10, verbose=0)
            predictions = model.predict(x_test)
            cm = metrics.confusion_matrix(y_test, np.where(predictions < 0.5, 0, 1))
        else:
            print("Model type not supported")

        results = cm.ravel().tolist()
        results = results + args
        results.append(desc)
        all_results.append(results)

    cols = ["TN", "FP", "FN", "TP"] + parameter_list + ["desc"]
    df = pd.DataFrame(all_results, columns=cols)
    df["correct"] = df["TP"] + df["TN"]
    df["errors"] = df["FP"] + df["FN"]
    df["F1"] = df["TP"] / (df["TP"] + 0.5 * (df["FP"] + df["FN"])) 
    df = df.sort_values(by=[metric[0]], ascending=False)
    top_df = df.head(3)
    print(top_df[["TN", "FP", "FN", "TP"] + parameter_list + [metric[0]]])
        
    df2 = top_df[["desc"] + metric]
    df2 = df2.assign(type=model_type)
    df2 = df2[["type", "desc"] + metric]
    model_desc_filtered = {}
    for desc in df2["desc"]:
        model_desc_filtered[desc] = model_dict[desc]
    return df2, model_desc_filtered


def stepwise_parameter_exploration_cv(parameter_list, parameter_dict, default_args, model_type, metric = ["f1"], random_seed=123):
    print("Model type: {}".format(model_type))
    top_df = pd.DataFrame([[0]], columns=["f1"])

    for i in range(0, len(parameter_list)):
        print("Expanding parameter: {}".format(parameter_list[i]))
        all_results = []
        model_dict = {}
        for row in top_df.itertuples():
            for x in parameter_dict[parameter_list[i]]:
                args = []
                for j in range(1, i+1):
                    args.append(row[j])
                args.append(x)
                args = args + default_args[len(args):]
                print(args)

                pairs = list(zip(parameter_list, args))
                desc = "|".join([x + ":" + str(y) for (x,y) in pairs])
                
                model = get_model(model_type, args, random_seed)
                model_dict[desc] = model
                if model_type in ["LR", "KNN", "SVM", "DT", "RF", "GBC"]:
                    cv_results = cross_validate(model, features, targets, cv=5, scoring=metric)
                    cv_metrics = [np.mean(cv_results['test_{}'.format(x)]) for x in metric]
                elif (model_type == "NN"):
                    kf = KFold(n_splits=5, shuffle = True, random_state = 123)
                    kf.get_n_splits(features)
                    f1s = []; recalls = []; precisions = []
                    for train_index, test_index in kf.split(features):
                        x_train, x_test = features.iloc[train_index], features.iloc[test_index]
                        y_train, y_test = targets.iloc[train_index], targets.iloc[test_index]
                        model.fit(x_train, y_train, epochs=30, batch_size=10, verbose=0)
                        predictions = model.predict(x_test)
                        cm = metrics.confusion_matrix(y_test, np.where(predictions < 0.5, 0, 1))
                        matrix = cm.ravel().tolist() # ["TN", "FP", "FN", "TP"]
                        f1 = matrix[3] / (matrix[3] + 0.5 * (matrix[1] + matrix[2])); f1s.append(f1)
                        if ((matrix[1] + matrix[3]) > 0):
                            precision = matrix[3] / (matrix[1] + matrix[3])
                            precisions.append(precision)
                        if (matrix[0] + matrix[3] > 0):
                            recall = matrix[3] / (matrix[0] + matrix[3])
                            recalls.append(recall)
                    cv_results = {}
                    cv_results["test_f1"] = np.mean(f1s)
                    cv_results["test_recall"] = np.mean(recall)
                    cv_results["test_precision"] = np.mean(precision)
                    cv_metrics = [cv_results['test_{}'.format(x)] for x in metric]
                else:
                    print("Model type not supported")
                    
                results = []
                for j in range(0, i+1):
                    results.append(args[j])
                results.append(desc)
                results = results + cv_metrics
                all_results.append(results)

        cols = parameter_list[:i+1] + ["desc"] + metric
        df = pd.DataFrame(all_results, columns=cols)
        df = df.sort_values(by=[metric[0]], ascending=False)
        top_df = df.head(3)
        print(top_df[parameter_list[:i+1] + metric])
        
    df2 = top_df[["desc"] + metric]
    df2 = df2.assign(type=model_type)
    df2 = df2[["type", "desc"] + metric]
    model_desc_filtered = {}
    for desc in df2["desc"]:
        model_desc_filtered[desc] = model_dict[desc]
    return df2, model_desc_filtered

# Stepwise Parameter Exploration

In [313]:
metric = ["F1"]
random_seed = 321
best_candidates = pd.DataFrame([], columns=["type", "desc"] + metric)

start_time = datetime.now()
parameter_list = ["C", "solver"]
parameter_dict = {"C": [1, 2, 5, 10, 20, 50, 100, 200, 500],
                 "solver": [('newton-cg', 'l2'), ('lbfgs', 'l2'), ('liblinear', 'l2'), ('sag', 'l2'), ('saga', 'l2')]}
default_args = [1, ('newton-cg', 'l2')]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "LR", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("LR", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["neighbours", "dist_type", "weights"]
parameter_dict = {"neighbours": list(range(2,20)), 
                  "dist_type": ["euclidean", "manhattan", "chebyshev", "minkowski"], 
                  "weights": ["uniform", "distance"]}
default_args = [5, "euclidean", "uniform"]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "KNN", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("KNN", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["kernel", "C"]
parameter_dict = {"kernel": ["linear", "poly", "rbf", "sigmoid"], 
                  "C": [1, 2, 5, 10, 20]}
default_args = ["linear", 1]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "SVM", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("SVM", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["max_features", "criterion", "min_samples_leaf"]
parameter_dict = {"max_features": [5, 10, 20, 30, 40], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1,3,5,10,15]}
default_args = [40, "gini", 1]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "DT", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("DT", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["n_estimators", "criterion", "min_samples_leaf"]
parameter_dict = {"n_estimators": [5, 10, 20, 30, 40, 50], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1, 3, 5, 10, 15]}
default_args = [10, "gini", 1]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "RF", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("RF", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["n_est", "learn_rate", "min_samp_leaf"]
parameter_dict = {"n_est": [5, 10, 20, 30, 40, 50], 
                  "learn_rate": [0.1, 0.15, 0.2, 0.25, 0.3], 
                 "min_samp_leaf": [1, 3, 5, 10, 15]}
default_args = [10, 0.1, 1]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "GBC", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("GBC", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["1st_lay_nodes", "hid_lay_nodes", "hid_layers"]
parameter_dict = {"1st_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_layers": [0, 1, 2, 3, 4, 6, 8]}
default_args = [10, 10, 0]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "NN", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("NN", (end_time - start_time)))

Model type: LR
Expanding parameter: C
[1, ('newton-cg', 'l2')]
[2, ('newton-cg', 'l2')]
[5, ('newton-cg', 'l2')]
[10, ('newton-cg', 'l2')]
[20, ('newton-cg', 'l2')]
[50, ('newton-cg', 'l2')]
[100, ('newton-cg', 'l2')]
[200, ('newton-cg', 'l2')]
[500, ('newton-cg', 'l2')]
   TN  FP  FN  TP    C        F1
8  29  11   7  41  500  0.820000
6  27  13   8  40  100  0.792079
7  27  13   8  40  200  0.792079
Expanding parameter: solver
[500, ('newton-cg', 'l2')]
[500, ('lbfgs', 'l2')]
[500, ('liblinear', 'l2')]
[500, ('sag', 'l2')]
[500, ('saga', 'l2')]
[100, ('newton-cg', 'l2')]
[100, ('lbfgs', 'l2')]
[100, ('liblinear', 'l2')]
[100, ('sag', 'l2')]
[100, ('saga', 'l2')]
[200, ('newton-cg', 'l2')]
[200, ('lbfgs', 'l2')]
[200, ('liblinear', 'l2')]
[200, ('sag', 'l2')]
[200, ('saga', 'l2')]
   TN  FP  FN  TP    C           solver    F1
0  29  11   7  41  500  (newton-cg, l2)  0.82
1  29  11   7  41  500      (lbfgs, l2)  0.82
2  29  11   7  41  500  (liblinear, l2)  0.82
Elapsed duration to trai

[35, 5, 3]
[35, 5, 4]
[35, 5, 6]
[35, 5, 8]
[35, 15, 0]
[35, 15, 1]
[35, 15, 2]
[35, 15, 3]
[35, 15, 4]
[35, 15, 6]
[35, 15, 8]
[35, 20, 0]
[35, 20, 1]
[35, 20, 2]
[35, 20, 3]
[35, 20, 4]
[35, 20, 6]
[35, 20, 8]
    TN  FP  FN  TP  1st_lay_nodes  hid_lay_nodes  hid_layers        F1
19  18  22   1  47             35             20           6  0.803419
18  27  13   8  40             35             20           4  0.792079
20  26  14   8  40             35             20           8  0.784314
Elapsed duration to train NN is 0:02:00.887513.


In [314]:
best_candidates = best_candidates.sort_values(by=['F1'], ascending=False)
print(best_candidates.to_string(index=False))

type                                                  desc       F1
 GBC               n_est:20|learn_rate:0.3|min_samp_leaf:1 0.895833
 KNN     neighbours:5|dist_type:manhattan|weights:distance 0.895833
 KNN      neighbours:5|dist_type:manhattan|weights:uniform 0.888889
 GBC               n_est:30|learn_rate:0.3|min_samp_leaf:1 0.888889
 GBC              n_est:30|learn_rate:0.3|min_samp_leaf:15 0.886598
 KNN     neighbours:5|dist_type:chebyshev|weights:distance 0.880000
  RF  n_estimators:40|criterion:entropy|min_samples_leaf:1 0.872340
  RF  n_estimators:20|criterion:entropy|min_samples_leaf:1 0.872340
  RF     n_estimators:40|criterion:gini|min_samples_leaf:1 0.869565
 SVM                                       kernel:poly|C:2 0.868687
 SVM                                       kernel:rbf|C:20 0.865979
 SVM                                       kernel:rbf|C:10 0.862745
  DT max_features:20|criterion:entropy|min_samples_leaf:15 0.857143
  DT max_features:20|criterion:entropy|min_sampl

In [317]:
metric = ["correct", "TP"]
best_candidates = pd.DataFrame([], columns=["type", "desc"] + metric)


parameter_list = ["C", "solver"]
parameter_dict = {"C": [1, 2, 5, 10, 20, 50, 100, 200, 500],
                 "solver": [('newton-cg', 'l2'), ('lbfgs', 'l2'), ('liblinear', 'l2'), ('sag', 'l2'), ('saga', 'l2')]}
default_args = [1, ('newton-cg', 'l2')]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "LR", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["neighbours", "dist_type", "weights"]
parameter_dict = {"neighbours": list(range(2,20)), 
                  "dist_type": ["euclidean", "manhattan", "chebyshev", "minkowski"], 
                  "weights": ["uniform", "distance"]}
default_args = [5, "euclidean", "uniform"]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "KNN", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["kernel", "C"]
parameter_dict = {"kernel": ["linear", "poly", "rbf", "sigmoid"], 
                  "C": [1, 2, 5, 10, 20]}
default_args = ["linear", 1]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "SVM", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["max_features", "criterion", "min_samples_leaf"]
parameter_dict = {"max_features": [5, 10, 20, 30, 40], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1,3,5,10,15]}
default_args = [40, "gini", 1]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "DT", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["n_estimators", "criterion", "min_samples_leaf"]
parameter_dict = {"n_estimators": [5, 10, 20, 30, 40, 50], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1, 3, 5, 10, 15]}
default_args = [10, "gini", 1]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "RF", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["n_est", "learn_rate", "min_samp_leaf"]
parameter_dict = {"n_est": [5, 10, 20, 30, 40, 50], 
                  "learn_rate": [0.1, 0.15, 0.2, 0.25, 0.3], 
                 "min_samp_leaf": [1, 3, 5, 10, 15]}
default_args = [10, 0.1, 1]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "GBC", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["1st_lay_nodes", "hid_lay_nodes", "hid_layers"]
parameter_dict = {"1st_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_layers": [0, 1, 2, 3, 4, 6, 8]}
default_args = [10, 10, 0]
new_candidates, _ = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "NN", metric)
best_candidates = pd.concat([best_candidates, new_candidates])

Model type: LR
Expanding parameter: C
[1, ('newton-cg', 'l2')]
[2, ('newton-cg', 'l2')]
[5, ('newton-cg', 'l2')]
[10, ('newton-cg', 'l2')]
[20, ('newton-cg', 'l2')]
[50, ('newton-cg', 'l2')]
[100, ('newton-cg', 'l2')]
[200, ('newton-cg', 'l2')]
[500, ('newton-cg', 'l2')]
   TN  FP  FN  TP    C  correct
8  29  11   7  41  500       70
6  27  13   8  40  100       67
7  27  13   8  40  200       67
Expanding parameter: solver
[500, ('newton-cg', 'l2')]
[500, ('lbfgs', 'l2')]
[500, ('liblinear', 'l2')]
[500, ('sag', 'l2')]
[500, ('saga', 'l2')]
[100, ('newton-cg', 'l2')]
[100, ('lbfgs', 'l2')]
[100, ('liblinear', 'l2')]
[100, ('sag', 'l2')]
[100, ('saga', 'l2')]
[200, ('newton-cg', 'l2')]
[200, ('lbfgs', 'l2')]
[200, ('liblinear', 'l2')]
[200, ('sag', 'l2')]
[200, ('saga', 'l2')]
   TN  FP  FN  TP    C           solver  correct
0  29  11   7  41  500  (newton-cg, l2)       70
1  29  11   7  41  500      (lbfgs, l2)       70
2  29  11   7  41  500  (liblinear, l2)       70
Model type: KNN


In [318]:
best_candidates = best_candidates.sort_values(by=["correct"], ascending=False)
print(best_candidates.to_string(index=False))

type                                                  desc correct TP
 KNN     neighbours:5|dist_type:manhattan|weights:distance      78 43
  RF  n_estimators:50|criterion:entropy|min_samples_leaf:5      78 44
 GBC               n_est:20|learn_rate:0.3|min_samp_leaf:1      77 42
 KNN      neighbours:5|dist_type:manhattan|weights:uniform      77 44
 GBC              n_est:30|learn_rate:0.3|min_samp_leaf:15      77 43
  RF  n_estimators:30|criterion:entropy|min_samples_leaf:5      77 43
 GBC               n_est:30|learn_rate:0.3|min_samp_leaf:1      76 43
 KNN     neighbours:3|dist_type:euclidean|weights:distance      75 42
 SVM                                       kernel:poly|C:2      75 43
 SVM                                      kernel:poly|C:20      75 39
 SVM                                       kernel:rbf|C:20      75 42
  RF  n_estimators:30|criterion:entropy|min_samples_leaf:3      75 41
  DT max_features:30|criterion:entropy|min_samples_leaf:15      73 37
  DT  max_features:3

# Random Parameter Exploration

In [315]:
metric = ["F1"]
best_candidates = pd.DataFrame([], columns=["type", "desc"] + metric)


parameter_list = ["C", "solver"]
parameter_dict = {"C": [1, 2, 5, 10, 20, 50, 100, 200, 500],
                 "solver": [('newton-cg', 'l2'), ('lbfgs', 'l2'), ('liblinear', 'l2'), ('sag', 'l2'), ('saga', 'l2')]}
new_candidates, _ = random_parameter_exploration(parameter_list, parameter_dict, "LR", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["neighbours", "dist_type", "weights"]
parameter_dict = {"neighbours": list(range(2,20)), 
                  "dist_type": ["euclidean", "manhattan", "chebyshev", "minkowski"], 
                  "weights": ["uniform", "distance"]}
new_candidates, _  = random_parameter_exploration(parameter_list, parameter_dict, "KNN", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["kernel", "C"]
parameter_dict = {"kernel": ["linear", "poly", "rbf", "sigmoid"], 
                  "C": [1, 2, 5, 10, 20]}
new_candidates, _  = random_parameter_exploration(parameter_list, parameter_dict, "SVM", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["max_features", "criterion", "min_samples_leaf"]
parameter_dict = {"max_features": [5, 10, 20, 30, 40], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1,3,5,10,15]}
new_candidate, _ = random_parameter_exploration(parameter_list, parameter_dict, "DT", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["n_estimators", "criterion", "min_samples_leaf"]
parameter_dict = {"n_estimators": [5, 10, 20, 30, 40, 50], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1, 3, 5, 10, 15]}
new_candidates, _  = random_parameter_exploration(parameter_list, parameter_dict, "RF", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["n_est", "learn_rate", "min_samp_leaf"]
parameter_dict = {"n_est": [5, 10, 20, 30, 40, 50], 
                  "learn_rate": [0.1, 0.15, 0.2, 0.25, 0.3], 
                 "min_samp_leaf": [1, 3, 5, 10, 15]}
new_candidates, _  = random_parameter_exploration(parameter_list, parameter_dict, "GBC", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["1st_lay_nodes", "hid_lay_nodes", "hid_layers"]
parameter_dict = {"1st_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_layers": [0, 1, 2, 3, 4, 6, 8]}
new_candidates, _  = random_parameter_exploration(parameter_list, parameter_dict, "NN", metric)
best_candidates = pd.concat([best_candidates, new_candidates])

Model type: LR
[1, ('liblinear', 'l2')]
[2, ('sag', 'l2')]
[20, ('newton-cg', 'l2')]
[1, ('sag', 'l2')]
[500, ('saga', 'l2')]
[50, ('liblinear', 'l2')]
[1, ('lbfgs', 'l2')]
[5, ('liblinear', 'l2')]
[500, ('liblinear', 'l2')]
[10, ('lbfgs', 'l2')]
[1, ('sag', 'l2')]
[2, ('saga', 'l2')]
[100, ('newton-cg', 'l2')]
[1, ('liblinear', 'l2')]
[200, ('newton-cg', 'l2')]
[1, ('newton-cg', 'l2')]
[5, ('lbfgs', 'l2')]
[1, ('liblinear', 'l2')]
[100, ('saga', 'l2')]
[200, ('liblinear', 'l2')]
    TN  FP  FN  TP    C           solver        F1
8   29  11   7  41  500  (liblinear, l2)  0.820000
4   28  12   7  41  500       (saga, l2)  0.811881
19  27  13   8  40  200  (liblinear, l2)  0.792079
Model type: KNN
[3, 'chebyshev', 'uniform']
[15, 'chebyshev', 'uniform']
[3, 'minkowski', 'distance']
[12, 'euclidean', 'uniform']
[6, 'chebyshev', 'distance']
[9, 'manhattan', 'uniform']
[15, 'euclidean', 'distance']
[4, 'euclidean', 'distance']
[16, 'euclidean', 'uniform']
[4, 'manhattan', 'uniform']
[2, 'ch

In [316]:
best_candidates = best_candidates.sort_values(by=['F1'], ascending=False)
print(best_candidates.to_string(index=False))

type                                                 desc       F1
  RF n_estimators:50|criterion:entropy|min_samples_leaf:5 0.897959
 GBC              n_est:5|learn_rate:0.25|min_samp_leaf:1 0.891089
 KNN    neighbours:4|dist_type:euclidean|weights:distance 0.886598
 KNN    neighbours:4|dist_type:minkowski|weights:distance 0.886598
  RF n_estimators:40|criterion:entropy|min_samples_leaf:5 0.886598
  RF n_estimators:40|criterion:entropy|min_samples_leaf:5 0.886598
 GBC             n_est:50|learn_rate:0.3|min_samp_leaf:15 0.875000
 GBC              n_est:30|learn_rate:0.1|min_samp_leaf:5 0.875000
 KNN    neighbours:7|dist_type:chebyshev|weights:distance 0.871287
 SVM                                      kernel:poly|C:2 0.868687
 SVM                                      kernel:poly|C:2 0.868687
 SVM                                      kernel:poly|C:5 0.857143
 SVM                                      kernel:poly|C:5 0.857143
 SVM                                       kernel:rbf|C:5 0.85

# Cross validation

In [304]:
metric = ["f1"]
best_candidates = pd.DataFrame([], columns=["type", "desc"] + metric)


parameter_list = ["C", "solver"]
parameter_dict = {"C": [1, 2, 5, 10, 20, 50, 100, 200, 500],
                 "solver": [('newton-cg', 'l2'), ('lbfgs', 'l2'), ('liblinear', 'l2'), ('sag', 'l2'), ('saga', 'l2')]}
default_args = [1, ('newton-cg', 'l2')]
new_candidates, _ = stepwise_parameter_exploration_cv(parameter_list, parameter_dict, default_args, "LR", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["neighbours", "dist_type", "weights"]
parameter_dict = {"neighbours": list(range(2,20)), 
                  "dist_type": ["euclidean", "manhattan", "chebyshev", "minkowski"], 
                  "weights": ["uniform", "distance"]}
default_args = [5, "euclidean", "uniform"]
new_candidates, _ = stepwise_parameter_exploration_cv(parameter_list, parameter_dict, default_args, "KNN", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["kernel", "C"]
parameter_dict = {"kernel": ["linear", "poly", "rbf", "sigmoid"], 
                  "C": [1, 2, 5, 10, 20]}
default_args = ["linear", 1]
new_candidates, _ = stepwise_parameter_exploration_cv(parameter_list, parameter_dict, default_args, "SVM", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["max_features", "criterion", "min_samples_leaf"]
parameter_dict = {"max_features": [5, 10, 20, 30, 40], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1,3,5,10,15]}
default_args = [40, "gini", 1]
new_candidates, _ = stepwise_parameter_exploration_cv(parameter_list, parameter_dict, default_args, "DT", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["n_estimators", "criterion", "min_samples_leaf"]
parameter_dict = {"n_estimators": [5, 10, 20, 30, 40, 50], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1, 3, 5, 10, 15]}
default_args = [10, "gini", 1]
new_candidates, _ = stepwise_parameter_exploration_cv(parameter_list, parameter_dict, default_args, "RF", metric)
best_candidates = pd.concat([best_candidates, new_candidates])


parameter_list = ["n_est", "learn_rate", "min_samp_leaf"]
parameter_dict = {"n_est": [5, 10, 20, 30, 40, 50], 
                  "learn_rate": [0.1, 0.15, 0.2, 0.25, 0.3], 
                 "min_samp_leaf": [1, 3, 5, 10, 15]}
default_args = [10, 0.1, 1]
new_candidates, _ = stepwise_parameter_exploration_cv(parameter_list, parameter_dict, default_args, "GBC", metric)
best_candidates = pd.concat([best_candidates, new_candidates])

parameter_list = ["1st_lay_nodes", "hid_lay_nodes", "hid_layers"]
parameter_dict = {"1st_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_layers": [0, 1, 2, 3, 4, 6, 8]}
default_args = [10, 10, 0]
new_candidates, _ = stepwise_parameter_exploration_cv(parameter_list, parameter_dict, default_args, "NN", metric)
best_candidates = pd.concat([best_candidates, new_candidates])

Model type: LR
Expanding parameter: C
[1, ('newton-cg', 'l2')]
[2, ('newton-cg', 'l2')]
[5, ('newton-cg', 'l2')]
[10, ('newton-cg', 'l2')]
[20, ('newton-cg', 'l2')]
[50, ('newton-cg', 'l2')]
[100, ('newton-cg', 'l2')]
[200, ('newton-cg', 'l2')]
[500, ('newton-cg', 'l2')]
    C        f1
0   1  0.750466
1   2  0.749350
3  10  0.749294
Expanding parameter: solver
[1, ('newton-cg', 'l2')]
[1, ('lbfgs', 'l2')]
[1, ('liblinear', 'l2')]
[1, ('sag', 'l2')]
[1, ('saga', 'l2')]
[2, ('newton-cg', 'l2')]
[2, ('lbfgs', 'l2')]
[2, ('liblinear', 'l2')]
[2, ('sag', 'l2')]
[2, ('saga', 'l2')]
[10, ('newton-cg', 'l2')]
[10, ('lbfgs', 'l2')]
[10, ('liblinear', 'l2')]
[10, ('sag', 'l2')]
[10, ('saga', 'l2')]
   C           solver        f1
0  1  (newton-cg, l2)  0.750466
1  1      (lbfgs, l2)  0.750466
3  1        (sag, l2)  0.750466
Model type: KNN
Expanding parameter: neighbours
[2, 'euclidean', 'uniform']
[3, 'euclidean', 'uniform']
[4, 'euclidean', 'uniform']
[5, 'euclidean', 'uniform']
[6, 'euclidea

In [305]:
best_candidates = best_candidates.sort_values(by=['f1'], ascending=False)
print(best_candidates.to_string(index=False))

type                                                  desc       f1
  NN        1st_lay_nodes:40|hid_lay_nodes:15|hid_layers:6 0.869929
  NN        1st_lay_nodes:40|hid_lay_nodes:10|hid_layers:6 0.867583
  NN        1st_lay_nodes:40|hid_lay_nodes:10|hid_layers:2 0.860956
 KNN      neighbours:5|dist_type:euclidean|weights:uniform 0.782899
 KNN      neighbours:5|dist_type:minkowski|weights:uniform 0.782899
 KNN      neighbours:9|dist_type:chebyshev|weights:uniform 0.781350
 SVM                                       kernel:poly|C:2 0.763185
 SVM                                       kernel:poly|C:5 0.761657
 SVM                                       kernel:rbf|C:20 0.761553
  RF  n_estimators:50|criterion:entropy|min_samples_leaf:3 0.756585
  DT max_features:30|criterion:entropy|min_samples_leaf:10 0.754791
  RF      n_estimators:5|criterion:gini|min_samples_leaf:3 0.752343
  LR                        C:1|solver:('newton-cg', 'l2') 0.750466
  LR                            C:1|solver:('lbf

# Final evaluation datasets

In [257]:
targets = data['ActivityTag']
features = data.iloc[:, 11:51].copy()
x_traintest, x_test_final, y_traintest, y_test_final = train_test_split(features, targets, test_size=0.2, random_state=123)
x_train, x_test, y_train, y_test = train_test_split(x_traintest, y_traintest, test_size=0.25, random_state=123)

In [307]:
metric = ["F1"]
random_seed = 321
best_candidates = pd.DataFrame([], columns=["type", "desc"] + metric)
mixed_model_dict = {}

start_time = datetime.now()
parameter_list = ["C", "solver"]
parameter_dict = {"C": [1, 2, 5, 10, 20, 50, 100, 200, 500],
                 "solver": [('newton-cg', 'l2'), ('lbfgs', 'l2'), ('liblinear', 'l2'), ('sag', 'l2'), ('saga', 'l2')]}
default_args = [1, ('newton-cg', 'l2')]
new_candidates, model_dict = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "LR", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
mixed_model_dict["LR"] = model_dict
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("LR", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["neighbours", "dist_type", "weights"]
parameter_dict = {"neighbours": list(range(2,20)), 
                  "dist_type": ["euclidean", "manhattan", "chebyshev", "minkowski"], 
                  "weights": ["uniform", "distance"]}
default_args = [5, "euclidean", "uniform"]
new_candidates, model_dict = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "KNN", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
mixed_model_dict["KNN"] = model_dict
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("KNN", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["kernel", "C"]
parameter_dict = {"kernel": ["linear", "poly", "rbf", "sigmoid"], 
                  "C": [1, 2, 5, 10, 20]}
default_args = ["linear", 1]
new_candidates, model_dict = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "SVM", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
mixed_model_dict["SVM"] = model_dict
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("SVM", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["max_features", "criterion", "min_samples_leaf"]
parameter_dict = {"max_features": [5, 10, 20, 30, 40], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1,3,5,10,15]}
default_args = [40, "gini", 1]
new_candidates, model_dict = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "DT", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
mixed_model_dict["DT"] = model_dict
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("DT", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["n_estimators", "criterion", "min_samples_leaf"]
parameter_dict = {"n_estimators": [5, 10, 20, 30, 40, 50], 
                  "criterion":["gini", "entropy"],
                 "min_samples_leaf": [1, 3, 5, 10, 15]}
default_args = [10, "gini", 1]
new_candidates, model_dict = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "RF", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
mixed_model_dict["RF"] = model_dict
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("RF", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["n_est", "learn_rate", "min_samp_leaf"]
parameter_dict = {"n_est": [5, 10, 20, 30, 40, 50], 
                  "learn_rate": [0.1, 0.15, 0.2, 0.25, 0.3], 
                 "min_samp_leaf": [1, 3, 5, 10, 15]}
default_args = [10, 0.1, 1]
new_candidates, model_dict = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "GBC", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
mixed_model_dict["GBC"] = model_dict
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("GBC", (end_time - start_time)))


start_time = datetime.now()
parameter_list = ["1st_lay_nodes", "hid_lay_nodes", "hid_layers"]
parameter_dict = {"1st_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_lay_nodes": [5, 10, 15, 20, 25, 30, 35, 40], 
                    "hid_layers": [0, 1, 2, 3, 4, 6, 8]}
default_args = [10, 10, 0]
new_candidates, model_dict = stepwise_parameter_exploration(parameter_list, parameter_dict, default_args, "NN", metric, random_seed)
best_candidates = pd.concat([best_candidates, new_candidates])
mixed_model_dict["NN"] = model_dict
end_time = datetime.now()
print("Elapsed duration to train {} is {}.".format("NN", (end_time - start_time)))

Model type: LR
Expanding parameter: C
[1, ('newton-cg', 'l2')]
[2, ('newton-cg', 'l2')]
[5, ('newton-cg', 'l2')]
[10, ('newton-cg', 'l2')]
[20, ('newton-cg', 'l2')]
[50, ('newton-cg', 'l2')]
[100, ('newton-cg', 'l2')]
[200, ('newton-cg', 'l2')]
[500, ('newton-cg', 'l2')]
   TN  FP  FN  TP    C        F1
8  29  11   7  41  500  0.820000
6  27  13   8  40  100  0.792079
7  27  13   8  40  200  0.792079
Expanding parameter: solver
[500, ('newton-cg', 'l2')]
[500, ('lbfgs', 'l2')]
[500, ('liblinear', 'l2')]
[500, ('sag', 'l2')]
[500, ('saga', 'l2')]
[100, ('newton-cg', 'l2')]
[100, ('lbfgs', 'l2')]
[100, ('liblinear', 'l2')]
[100, ('sag', 'l2')]
[100, ('saga', 'l2')]
[200, ('newton-cg', 'l2')]
[200, ('lbfgs', 'l2')]
[200, ('liblinear', 'l2')]
[200, ('sag', 'l2')]
[200, ('saga', 'l2')]
   TN  FP  FN  TP    C           solver    F1
0  29  11   7  41  500  (newton-cg, l2)  0.82
1  29  11   7  41  500      (lbfgs, l2)  0.82
2  29  11   7  41  500  (liblinear, l2)  0.82
Elapsed duration to trai

[35, 5, 3]
[35, 5, 4]
[35, 5, 6]
[35, 5, 8]
[35, 15, 0]
[35, 15, 1]
[35, 15, 2]
[35, 15, 3]
[35, 15, 4]
[35, 15, 6]
[35, 15, 8]
[35, 20, 0]
[35, 20, 1]
[35, 20, 2]
[35, 20, 3]
[35, 20, 4]
[35, 20, 6]
[35, 20, 8]
    TN  FP  FN  TP  1st_lay_nodes  hid_lay_nodes  hid_layers        F1
19  18  22   1  47             35             20           6  0.803419
18  27  13   8  40             35             20           4  0.792079
20  26  14   8  40             35             20           8  0.784314
Elapsed duration to train NN is 0:02:00.099473.


In [309]:
def parse_args(desc):
    args = []
    arg_vals = desc.split("|")
    for arg_val in arg_vals:
        val = arg_val.split(":")[1]
        if val.startswith("("):
            val = val[1:-1]
            val = val.replace("'", "").replace(" ", "")
            val = tuple(val.split(","))
        args.append(val)
    return args

metric=["F1"]
all_results=[]
top_df = pd.DataFrame([[0, 0, 0, 0]], columns=["TN", "FP", "FN", "TP"])
for model_type in mixed_model_dict.keys():
    for desc in mixed_model_dict[model_type].keys():
        args = parse_args(desc)
        model = mixed_model_dict[model_type][desc]
        if model_type in ["LR", "KNN", "SVM", "DT", "RF", "GBC"]:
            predictions = model.predict(x_test_final.values)
            cm = metrics.confusion_matrix(y_test_final, predictions)
        elif (model_type == "NN"):
            predictions = model.predict(x_test_final)
            cm = metrics.confusion_matrix(y_test_final, np.where(predictions < 0.5, 0, 1))
        else:
            print("Model type not supported")
            
        results = cm.ravel().tolist()
        results.append(model_type)
        results.append(desc)
        all_results.append(results)

cols = ["TN", "FP", "FN", "TP", "type", "desc"]
df = pd.DataFrame(all_results, columns=cols)
df["correct"] = df["TP"] + df["TN"]
df["errors"] = df["FP"] + df["FN"]
df["F1"] = df["TP"] / (df["TP"] + 0.5 * (df["FP"] + df["FN"])) 
df = df.sort_values(by=[metric[0]], ascending=False)
print(df[["type", "desc"] + [metric[0]]].to_string(index=False))

type                                                  desc       F1
 SVM                                       kernel:rbf|C:10 0.901961
 SVM                                       kernel:rbf|C:20 0.900000
 GBC               n_est:20|learn_rate:0.3|min_samp_leaf:1 0.897959
 GBC              n_est:30|learn_rate:0.3|min_samp_leaf:15 0.893204
  RF  n_estimators:20|criterion:entropy|min_samples_leaf:1 0.884211
 KNN      neighbours:5|dist_type:manhattan|weights:uniform 0.882353
 SVM                                       kernel:poly|C:2 0.871287
  RF     n_estimators:40|criterion:gini|min_samples_leaf:1 0.868687
  RF  n_estimators:40|criterion:entropy|min_samples_leaf:1 0.868687
  NN        1st_lay_nodes:35|hid_lay_nodes:20|hid_layers:4 0.867925
  NN        1st_lay_nodes:35|hid_lay_nodes:20|hid_layers:8 0.862745
 KNN     neighbours:5|dist_type:manhattan|weights:distance 0.860000
 GBC               n_est:30|learn_rate:0.3|min_samp_leaf:1 0.857143
 KNN     neighbours:5|dist_type:chebyshev|weight