# Ensemble Classifiers: Expanded Featureset

In [64]:
import sys
sys.path.append('../') # Make parent folder visible
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score

from sklearn import svm, linear_model, neighbors, ensemble, naive_bayes, \
    neural_network, tree, gaussian_process, discriminant_analysis

from preprocess import load_data

In [60]:
# Beta for the F-beta score that the grid-search scorers optimize:
# 0 < beta < 1 favors precision, beta > 1 (up to infinity) favors recall.
BETA = 5 # 0-1 favors precision, >1 (up to infinity) favors recall
# CLASS_WEIGHT = "balanced"

In [3]:
# Load the expanded feature set through the project's preprocess.load_data helper.
# x and y are later indexed with fold index arrays and passed to StratifiedKFold /
# GridSearchCV, so they are presumably NumPy arrays of features and labels -- TODO confirm.
# `ids` is not used by any cell visible in this notebook.
x, y, ids = load_data.load_expanded(one_hot=False, fill_mode='mean')

In [4]:
# Count confusion-matrix cells: TN, FP, FN, TP
def compute_stats(y_pred, y_test):
    """Count the four confusion-matrix cells for a set of binary predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels. An entry equal to 1 is counted as the positive
        class; any other value is counted as negative.
    y_test : array-like
        True labels. A 1-D array is used as-is; for an array with more than
        one dimension only column 1 is read (presumably a one-hot encoding
        with the positive class in column 1 -- confirm against the loader).

    Returns
    -------
    list of int
        ``[TN, FP, FN, TP]``. The list always has exactly four entries; a
        cell that never occurs is reported as 0 instead of being dropped,
        so positions never shift between folds.
    """
    y_true = np.asarray(y_test)
    if y_true.ndim > 1:
        y_true = y_true[:, 1]

    actual_pos = y_true.ravel() == 1
    predicted_pos = np.asarray(y_pred).ravel() == 1

    # Count each cell explicitly. Counting unique (true, pred) rows would
    # silently omit any combination that does not occur in this fold.
    true_neg = np.count_nonzero(~actual_pos & ~predicted_pos)
    false_pos = np.count_nonzero(~actual_pos & predicted_pos)
    false_neg = np.count_nonzero(actual_pos & ~predicted_pos)
    true_pos = np.count_nonzero(actual_pos & predicted_pos)
    return [int(true_neg), int(false_pos), int(false_neg), int(true_pos)]

# Print a labelled breakdown of the four confusion-matrix counts
def explain_stats(stats):
    """Print each confusion-matrix count with its share of the true class.

    Parameters
    ----------
    stats : sequence of four numbers
        Counts in the order: FC classified as FC, FC classified as KD,
        KD classified as FC, KD classified as KD.

    Each line shows the count and its percentage of the row total
    (entries 0 and 1 share one total, entries 2 and 3 share the other).
    """
    fc_total = stats[0] + stats[1]
    kd_total = stats[2] + stats[3]

    # (label, count, denominator) for each printed line, in output order.
    report_rows = (
        ("FC Classified as FC: ", stats[0], fc_total),
        ("FC Classified as KD: ", stats[1], fc_total),
        ("KD Classified as FC: ", stats[2], kd_total),
        ("KD Classified as KD: ", stats[3], kd_total),
    )
    for label, count, total in report_rows:
        percent = (count / total) * 100
        print(label + str(count) + ", (" + str(percent) + " %)")

# Cross-validate a model wrapper and print the fold-averaged results
def test_model(model, x, y):
    """Run 5-fold stratified cross-validation and print averaged statistics.

    Parameters
    ----------
    model : object
        Anything exposing ``train_test(x_train, x_test, y_train, y_test)``
        that returns predictions for ``x_test``.
    x, y : indexable by integer index arrays
        Samples and labels; they are split with a shuffled StratifiedKFold
        using a fixed random_state so the folds are the same on every run.

    The per-fold outputs of ``compute_stats`` are averaged element-wise and
    handed to ``explain_stats`` for printing. Nothing is returned.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=90007)
    fold_stats = []
    for train_idx, test_idx in splitter.split(x, y):
        x_train, y_train = x[train_idx], y[train_idx]
        x_test, y_test = x[test_idx], y[test_idx]
        fold_pred = model.train_test(x_train, x_test, y_train, y_test)
        fold_stats.append(compute_stats(fold_pred, y_test))
    mean_stats = np.mean(fold_stats, axis=0)
    explain_stats(mean_stats)

## Single SVM Model

In [40]:
# ScikitModel wrapper class
class ScikitModel:
    """Wrap a scikit-learn estimator in a 5-fold grid search scored by F-beta.

    Parameters
    ----------
    skmodel : estimator
        The scikit-learn estimator to tune.
    params : dict
        Parameter grid passed to GridSearchCV (an empty dict evaluates the
        estimator's current settings as the single candidate).
    verbose : bool or int, default False
        Forwarded to GridSearchCV as its verbosity level; any truthy value
        also prints the best parameters after each fit.
    """

    def __init__(self, skmodel, params, verbose=False):
        self.skmodel = skmodel
        self.cv_scorer = make_scorer(fbeta_score, beta=BETA) # optimize for fbeta_score
        # Forward the caller's verbosity instead of forcing logging on, so
        # verbose=False really silences the grid search.
        self.paramsearch = GridSearchCV(self.skmodel, params, cv=5, scoring=self.cv_scorer, verbose=verbose)
        self.verbose = verbose

    def train_test(self, x_train, x_test, y_train, y_test):
        """Fit the grid search on the training split and predict the test split.

        ``y_test`` is accepted for interface compatibility with ``test_model``
        but is not used here. Returns the predictions of the refit best
        estimator for ``x_test``.
        """
        self.paramsearch.fit(x_train, y_train)
        # Truthiness check so integer verbosity levels above 1 still report.
        if self.verbose:
            print("Best params: ", self.paramsearch.best_params_)
        return self.paramsearch.predict(x_test)

In [32]:
# Hyperparameter grid for the single-SVM experiment: 4 C values x 3 kernels
params = dict(
    C=[.1, 1.0, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
)

# Outer 5-fold CV; each fold runs its own inner grid search over `params`
clf = ScikitModel(svm.SVC(), params, verbose=1)
test_model(clf, x, y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    7.0s finished


Best params:  {'C': 0.1, 'kernel': 'linear'}
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    8.7s finished


Best params:  {'C': 1.0, 'kernel': 'linear'}
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    5.6s finished


Best params:  {'C': 1.0, 'kernel': 'poly'}
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    6.7s finished


Best params:  {'C': 1.0, 'kernel': 'rbf'}
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params:  {'C': 10, 'kernel': 'linear'}
FC Classified as FC: 112.0, (90.0321543408 %)
FC Classified as KD: 12.4, (9.96784565916 %)
KD Classified as FC: 8.0, (4.87804878049 %)
KD Classified as KD: 156.0, (95.1219512195 %)


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   10.4s finished


## Test out Various Models

In [68]:
# KNN baseline with a fixed n_neighbors=7. The empty params grid makes GridSearchCV
# evaluate a single candidate, i.e. no hyperparameter search is performed here.
# TODO: tweak n_neighbors or supply a real grid.
test_model(ScikitModel(neighbors.KNeighborsClassifier(n_neighbors=7), params={}, verbose=1), x, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


Best params:  {}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


Best params:  {}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


Best params:  {}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


Best params:  {}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best params:  {}
FC Classified as FC: 109.6, (88.1028938907 %)
FC Classified as KD: 14.8, (11.8971061093 %)
KD Classified as FC: 6.8, (4.14634146341 %)
KD Classified as KD: 157.2, (95.8536585366 %)


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


## KNN-SVM-LogReg Ensemble

In [61]:
# 3-way voting ensemble: KNN, SVM, LR
class SKEnsembleModel:
    """Soft-voting ensemble of KNN, SVM and logistic regression.

    Each member is tuned independently with a 5-fold grid search scored by
    F-beta (beta taken from the module-level BETA). Predictions are made by
    averaging the three members' probabilities for class index 1 and
    rounding the mean to 0 or 1.

    Parameters
    ----------
    knn_params, svm_params, logreg_params : dict
        Parameter grids for the respective GridSearchCV instances.
    verbose : bool or int, default False
        Forwarded to each GridSearchCV as its verbosity level; any truthy
        value also prints the best parameters of every member after fitting.
    """

    def __init__(self, knn_params, svm_params, logreg_params, verbose=False):
        self.knn = neighbors.KNeighborsClassifier()
        # probability=True is required so the SVM exposes predict_proba.
        self.svm = svm.SVC(probability=True)
        self.logreg = linear_model.LogisticRegression()
        self.cv_scorer = make_scorer(fbeta_score, beta=BETA) # optimize for fbeta_score
        self.knn_paramsearch = GridSearchCV(self.knn, knn_params, cv=5, scoring=self.cv_scorer, verbose=verbose)
        self.svm_paramsearch = GridSearchCV(self.svm, svm_params, cv=5, scoring=self.cv_scorer, verbose=verbose)
        self.logreg_paramsearch = GridSearchCV(self.logreg, logreg_params, cv=5, scoring=self.cv_scorer, verbose=verbose)
        self.verbose = verbose

    def train_test(self, x_train, x_test, y_train, y_test):
        """Tune all three members on the training split and predict the test split.

        ``y_test`` is accepted for interface compatibility with ``test_model``
        but is not used here.

        Returns
        -------
        numpy.ndarray
            Float array of 0.0 / 1.0 predictions, one per row of ``x_test``.
        """
        self.knn_paramsearch.fit(x_train, y_train)
        self.svm_paramsearch.fit(x_train, y_train)
        self.logreg_paramsearch.fit(x_train, y_train)
        # Truthiness check so integer verbosity levels above 1 still report.
        if self.verbose:
            print("Best KNN params: ", self.knn_paramsearch.best_params_)
            print("Best SVM params: ", self.svm_paramsearch.best_params_)
            print("Best LogReg params: ", self.logreg_paramsearch.best_params_)
        # Column 1 of predict_proba is the probability of the second class.
        knn_pred = self.knn_paramsearch.predict_proba(x_test)[:,1]
        svm_pred = self.svm_paramsearch.predict_proba(x_test)[:,1]
        logreg_pred = self.logreg_paramsearch.predict_proba(x_test)[:,1]
        y_pred = np.mean(np.vstack((knn_pred, svm_pred, logreg_pred)), axis=0) # stack and average
        # np.around rounds half to even, so a mean of exactly 0.5 becomes 0.
        y_pred_binary = np.around(y_pred) # binarize
        return y_pred_binary

In [62]:
# Hyperparameter grids, one per ensemble member
knn_params = dict(n_neighbors=[3, 5, 7, 9])
svm_params = dict(
    C=[.1, 1.0, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
)
logreg_params = dict(C=[0.1, 1.0, 10, 100])

# Outer 5-fold CV; every fold re-tunes all three members before voting
clf = SKEnsembleModel(knn_params, svm_params, logreg_params, verbose=1)
print()
test_model(clf, x, y)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.9s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   32.9s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.3s finished


Best KNN params:  {'n_neighbors': 5}
Best SVM params:  {'C': 0.1, 'kernel': 'poly'}
Best LogReg params:  {'C': 1.0}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.3s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   42.8s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best KNN params:  {'n_neighbors': 9}
Best SVM params:  {'C': 0.1, 'kernel': 'poly'}
Best LogReg params:  {'C': 10}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.1s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   32.1s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.2s finished


Best KNN params:  {'n_neighbors': 7}
Best SVM params:  {'C': 0.1, 'kernel': 'poly'}
Best LogReg params:  {'C': 0.1}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.9s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   40.2s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best KNN params:  {'n_neighbors': 9}
Best SVM params:  {'C': 0.1, 'kernel': 'poly'}
Best LogReg params:  {'C': 0.1}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.8s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   32.2s finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best KNN params:  {'n_neighbors': 9}
Best SVM params:  {'C': 0.1, 'kernel': 'poly'}
Best LogReg params:  {'C': 0.1}
FC Classified as FC: 112.8, (90.6752411576 %)
FC Classified as KD: 11.6, (9.32475884244 %)
KD Classified as FC: 8.0, (4.87804878049 %)
KD Classified as KD: 156.0, (95.1219512195 %)


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.1s finished
