# Ensemble Classifiers: Expanded Featureset
---

### Pre-Training Setup

In [1]:
import sys
sys.path.append('../') # Make parent folder visible
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, fbeta_score, confusion_matrix, roc_curve

from sklearn import svm, linear_model, neighbors, ensemble, naive_bayes, \
    neural_network, tree, gaussian_process, discriminant_analysis
    
import scipy.stats as sp_stats

from preprocess import load_data

In [2]:
# Load the expanded feature set through the project's loader (one_hot disabled, fill_mode='mean').
# NOTE(review): presumably x = feature matrix, y = class labels (FC vs. KD), ids = sample
# identifiers, and fill_mode='mean' imputes missing values with the mean — confirm in
# preprocess/load_data.
x, y, ids = load_data.load_expanded(one_hot=False, fill_mode='mean')

### Helper Functions/Classes

In [15]:
# Compute TN, FP, FN, TP
def compute_stats(y_pred, y_test):
    """Return the confusion-matrix counts of one evaluation split as a flat array.

    Mind the argument order: predictions come first, ground truth second.
    The matrix itself is built as ``confusion_matrix(y_test, y_pred)``, so rows
    are true classes and columns are predicted classes; for a two-class problem
    the flattened result reads [TN, FP, FN, TP].
    """
    return confusion_matrix(y_test, y_pred).ravel()

# Explain TN, FP, FN, TP
def explain_stats(stats):
    """Print how each true class was classified, as counts and percentages.

    Args:
        stats: Indexable of four numbers ordered [TN, FP, FN, TP] with KD as
            the positive class, i.e. [FC->FC, FC->KD, KD->FC, KD->KD]. This is
            the layout produced by compute_stats (or an average of several
            such arrays).

    Percentages are relative to the true-class totals (FC = stats[0] + stats[1],
    KD = stats[2] + stats[3]). A class with no samples is reported as ``nan``
    percent instead of dividing by zero.
    """
    def _percent(count, total):
        # Share of `total` in percent; nan when the class is empty.
        if total == 0:
            return float('nan')
        return (count / total) * 100

    fc_total = stats[0] + stats[1]
    kd_total = stats[2] + stats[3]
    rows = (
        ("FC Classified as FC: ", stats[0], fc_total),
        ("FC Classified as KD: ", stats[1], fc_total),
        ("KD Classified as FC: ", stats[2], kd_total),
        ("KD Classified as KD: ", stats[3], kd_total),
    )
    for label, count, total in rows:
        print(label + str(count) + ", (" + str(_percent(count, total)) + " %)")

# Train and evaluate model using K-Fold CV, print out results
# TODO: return ROC curves from each split
def test_model(model, x, y, threshold=0.5, n_splits=5, random_state=90007):
    """Evaluate `model` with shuffled, stratified K-fold cross-validation.

    For every fold the model is trained on the training part, its returned
    score is recorded, and the thresholded predictions on the held-out part are
    turned into confusion-matrix counts. The per-fold counts, the per-fold
    scores, their mean, and a per-class breakdown of the fold-averaged counts
    are printed. Nothing is returned.

    Args:
        model: Object exposing ``train(x_train, y_train)`` (returns a score)
            and ``predict(x_test, threshold=...)`` (returns predicted labels).
        x: Samples; must support indexing with an integer index array.
        y: Labels aligned with `x`; same indexing requirement.
        threshold: Decision threshold forwarded to ``model.predict``.
        n_splits: Number of cross-validation folds.
        random_state: Seed for the fold shuffling, so splits are reproducible.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    stats_arr = []
    best_scores = []
    for train_idx, test_idx in splitter.split(x, y):
        # The same model object is re-trained from the training part of each fold.
        best_scores.append(model.train(x[train_idx], y[train_idx]))
        y_pred = model.predict(x[test_idx], threshold=threshold)
        stats_arr.append(compute_stats(y_pred, y[test_idx]))
    print('CV Confusion: ', [stats.tolist() for stats in stats_arr])
    print('Best CV scores: ', np.around(best_scores, decimals=4))
    print('Avg best scores: ', np.mean(best_scores))
    # Element-wise mean over folds, hence the fractional "counts" in the report.
    explain_stats(np.mean(stats_arr, axis=0))
    
    
# ScikitModel wrapper class
class ScikitModel:
    """Wrap a scikit-learn estimator in a cross-validated hyperparameter search.

    The wrapper exposes the small ``train`` / ``predict`` interface used by
    ``test_model``: ``train`` runs the search, ``predict`` thresholds the
    positive-class (KD) probability of the fitted search object.
    """

    def __init__(self, skmodel, params, random_search=True, n_iter=10, scoring='roc_auc', beta=1.0, verbose=False,
                 cv=5, random_state=None):
        """Build the search object (nothing is fitted here).

        Args:
            skmodel: Estimator to tune; it must support ``predict_proba`` for
                ``predict`` to work.
            params: Parameter grid / distributions handed to the search.
            random_search: Truthy -> RandomizedSearchCV, falsy -> GridSearchCV.
            n_iter: Number of sampled candidates (randomized search only).
            scoring: 'roc_auc' selects ROC-AUC; any other value selects an
                F-beta scorer built with `beta`.
            beta: Beta of the F-beta scorer (ignored for 'roc_auc').
            verbose: Passed to the search as its verbosity; when truthy,
                ``train`` also prints the best parameters and score.
            cv: Cross-validation setting forwarded to the search.
            random_state: Seed for the randomized search's candidate sampling;
                None keeps the unseeded behaviour. Ignored by the grid search.
        """
        self.skmodel = skmodel
        self.cv_scorer = 'roc_auc' if scoring == 'roc_auc' else make_scorer(fbeta_score, beta=beta)
        self.verbose = verbose
        if random_search:  # Randomized grid search
            self.paramsearch = RandomizedSearchCV(self.skmodel, params, cv=cv,
                                                  n_iter=n_iter,
                                                  scoring=self.cv_scorer,
                                                  verbose=verbose,
                                                  random_state=random_state)
        else:  # Regular grid search
            self.paramsearch = GridSearchCV(self.skmodel, params, cv=cv,
                                            scoring=self.cv_scorer,
                                            verbose=verbose)

    # Run CV fit on x_train, y_train
    def train(self, x_train, y_train):
        """Fit the hyperparameter search and return its best CV score."""
        self.paramsearch.fit(x_train, y_train)
        if self.verbose:
            print('Best params: ', self.paramsearch.best_params_)
            print('Best score: ', self.paramsearch.best_score_)
        return self.paramsearch.best_score_ # return ROC-AUC or f-beta

    # Predict on x_test, return binary y_pred
    def predict(self, x_test, threshold=0.5):
        """Return int32 labels: 1 where the KD probability is >= `threshold`, else 0."""
        y_prob = self.paramsearch.predict_proba(x_test)[:, 1] # probability of KD
        return np.asarray(y_prob >= threshold).astype(np.int32) # thresholding

    # Train on x_train and y_train, and predict on x_test
    def train_test(self, x_train, x_test, y_train, y_test, threshold=0.5):
        """Train on the training data, then return thresholded predictions for `x_test`.

        `y_test` is accepted for interface symmetry but is not used.
        """
        self.train(x_train, y_train)
        return self.predict(x_test, threshold=threshold)

### Test out Candidate Learners

In [10]:
# SVM candidate: randomized search (50 draws) over C, gamma and kernel, scored by ROC-AUC.
# The keyword must be `random_search` -- ScikitModel.__init__ has no `random` parameter.
test_model(ScikitModel(svm.SVC(probability=True),
                       {
                           'C': np.logspace(-3, 3, 100),
                           'gamma': np.logspace(-3, 3, 100),
                           'kernel': ['linear', 'rbf', 'poly']
                       },
                       random_search=True,
                       n_iter=50,
                       scoring='roc_auc',
                       verbose=True),
           x, y,
           threshold=0.45)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  3.0min finished


Best params:  {'kernel': 'rbf', 'gamma': 0.0015199110829529332, 'C': 0.4641588833612782}
Best score:  0.983420648759
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  6.8min finished


Best params:  {'kernel': 'linear', 'gamma': 2.1544346900318843, 'C': 1.0722672220103231}
Best score:  0.977635752532
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:   54.8s finished


Best params:  {'kernel': 'linear', 'gamma': 0.0015199110829529332, 'C': 0.0040370172585965534}
Best score:  0.982350150933
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  4.0min finished


Best params:  {'kernel': 'rbf', 'gamma': 0.0030538555088334154, 'C': 4.9770235643321135}
Best score:  0.978463189718
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best params:  {'kernel': 'linear', 'gamma': 0.0015199110829529332, 'C': 0.016297508346206444}
Best score:  0.981706896582
CV Confusion:  [[115, 10, 13, 151], [114, 11, 6, 158], [110, 14, 12, 152], [115, 9, 4, 160], [107, 17, 6, 158]]
Best CV scores:  [ 0.9834  0.9776  0.9824  0.9785  0.9817]
Avg best scores:  0.980715327705
FC Classified as FC: 112.2, (90.192926045 %)
FC Classified as KD: 12.2, (9.80707395498 %)
KD Classified as FC: 8.2, (5.0 %)
KD Classified as KD: 155.8, (95.0 %)


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  4.7min finished


In [17]:
# Logistic-regression candidate: exhaustive grid search (random_search=False) over five
# log-spaced values of C from 1e-2 to 1e2, scored by ROC-AUC; predictions use a 0.4 threshold.
test_model(ScikitModel(linear_model.LogisticRegression(), 
                       params={
                           'C': np.logspace(-2, 2, 5)
                       },
                       random_search=False,
                       scoring='roc_auc',
                       verbose=True),
           x, y,
           threshold=0.4)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params:  {'C': 0.01}
Best score:  0.983330902425
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.2s finished


Best params:  {'C': 0.01}
Best score:  0.97744783695
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params:  {'C': 1.0}
Best score:  0.982086419115
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.2s finished


Best params:  {'C': 0.10000000000000001}
Best score:  0.978113205694
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params:  {'C': 0.10000000000000001}
Best score:  0.981677474335
CV Confusion:  [[112, 13, 6, 158], [114, 11, 3, 161], [111, 13, 7, 157], [115, 9, 4, 160], [105, 19, 6, 158]]
Best CV scores:  [ 0.9833  0.9774  0.9821  0.9781  0.9817]
Avg best scores:  0.980531167703
FC Classified as FC: 111.4, (89.5498392283 %)
FC Classified as KD: 13.0, (10.4501607717 %)
KD Classified as FC: 5.2, (3.17073170732 %)
KD Classified as KD: 158.8, (96.8292682927 %)


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.2s finished


## 2-Learner Ensemble (SVM + Logistic Regression)

In [21]:
# Base learners; SVC needs probability=True so it can provide predict_proba
clf1 = svm.SVC(probability=True)
clf2 = linear_model.LogisticRegression()

# Soft-voting ensemble over the two base learners, registered as 'svm' and 'lr'
eclf = ensemble.VotingClassifier(
    estimators=[('svm', clf1), ('lr', clf2)],
    voting='soft')

# Hyperparameter search space; the '<name>__' prefix targets the estimator registered under that name
params = {
    'svm__C': np.logspace(-3, 2, 100),
    'svm__gamma': np.logspace(-3, 2, 100),
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'lr__C': np.logspace(-2, 2, 100)
}

# Test model! 5-fold CV with hyperparameter optimization
# (randomized search, 75 candidates; scoring is left at ScikitModel's default, 'roc_auc')
clf = ScikitModel(
    eclf,
    params,
    random_search=True, 
    n_iter=75, 
    verbose=True)

test_model(clf, x, y, threshold=0.5)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=1)]: Done 375 out of 375 | elapsed:  2.1min finished


Best params:  {'svm__kernel': 'linear', 'svm__gamma': 0.26560877829466867, 'svm__C': 0.0015922827933410922, 'lr__C': 0.10235310218990264}
Best score:  0.983499294502
Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=1)]: Done 375 out of 375 | elapsed:  1.9min finished


Best params:  {'svm__kernel': 'linear', 'svm__gamma': 3.8535285937105273, 'svm__C': 2.71858824273294, 'lr__C': 0.014508287784959394}
Best score:  0.978231479603
Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=1)]: Done 375 out of 375 | elapsed:  2.2min finished


Best params:  {'svm__kernel': 'linear', 'svm__gamma': 15.556761439304722, 'svm__C': 0.0025353644939701114, 'lr__C': 39.442060594376599}
Best score:  0.98329789621
Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=1)]: Done 375 out of 375 | elapsed:  2.0min finished


Best params:  {'svm__kernel': 'linear', 'svm__gamma': 0.1668100537200059, 'svm__C': 0.016297508346206444, 'lr__C': 0.033516026509388425}
Best score:  0.978433381268
Fitting 5 folds for each of 75 candidates, totalling 375 fits
Best params:  {'svm__kernel': 'linear', 'svm__gamma': 0.42292428743894989, 'svm__C': 0.0035938136638046258, 'lr__C': 62.802914418342596}
Best score:  0.981996909575
CV Confusion:  [[117, 8, 13, 151], [115, 10, 6, 158], [112, 12, 11, 153], [117, 7, 4, 160], [109, 15, 6, 158]]
Best CV scores:  [ 0.9835  0.9782  0.9833  0.9784  0.982 ]
Avg best scores:  0.981091792232
FC Classified as FC: 114.0, (91.6398713826 %)
FC Classified as KD: 10.4, (8.36012861736 %)
KD Classified as FC: 8.0, (4.87804878049 %)
KD Classified as KD: 156.0, (95.1219512195 %)


[Parallel(n_jobs=1)]: Done 375 out of 375 | elapsed:  2.3min finished
