In [1]:
cd ..

/home/vmadmin/pass


In [2]:
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, make_scorer, confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

from database.utils import get_train_test_data
from pipelines.feature_extractor import get_feature_extractor
from pipelines.models import get_ensemble_model
from feature_extraction.features import get_glove_w2v
from evaluation.metrics import class_report

from copy import deepcopy
import numpy as np
import pandas as pd
import pprint
import time

%load_ext autoreload
%autoreload 2

In [3]:
w2v = get_glove_w2v()
train_test_data = get_train_test_data()
feature_extractor = get_feature_extractor(w2v)

Found 400000 word vectors.


In [4]:
ensemble = get_ensemble_model(w2v)

In [7]:
ensemble = get_ensemble_model(w2v)
ensemble.steps = ensemble.steps[2:]

In [None]:
%%time

models = [("lr", LogisticRegression(C=0.1, penalty='l2', solver='lbfgs', n_jobs=-1)),
           ("nb", BernoulliNB(alpha=5.0)),
           ("rf", RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, n_jobs=-1)),
           ("xgb", XGBClassifier(n_estimators=300, max_depth=8, n_jobs=-1)),
           ("et", ExtraTreesClassifier(n_estimators=300, max_depth=10, min_samples_split=10, n_jobs=-1)),
           ("svm", SVC(C=100, gamma=0.0001, probability=True)),
           ("ensemble", ensemble),
           #("nbsvm", )
         ]

results = {}

for Xr_train, y_train, Xr_test, y_test, indicator in train_test_data:
    X_train = feature_extractor.fit_transform(Xr_train, y_train)
    X_test = feature_extractor.transform(Xr_test)
    
    results[indicator] = {}
    for name, classifier in models:
        results[indicator][name] = {}
        
        cv = StratifiedKFold(n_splits=5, random_state=42)
        scores = []
        conf_mat = np.zeros((2, 2))      # Binary classification
        false_pos = set()
        false_neg = set()
        train_times = []
        predict_times = []
        
        for dev_i, val_i in cv.split(X_train, y_train):
            clf = deepcopy(classifier)
            X_dev, X_val = X_train[dev_i], X_train[val_i]
            y_dev, y_val = y_train[dev_i], y_train[val_i]
            ts = time.time()

            clf.fit(X_dev, y_dev)
            te = time.time()
            
            train_times.append(te - ts)
            
            ts = time.time()
            y_pprobs = clf.predict_proba(X_val)       # Predicted probabilities
            te = time.time()
            
            predict_times.append(te - ts)
            
            y_plabs = np.squeeze(clf.predict(X_val))  # Predicted class labels

            scores.append(roc_auc_score(y_val, y_pprobs[:, 1]))
            confusion = confusion_matrix(y_val, y_plabs)
            conf_mat += confusion

            # Collect indices of false positive and negatives
            fp_i = np.where((y_plabs==1) & (y_val==0))[0]
            fn_i = np.where((y_plabs==0) & (y_val==1))[0]
            false_pos.update(val_i[fp_i])
            false_neg.update(val_i[fn_i])

        classifier.fit(X_train, y_train)
        y_scores_test = classifier.predict_proba(X_test)
        results[indicator][name]['test_roc_auc'] = roc_auc_score(y_test, y_scores_test[:, 1])        
            
        print("\n[%s][%s] Mean score: %0.2f (+/- %0.2f)" % (indicator, name, np.mean(scores), np.std(scores) * 2))
        #conf_mat /= 5
        #print("Mean CM: \n", conf_mat)
        #print("\nMean classification measures: \n")
        measures = class_report(conf_mat)
        for metric in measures:
            results[indicator][name][metric] = measures[metric]
        
        results[indicator][name]['mean_roc_auc'] = np.mean(scores)
        results[indicator][name]['std_roc_auc'] = np.std(scores) * 2
        results[indicator][name]['train_time'] = np.mean(train_times)
        results[indicator][name]['predict_time'] = np.mean(predict_times)
        
        #pprint.pprint(measures)


[sleep][lr] Mean score: 0.78 (+/- 0.12)

[sleep][nb] Mean score: 0.88 (+/- 0.14)

[sleep][rf] Mean score: 0.88 (+/- 0.13)


In [160]:
pd.DataFrame(results['physical_activity']).T

Unnamed: 0,accuracy,f1score,mean_roc_auc,precision,predict_time,sensitivity,specificity,std_roc_auc,test_roc_auc,train_time
ensemble,0.792489,0.810048,0.913305,0.78547,0.790065,0.836215,0.743354,0.155069,0.889569,17.886176
et,0.76649,0.790497,0.905339,0.782051,0.312031,0.799127,0.726395,0.164566,0.824836,2.629413
lr,0.728455,0.755844,0.804628,0.746154,0.00098,0.765789,0.683031,0.228363,0.811466,2.614345
nb,0.773231,0.788315,0.847282,0.749573,0.005132,0.83128,0.713307,0.320457,0.771623,0.046894
rf,0.771786,0.790636,0.909587,0.764957,0.310849,0.818099,0.720244,0.147509,0.848782,2.882965
svm,0.761675,0.780682,0.855451,0.752991,0.460819,0.810488,0.708081,0.204334,0.825784,10.888322
xgb,0.800674,0.81435,0.910588,0.776068,0.098998,0.856604,0.74238,0.170339,0.900501,7.183596


In [161]:
pd.DataFrame(results['sleep']).T

Unnamed: 0,accuracy,f1score,mean_roc_auc,precision,predict_time,sensitivity,specificity,std_roc_auc,test_roc_auc,train_time
ensemble,0.765847,0.800441,0.890135,0.746914,0.458728,0.862233,0.650568,0.204734,0.69304,11.061856
et,0.771022,0.823881,0.88112,0.851852,0.210925,0.797688,0.716535,0.238809,0.731502,2.335816
lr,0.712807,0.773931,0.784453,0.781893,0.000685,0.766129,0.617329,0.184962,0.604469,8.373171
nb,0.796895,0.821793,0.90251,0.744856,0.00306,0.916456,0.671958,0.163856,0.642784,0.007031
rf,0.771022,0.808649,0.876677,0.769547,0.211289,0.851936,0.664671,0.254129,0.68,2.480585
svm,0.68564,0.756757,0.777757,0.777778,0.054736,0.736842,0.584615,0.17548,0.593553,1.116513
xgb,0.734799,0.773481,0.862617,0.720165,0.028538,0.835322,0.615819,0.283987,0.672088,2.057591


In [162]:
pd.DataFrame(results['sedentary_behaviour']).T

Unnamed: 0,accuracy,f1score,mean_roc_auc,precision,predict_time,sensitivity,specificity,std_roc_auc,test_roc_auc,train_time
ensemble,0.715917,0.813245,0.740807,0.857941,0.92983,0.772975,0.487395,0.283471,0.800246,16.078074
et,0.688381,0.806908,0.640884,0.903121,0.413453,0.729222,0.347962,0.35567,0.773181,2.461106
lr,0.659503,0.778603,0.564141,0.830461,0.000846,0.73284,0.33211,0.263578,0.701132,2.339622
nb,0.76595,0.841049,0.740467,0.858873,0.004322,0.82395,0.590541,0.357476,0.75786,0.011948
rf,0.702485,0.808308,0.675707,0.870051,0.415059,0.754747,0.445328,0.333309,0.777234,2.83668
svm,0.683009,0.807896,0.595116,0.924546,0.541317,0.717383,0.232227,0.278361,0.735958,12.855562
xgb,0.702821,0.800181,0.712136,0.825338,0.066427,0.776512,0.461207,0.251878,0.804717,6.191699


In [20]:
%%time

models = [("lr", LogisticRegression(C=0.1, penalty='l2', solver='lbfgs', n_jobs=-1)),
           ("nb", BernoulliNB(alpha=5.0)),
           ("rf", RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, n_jobs=-1)),
           ("xgb", XGBClassifier(n_estimators=300, max_depth=8, n_jobs=-1)),
           ("et", ExtraTreesClassifier(n_estimators=300, max_depth=10, min_samples_split=10, n_jobs=-1)),
           ("svm", SVC(C=100, gamma=0.0001, probability=True)),
           ("ensemble", ensemble),
           #("nbsvm", )
         ]

results = {}

for X_raw, y, indicator in data:
    X = feature_extractor.fit_transform(X_raw, y)
    results[indicator] = {}
    for name, classifier in models:
        results[indicator][name] = {}
        
        cv = StratifiedKFold(n_splits=5, random_state=42)
        scores = []
        conf_mat = np.zeros((2, 2))      # Binary classification
        false_pos = set()
        false_neg = set()
        train_times = []
        
        for train_i, val_i in cv.split(X, y):
            clf = deepcopy(classifier)
            X_train, X_val = X[train_i], X[val_i]
            y_train, y_val = y[train_i], y[val_i]
            ts = time.time()

            clf.fit(X_train, y_train)
            te = time.time()
            
            train_times.append(te - ts)
            
            y_pprobs = clf.predict_proba(X_val)       # Predicted probabilities
            y_plabs = np.squeeze(clf.predict(X_val))  # Predicted class labels

            scores.append(roc_auc_score(y_val, y_pprobs[:, 1]))
            confusion = confusion_matrix(y_val, y_plabs)
            conf_mat += confusion

            # Collect indices of false positive and negatives
            fp_i = np.where((y_plabs==1) & (y_val==0))[0]
            fn_i = np.where((y_plabs==0) & (y_val==1))[0]
            false_pos.update(val_i[fp_i])
            false_neg.update(val_i[fn_i])

        print("\n[%s][%s] Mean score: %0.2f (+/- %0.2f)" % (indicator, name, np.mean(scores), np.std(scores) * 2))
        #conf_mat /= 5
        #print("Mean CM: \n", conf_mat)
        #print("\nMean classification measures: \n")
        measures = class_report(conf_mat)
        for metric in measures:
            results[indicator][name][metric] = measures[metric]
        
        results[indicator][name]['mean_roc_auc'] = np.mean(scores)
        results[indicator][name]['std_roc_auc'] = np.std(scores) * 2
        results[indicator][name]['train_time'] = np.mean(train_times)
        #pprint.pprint(measures)


[sleep][lr] Mean score: 0.87 (+/- 0.04)

[sleep][nb] Mean score: 0.95 (+/- 0.02)

[sleep][rf] Mean score: 0.90 (+/- 0.06)

[sleep][xgb] Mean score: 0.86 (+/- 0.03)

[sleep][et] Mean score: 0.90 (+/- 0.04)

[sleep][svm] Mean score: 0.86 (+/- 0.04)

[sleep][ensemble] Mean score: 0.92 (+/- 0.05)

[sedentary_behaviour][lr] Mean score: 0.87 (+/- 0.15)

[sedentary_behaviour][nb] Mean score: 0.93 (+/- 0.16)

[sedentary_behaviour][rf] Mean score: 0.85 (+/- 0.26)

[sedentary_behaviour][xgb] Mean score: 0.90 (+/- 0.16)

[sedentary_behaviour][et] Mean score: 0.86 (+/- 0.23)

[sedentary_behaviour][svm] Mean score: 0.86 (+/- 0.16)

[sedentary_behaviour][ensemble] Mean score: 0.90 (+/- 0.19)

[physical_activity][lr] Mean score: 0.88 (+/- 0.06)

[physical_activity][nb] Mean score: 0.97 (+/- 0.03)

[physical_activity][rf] Mean score: 0.88 (+/- 0.10)

[physical_activity][xgb] Mean score: 0.86 (+/- 0.07)

[physical_activity][et] Mean score: 0.84 (+/- 0.11)

[physical_activity][svm] Mean score: 0.86 (+/

In [21]:
pd.DataFrame(results['physical_activity']).T

Unnamed: 0,accuracy,f1score,mean_roc_auc,precision,sensitivity,specificity,std_roc_auc,train_time
ensemble,0.951934,0.975319,0.902961,1.0,0.951827,1.0,0.067394,9.435405
et,0.948066,0.973341,0.839654,0.998255,0.94964,0.0,0.110316,2.243759
lr,0.953591,0.976,0.879777,0.993601,0.959012,0.62069,0.06369,0.793034
nb,0.957459,0.977969,0.97051,0.994183,0.962275,0.705882,0.033964,0.008714
rf,0.949171,0.973878,0.881364,0.997673,0.951192,0.428571,0.09511,2.445561
svm,0.951934,0.975319,0.855466,1.0,0.951827,1.0,0.061755,0.966141
xgb,0.949724,0.974126,0.856645,0.99651,0.952725,0.5,0.073139,3.182969


In [43]:
pd.DataFrame(results['sedentary_behaviour']).T

Unnamed: 0,accuracy,f1score,mean_roc_auc,precision,sensitivity,specificity,std_roc_auc,train_time
ensemble,0.937127,0.966639,0.904497,0.996084,0.938884,0.88,0.187142,12.326693
et,0.934739,0.965517,0.860333,0.99913,0.934093,0.963636,0.234821,2.352192
lr,0.93275,0.964081,0.872131,0.986945,0.942252,0.716981,0.153806,0.926467
nb,0.958615,0.977547,0.92724,0.985205,0.970009,0.810056,0.157551,0.016102
rf,0.935137,0.965691,0.854135,0.998259,0.935181,0.933333,0.255848,2.608832
svm,0.929964,0.962885,0.856396,0.993473,0.934124,0.782609,0.15678,4.435725
xgb,0.935535,0.965823,0.900458,0.996084,0.937346,0.873239,0.162439,5.330778


In [44]:
pd.DataFrame(results['sleep']).T

Unnamed: 0,accuracy,f1score,mean_roc_auc,precision,sensitivity,specificity,std_roc_auc,train_time
ensemble,0.928571,0.962445,0.918315,0.991899,0.93469,0.64,0.048971,9.363486
et,0.930233,0.96351,0.902579,0.9982,0.93115,0.846154,0.039138,2.297962
lr,0.930233,0.96319,0.86739,0.989199,0.938514,0.636364,0.041203,0.737107
nb,0.937708,0.966048,0.948957,0.960396,0.971767,0.584906,0.018239,0.007519
rf,0.928571,0.962641,0.89746,0.9973,0.930311,0.769231,0.063915,2.426127
svm,0.927741,0.962256,0.858036,0.9982,0.928811,0.8,0.041497,0.939432
xgb,0.92608,0.960982,0.862861,0.986499,0.936752,0.558824,0.025773,3.227897


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from feature_extraction.nbsvm import NBSVM, NbSvmClassifier
import re, string

re_tok = re.compile('([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()


tfidf = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )


nbsvm = NbSvmClassifier(C=4, dual=True, n_jobs=-1)

X_tfidf = tfidf.fit_transform(X_raw)
cross_val_score(nbsvm, X_tfidf, y, scoring='roc_auc', cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True))

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


array([ 0.80977106,  0.72996681,  0.79706022,  0.89414414,  0.81781782])