In [1]:
%matplotlib widget

In [2]:
import pandas as pd
import numpy as np
import random # shuffle the subjects
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM, SVC
from sklearn.covariance import EllipticEnvelope

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.decomposition import PCA

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_curve, auc

import xgboost as xgb

from sklearn.model_selection import RandomizedSearchCV

In [3]:
def LOSO_runner(data, model, train_masks, validate_masks, test_masks, test=False):    
    # seperate the features out
    feats = data.drop(['Subject', 'Activity', 'Label'], axis=1)
    
    # metrics
    accu = []
    prec = []
    rec = []
    f1 = []
    
    # return the models
    mdls = []
    
    # iterate over groups of N_lo subjects to leave out
    if test:
        itr = zip(train_masks, test_masks)
    else:
        itr = zip(train_masks, validate_masks)
    for train_mask, pred_mask in itr:
        clm = clone(model)
        clm.fit(feats.loc[train_mask], data.Label[train_mask])
        
        y_pred = clm.predict(feats.loc[pred_mask])
        y_true = data.Label[pred_mask]
        
        accu.append(accuracy_score(y_true, y_pred, normalize=True))
        prec.append(precision_score(y_true, y_pred))
        rec.append(recall_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))
        
        mdls.append(clm)
    
    print(f'Average Accuracy: {np.mean(accu):.2f} ({np.std(accu):.2f})')
    print(f'Average Precision: {np.mean(prec):.2f} ({np.std(prec):.2f})')
    print(f'Average Recall: {np.mean(rec):.2f} ({np.std(rec):.2f})')
    print(f'Average F1 Score: {np.mean(f1):.2f} ({np.std(f1):.2f})')
        
    return mdls, feats.columns, accu, prec, rec, f1

In [4]:
def xgb_LOSO_runner(data, train_masks, validate_masks, test_masks, test=False):
    feats = data.drop(['Subject', 'Activity', 'Label'], axis=1)
    labels = data.Label
    
    # metrics
    accu = []
    prec = []
    rec = []
    f1 = []
    
    # return the models
    mdls = []
    
    # iterate over groups of N_lo subjects to leave out    
    for train_mask, val_mask, test_mask in zip(train_masks, validate_masks, test_masks):
        dtrain = xgb.DMatrix(feats.loc[train_mask], label=labels.loc[train_mask])
        dval = xgb.DMatrix(feats.loc[val_mask], label=labels.loc[val_mask])
        if test:
            dtest = xgb.DMatrix(feats.loc[test_mask], label=labels.loc[test_mask])

        params = {
            'max_depth': 8,  # default
            'eval_metric': ['rmse', 'auc']
        }
        eval_list = [(dval, 'eval'), (dtrain, 'train')]

        bst = xgb.train(params, dtrain, 10, eval_list, verbose_eval=False)
        
        if test:
            y_pred = bst.predict(dtest) > 0.5
            y_true = labels.loc[test_mask]
        else:
            y_pred = bst.predict(dval) > 0.5
            y_true = labels.loc[val_mask]
        
        accu.append(accuracy_score(y_true, y_pred, normalize=True))
        prec.append(precision_score(y_true, y_pred))
        rec.append(recall_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))
        
        mdls.append(bst)
        
    print(f'Average Accuracy: {np.mean(accu):.2f} ({np.std(accu):.2f})')
    print(f'Average Precision: {np.mean(prec):.2f} ({np.std(prec):.2f})')
    print(f'Average Recall: {np.mean(rec):.2f} ({np.std(rec):.2f})')
    print(f'Average F1 Score: {np.mean(f1):.2f} ({np.std(f1):.2f})')
        
    return mdls, feats.columns, accu, prec, rec, f1

In [5]:
def unary_LOSO_runner(data, model, train_masks, validate_masks, test_masks, test=False):
    feats = data.drop(['Subject', 'Activity', 'Label'], axis=1)
    labels = data.Label
    
    # metrics
    accu = []
    prec = []
    rec = []
    f1 = []
    
    # return the models
    mdls = []
    
    # iterate over groups of N_lo subjects to leave out    
    if test:
        itr = zip(train_masks, test_masks)
    else:
        itr = zip(train_masks, validate_masks)
    for train_mask, pred_mask in itr:
        clm = clone(model)
        clm.fit(feats.loc[train_mask & (labels == 1)])
        
        y_pred = clm.predict(feats.loc[pred_mask])
        y_pred[y_pred == -1] = 0
        y_true = data.Label[pred_mask]
        
        accu.append(accuracy_score(y_true, y_pred, normalize=True))
        prec.append(precision_score(y_true, y_pred))
        rec.append(recall_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))
        
        mdls.append(clm)
        
    print(f'Average Accuracy: {np.mean(accu):.2f} ({np.std(accu):.2f})')
    print(f'Average Precision: {np.mean(prec):.2f} ({np.std(prec):.2f})')
    print(f'Average Recall: {np.mean(rec):.2f} ({np.std(rec):.2f})')
    print(f'Average F1 Score: {np.mean(f1):.2f} ({np.std(f1):.2f})')
        
    return mdls, feats.columns, accu, prec, rec, f1

In [6]:
def plot_RF_feature_importances(models, features):
    ft_impt = pd.DataFrame(columns=['Feature', 'Importance'])
    ft_impt['Importance'] = [i for L in models for i in L.feature_importances_]
    ft_impt['Feature'] = np.tile(features, len(models))

    f, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(x='Feature', y='Importance', data=ft_impt, ax=ax, width=1, scale='width', bw=0.25)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    f.tight_layout()

In [7]:
def plot_xgb_feature_gains(models):
    df = pd.DataFrame(columns=['Feature', 'Split', 'Gain'])
    
    df['Feature'] = [i for d in models for i in d.get_score(importance_type='gain')]
    df['Split'] = np.repeat(np.arange(len(models)), len(models[0].get_score()))
    df['Gain'] = [i for d in models for i in d.get_score(importance_type='gain').values()]
    
    f, ax = plt.subplots(figsize=(10, 8))
    sns.boxplot(x='Feature', y='Gain', data=df)
    
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    f.tight_layout()

In [3]:
data = pd.read_hdf('../feature_exploration/features.h5', key='no_preprocessing')

In [4]:
# get the subjects for which LOSO actually makes sense: those with multiple activities (ie more than just walking)
gbc = data.groupby(['Subject', 'Activity'], as_index=False).count()
loso_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(5)  # fix the generation so that its the same every time
random.shuffle(loso_subjects)

training_masks = []
validation_masks = []
testing_masks = []

for i in range(0, len(loso_subjects), 3):
    tr_m = np.ones(data.shape[0], dtype='bool')
    v_m = np.zeros(data.shape[0], dtype='bool')
    
    for j in range(3):
        tr_m &= data.Subject != loso_subjects[i+j]
    for j in range(2):
        v_m |= data.Subject == loso_subjects[i+j]
    te_m = data.Subject == loso_subjects[i+2]
    
    training_masks.append(tr_m)
    validation_masks.append(v_m)
    testing_masks.append(te_m)

masks = (training_masks, validation_masks, testing_masks)

### Domain expert feature selection

In [21]:
de_data = data.drop([
    'Mean',
    'Skewness',
    'Kurtosis',
    'RMS',
    'LinearSlope',
    'ComplexityInvariantDistance',
    'RangeCountPercentage',
    'RatioBeyondRSigma',
    'JerkMetric',
    'DominantFrequencyValue',
    'DetailPower',
    'SignalEntropy',
    'SpectralFlatness'
], axis=1)

### Scale features

In [33]:
rs_data = data.copy()
rs_data.iloc[:, 3:] = RobustScaler().fit_transform(data.iloc[:, 3:])
ss_data = data.copy()
ss_data.iloc[:, 3:] = StandardScaler().fit_transform(data.iloc[:, 3:])

### PCA

In [61]:
pca = PCA(n_components=0.95)
pca_data = data.iloc[:, :3].copy()
pca_data = pca_data.merge(pd.DataFrame(pca.fit_transform(rs_data.iloc[:, 3:])), left_index=True, right_index=True)

# Binary Classifiers

## Random Forest

In [62]:
RF = RandomForestClassifier(n_estimators=20)

rf_mdls, fts, *rf_metrics = LOSO_runner(data, RF, training_masks, validation_masks, testing_masks, test=False)
print('\nRobust Scaling')
rf_rs_mdls, rs_fts, *rf_rs_metrics = LOSO_runner(rs_data, RF, *masks, test=False)
print('\nStandard Scaling')
rf_ss_mdls, ss_fts, *rf_ss_metrics = LOSO_runner(ss_data, RF, *masks, test=False)
print('\nPCA (robust scaling)')
rf_pca_mdls, pca_fts, *rf_pca_metrics = LOSO_runner(pca_data, RF, *masks, test=False)

Average Accuracy: 0.92 (0.03)
Average Precision: 0.84 (0.09)
Average Recall: 0.81 (0.15)
Average F1 Score: 0.81 (0.10)

Robust Scaling
Average Accuracy: 0.92 (0.03)
Average Precision: 0.85 (0.09)
Average Recall: 0.81 (0.14)
Average F1 Score: 0.82 (0.09)

Standard Scaling
Average Accuracy: 0.92 (0.03)
Average Precision: 0.85 (0.09)
Average Recall: 0.81 (0.15)
Average F1 Score: 0.82 (0.09)

PCA (robust scaling)
Average Accuracy: 0.89 (0.04)
Average Precision: 0.76 (0.11)
Average Recall: 0.77 (0.14)
Average F1 Score: 0.76 (0.10)


In [36]:
plot_RF_feature_importances(rf_mdls, fts)
plot_RF_feature_importances(rf_rs_mdls, fts)
plot_RF_feature_importances(rf_ss_mdls, fts)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
# data_subset = data.drop(['Skewness', 'Kurtosis', 'Autocorrelation', 'LinearSlope', 'SignalEntropy', 
#                          'ComplexityInvariantDistance', 'RangeCountPercentage', 'RatioBeyondRSigma',
#                          'SpectralFlatness', 'Mean', 'MeanCrossRate', 'DominantFrequencyValue', 'RMS', 'DetailPowerRatio'], axis=1)
data_subset = data.drop([
    'LinearSlope',
    'RatioBeyondRSigma',
    'SpectralFlatness',
    'ComplexityInvariantDistance',
    'Kurtosis',
    'Autocorrelation',
    'DominantFrequencyValue',
    'SignalEntropy',
    'RangeCountPercentage'
], axis=1)

In [13]:
# check performance after dropping some of the less important features
rfsub_mdls, sub_fts, *rfsub_metrics = LOSO_runner(data_subset, RF, training_masks, validation_masks, testing_masks, test=False)

Average Accuracy: 0.92 (0.03)
Average Precision: 0.84 (0.09)
Average Recall: 0.81 (0.14)
Average F1 Score: 0.81 (0.09)


In [14]:
plot_RF_feature_importances(rfsub_mdls, sub_fts)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [22]:
# check performance using DE features
rfde_mdls, de_fts, *rfde_metrics = LOSO_runner(de_data, RF, training_masks, validation_masks, testing_masks, test=False)

Average Accuracy: 0.91 (0.04)
Average Precision: 0.80 (0.10)
Average Recall: 0.80 (0.14)
Average F1 Score: 0.79 (0.09)


## SVM Classifier

In [18]:
svc = SVC(C=1.0, kernel='rbf')

svc_mdls, fts, *svc_metrics = LOSO_runner(data_subset, svc, training_masks, validation_masks, testing_masks, test=False)

Average Accuracy: 0.87 (0.05)
Average Precision: 0.82 (0.05)
Average Recall: 0.88 (0.15)
Average F1 Score: 0.84 (0.09)


## XGBoost

In [63]:
print('No Scaling')
xgb_mdls, xgb_fts, xgb_accu, xgb_prec, xgb_rec, xgb_f1 = xgb_LOSO_runner(data, training_masks, validation_masks, testing_masks, test=False)
print('\nRobust Scaling')
xgb_rs_mdls, xgb_rs_fts, *xgb_rs_metr = xgb_LOSO_runner(rs_data, *masks, test=False)
print('\nStandard Scaling')
xgb_ss_mdls, xgb_ss_fts, *xgb_ss_metr = xgb_LOSO_runner(ss_data, *masks, test=False)
print('\nPCA (robust scaling)')
xgb_pca_mdls, xgb_pca_fts, *xgb_pca_metr = xgb_LOSO_runner(pca_data, *masks, test=False)

No Scaling
Average Accuracy: 0.91 (0.04)
Average Precision: 0.81 (0.10)
Average Recall: 0.82 (0.15)
Average F1 Score: 0.80 (0.09)

Robust Scaling
Average Accuracy: 0.91 (0.04)
Average Precision: 0.81 (0.10)
Average Recall: 0.82 (0.15)
Average F1 Score: 0.80 (0.09)

Standard Scaling
Average Accuracy: 0.91 (0.04)
Average Precision: 0.81 (0.10)
Average Recall: 0.82 (0.15)
Average F1 Score: 0.80 (0.09)

PCA (robust scaling)
Average Accuracy: 0.88 (0.04)
Average Precision: 0.71 (0.13)
Average Recall: 0.81 (0.14)
Average F1 Score: 0.75 (0.10)


In [17]:
# data_subset_2 = data.drop(['Mean', 'RatioBeyondRSigma', 'SpectralEntropy', 'SpectralFlatness', 'LinearSlope', 'RMS', 'Skewness', 
#                            'Kurtosis', 'Autocorrelation', 'DominantFrequencyValue', 'IQR', 
#                            'MeanCrossRate', 'ComplexityInvariantDistance', 'RangeCountPercentage', 'SignalEntropy', 
#                            'DimensionlessJerk', 'DetailPowerRatio', 'JerkMetric', 'SampleEntropy'], axis=1)
data_subset_2 = data.drop([
    'LinearSlope',
    'SPARC',
    'RatioBeyondRSigma',
    'Mean',
    'Kurtosis', 
    'PowerSpectralSum',
    'IQR',
    'RangeCountPercentage',
    'DominantFrequencyValue',
    'ComplexityInvariantDistance'
], axis=1)

In [18]:
xgb_s_mdls, xgb_s_fts, xgb_s_accu, xgb_s_prec, xgb_s_rec, xgb_s_f1 = xgb_LOSO_runner(data_subset_2, training_masks, validation_masks, testing_masks, test=False)

Average Accuracy: 0.91 (0.04)
Average Precision: 0.79 (0.11)
Average Recall: 0.84 (0.14)
Average F1 Score: 0.80 (0.09)


In [19]:
plot_xgb_feature_gains(xgb_s_mdls)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [23]:
xgb_de_mdls, xgb_de_fts, xgb_de_accu, xgb_de_prec, xgb_de_rec, xgb_de_f1 = xgb_LOSO_runner(de_data, training_masks, validation_masks, testing_masks, test=False)

Average Accuracy: 0.90 (0.04)
Average Precision: 0.76 (0.12)
Average Recall: 0.83 (0.15)
Average F1 Score: 0.78 (0.10)


# One-class Classifiers

## Isolation Forest

In [17]:
IF = IsolationForest(n_estimators=20)

if_mdls, if_fts, *if_metr = unary_LOSO_runner(data, IF, *masks, test=False)

Average Accuracy: 0.71 (0.11)
Average Precision: 0.44 (0.20)
Average Recall: 0.84 (0.12)
Average F1 Score: 0.56 (0.18)


## One-class SVM

In [20]:
usvc = OneClassSVM(kernel='rbf')

usvc_mdls, _, *usvc_metr = unary_LOSO_runner(data, usvc, *masks, test=False)

Average Accuracy: 0.80 (0.05)
Average Precision: 0.54 (0.16)
Average Recall: 0.44 (0.19)
Average F1 Score: 0.46 (0.16)


# XGBoost Random Hyperparameter search

In [12]:
# take top half of features based on PP score
dtophalf = data.drop([
    'Skewness',
    'Kurtosis',
    'LinearSlope',
    'SpectralFlatness',
    'Autocorrelation',
    'RangeCountPercentage',
    'ComplexityInvariantDistance',
    'PowerSpectralSum',
    'RatioBeyondRSigma',
    'SignalEntropy',
    'DominantFrequencyValue',
    'JerkMetric',  # add mean cross rate, remove Jerkmetric (correlation with DimensionlessJerk)
    'StdDev'  # add mean, remove StdDev (high correlation with RMS)
], axis=1)

In [13]:
from scipy.stats import uniform, loguniform, randint
from sklearn.metrics import make_scorer

In [14]:
estimator = xgb.XGBClassifier()
param_distributions = {
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': loguniform(1e-6, 1e-1),
    'booster': ['gbtree', 'gblinear', 'dart'],
}
n_iter = 15 
scoring = make_scorer(f1_score)
n_jobs = -1  # use all possible cores
refit = False  # don't want to refit on the whole dataset afterwards
verbose = 2
cv = ((training_masks[i], validation_masks[i]) for i in range(len(training_masks)))

In [15]:
clf = RandomizedSearchCV(
    estimator,
    param_distributions,
    cv=cv,
    n_iter=n_iter,
    scoring=scoring,
    n_jobs=n_jobs,
    refit=refit,
    verbose=verbose
)

In [16]:
search = clf.fit(dtophalf.iloc[:, 3:], dtophalf.Label)

Fitting 15 folds for each of 15 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed: 12.3min finished


In [18]:
cv_results = pd.DataFrame(data=search.cv_results_)
cv_results.to_csv('xgboost_cv_results_topfeats.csv')
cv_results.sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_learning_rate,param_max_depth,params,split0_test_score,split1_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
5,28.989912,0.777379,0.027051,0.007716,gbtree,0.0951225,5,"{'booster': 'gbtree', 'learning_rate': 0.09512...",0.90178,0.894281,...,0.913772,0.906854,0.75834,0.601783,0.797901,0.675041,0.549296,0.785832,0.113698,1
0,55.253169,1.908233,0.039049,0.011232,gbtree,0.0114093,9,"{'booster': 'gbtree', 'learning_rate': 0.01140...",0.886945,0.850673,...,0.902429,0.883607,0.732016,0.592488,0.816368,0.633138,0.545301,0.76394,0.113126,2
7,41.492019,1.127223,0.030755,0.009024,gbtree,0.02291,7,"{'booster': 'gbtree', 'learning_rate': 0.02290...",0.882979,0.810675,...,0.893706,0.894093,0.735791,0.58155,0.806743,0.627031,0.520868,0.760269,0.119998,3
1,53.864725,1.623649,0.037767,0.011843,gbtree,0.00479575,9,"{'booster': 'gbtree', 'learning_rate': 0.00479...",0.875912,0.822805,...,0.89934,0.880279,0.706331,0.577191,0.813242,0.616095,0.565008,0.751631,0.114009,4
10,64.079662,1.572281,0.042683,0.012507,gbtree,0.000165727,11,"{'booster': 'gbtree', 'learning_rate': 0.00016...",0.860172,0.817381,...,0.892679,0.87513,0.697885,0.542614,0.805259,0.61986,0.519722,0.74552,0.115467,5
13,103.609675,5.342022,0.041676,0.015322,dart,3.54542e-05,11,"{'booster': 'dart', 'learning_rate': 3.5454233...",0.857143,0.814044,...,0.887124,0.879083,0.693055,0.542614,0.808938,0.621359,0.50702,0.744235,0.116291,6
14,48.361844,7.312645,0.029272,0.014846,gbtree,4.33497e-05,9,"{'booster': 'gbtree', 'learning_rate': 4.33496...",0.864865,0.789474,...,0.83062,0.853306,0.702784,0.544386,0.811051,0.602919,0.723377,0.739806,0.103375,7
11,53.161303,1.660655,0.034522,0.01091,gbtree,0.000700963,9,"{'booster': 'gbtree', 'learning_rate': 0.00070...",0.870634,0.799541,...,0.873622,0.859269,0.707085,0.532086,0.776045,0.606785,0.533858,0.729246,0.119292,8
2,46.602757,1.24627,0.02133,0.006126,dart,0.00647376,5,"{'booster': 'dart', 'learning_rate': 0.0064737...",0.835346,0.714045,...,0.814026,0.88744,0.685131,0.498634,0.820829,0.434634,0.557089,0.704958,0.147764,9
4,17.874733,0.588752,0.017218,0.004612,gbtree,0.0168764,3,"{'booster': 'gbtree', 'learning_rate': 0.01687...",0.832827,0.672439,...,0.763048,0.867777,0.64582,0.531959,0.798623,0.491004,0.691014,0.689647,0.122094,10


# RandomForestClassifier Random CV search

In [59]:
estimator = RandomForestClassifier()
param_distributions = {
    'n_estimators': randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 7, 10, 13, 16, None],
    'min_samples_split': [2, 10, 20, 50, 100, 500, 1000],
    'min_samples_leaf': [1, 2, 4, 8, 16, 32]
}
n_iter = 25
scoring = make_scorer(f1_score)
n_jobs = -1  # use all possible cores
refit = False  # don't want to refit on the whole dataset afterwards
verbose = 2
cv = ((training_masks[i], validation_masks[i]) for i in range(len(training_masks)))

In [60]:
clf = RandomizedSearchCV(
    estimator,
    param_distributions,
    cv=cv,
    n_iter=n_iter,
    scoring=scoring,
    n_jobs=n_jobs,
    refit=refit,
    verbose=verbose
)

In [61]:
search = clf.fit(data.iloc[:, 3:], data.Label)

Fitting 15 folds for each of 25 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   30.7s


KeyboardInterrupt: 

# XGBoost cutoff analysis

In [5]:
cvr = pd.read_csv('xgboost_cv_results.csv', index_col=0)
cvr.sort_values('mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_learning_rate,param_max_depth,params,split0_test_score,split1_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
9,130.157791,4.596821,0.046471,0.012082,gbtree,5.5e-05,11,"{'booster': 'gbtree', 'learning_rate': 5.45993...",0.873916,0.847865,...,0.906433,0.908223,0.798942,0.625861,0.774995,0.736842,0.568938,0.773441,0.089838,1
11,106.54422,3.497599,0.038598,0.011275,gbtree,0.000103,9,"{'booster': 'gbtree', 'learning_rate': 0.00010...",0.877099,0.828001,...,0.887097,0.900111,0.782918,0.615385,0.815859,0.649054,0.609105,0.759521,0.091498,2
2,57.827195,1.315661,0.023987,0.006872,gbtree,0.010004,5,"{'booster': 'gbtree', 'learning_rate': 0.01000...",0.870968,0.708221,...,0.862694,0.893108,0.781667,0.608817,0.815188,0.635332,0.568038,0.739639,0.105794,3
7,105.427766,2.409746,0.029812,0.009498,dart,9.8e-05,7,"{'booster': 'dart', 'learning_rate': 9.7752501...",0.896417,0.805088,...,0.869712,0.886669,0.773984,0.595556,0.771429,0.644856,0.54902,0.73776,0.106544,4
1,81.375426,2.800545,0.027003,0.007098,gbtree,9e-06,7,"{'booster': 'gbtree', 'learning_rate': 9.33257...",0.892927,0.799109,...,0.869712,0.886241,0.773984,0.595556,0.77403,0.644856,0.548552,0.736368,0.105799,5


In [6]:
cvr2 = pd.read_csv('xgboost_cv_results_topfeats.csv', index_col=0)
cvr2.sort_values('mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_learning_rate,param_max_depth,params,split0_test_score,split1_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
5,28.989912,0.777379,0.027051,0.007716,gbtree,0.095123,5,"{'booster': 'gbtree', 'learning_rate': 0.09512...",0.90178,0.894281,...,0.913772,0.906854,0.75834,0.601783,0.797901,0.675041,0.549296,0.785832,0.113698,1
0,55.253169,1.908233,0.039049,0.011232,gbtree,0.011409,9,"{'booster': 'gbtree', 'learning_rate': 0.01140...",0.886945,0.850673,...,0.902429,0.883607,0.732016,0.592488,0.816368,0.633138,0.545301,0.76394,0.113126,2
7,41.492019,1.127223,0.030755,0.009024,gbtree,0.02291,7,"{'booster': 'gbtree', 'learning_rate': 0.02290...",0.882979,0.810675,...,0.893706,0.894093,0.735791,0.58155,0.806743,0.627031,0.520868,0.760269,0.119998,3
1,53.864725,1.623649,0.037767,0.011843,gbtree,0.004796,9,"{'booster': 'gbtree', 'learning_rate': 0.00479...",0.875912,0.822805,...,0.89934,0.880279,0.706331,0.577191,0.813242,0.616095,0.565008,0.751631,0.114009,4
10,64.079662,1.572281,0.042683,0.012507,gbtree,0.000166,11,"{'booster': 'gbtree', 'learning_rate': 0.00016...",0.860172,0.817381,...,0.892679,0.87513,0.697885,0.542614,0.805259,0.61986,0.519722,0.74552,0.115467,5
