In [18]:
import load_and_preprocessing_functions as lp
d, l = lp.get_pcg_data_and_labels(to_freq=1000, label_type='stai-3520')

In [19]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import cv_and_classification_functions as cvc
import numpy as np
import pandas as pd
from plotly import graph_objects as go
import plotly.io as pio

In [20]:
xgb_space = {'n_estimators': hp.quniform("n_estimators", 20,300,1),
            'eta': hp.uniform("eta", 0,1),
            'max_depth': hp.quniform("max_depth", 3, 20, 1),
            'gamma': hp.uniform ('gamma', 0,12),
            'reg_alpha' : hp.quniform('reg_alpha', 0,110,1),
            'reg_lambda' : hp.uniform('reg_lambda', 0,2),
            'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
            'min_child_weight' : hp.quniform('min_child_weight', 0, 6, 1),
            'seed': 0
    }

knn_space = {'n_neighbors': hp.quniform('n_neighbors', 1, 101, 1),
             'd': hp.quniform('d', 1, 2, 1)
            }

In [21]:
J=3
Q=12
T=2**J
sec = 10
L = int(np.floor(sec*1000/2**J))*2**J
n_splits=10
x_train_scatter, y_train_scatter, x_val_scatter, y_val_scatter =\
                cvc.make_cross_val_scatter_data(J=J, Q=Q, L=L, data=d, labels=l, n_splits=n_splits, normalize=True, feature_reduction=True, sr=1000)


The least populated class in y has only 4 members, which is less than n_splits=10.



In [29]:
from sklearn.preprocessing import StandardScaler
def objective(space, n_splits=n_splits):
    y_ts = []
    y_ps = []
    for i in range(n_splits):
        clf=xgb.XGBClassifier(n_estimators= int(space['n_estimators']), eta=space['eta'],
                    max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']), min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=space['colsample_bytree'], reg_lambda=space['reg_lambda'], tree_method='hist')
        
        scaler = StandardScaler()
        x_train_s = scaler.fit_transform(x_train_scatter[i])
        x_val_s = scaler.transform(x_val_scatter[i])
        clf.fit(x_train_s, y_train_scatter[i])
        y_true, y_pred = cvc.L_predictor(clf, L, 2**J, x_val_s, y_val_scatter[i])
        y_ts += list(y_true)
        y_ps += list(y_pred)
        
    print ('SCORE:', accuracy_score(y_ts, y_ps))

    return {'loss': -accuracy_score(y_ts, y_ps), 'status': STATUS_OK }

trials_xgb = Trials()

best_hyperparams_xgb = fmin(fn = objective,
                        space = xgb_space,
                        algo = tpe.suggest, 
                        max_evals = 100,
                        trials = trials_xgb)

SCORE:                                                 
0.8431372549019608                                     
SCORE:                                                                            
0.8470588235294118                                                                
SCORE:                                                                              
0.8411764705882353                                                                  
SCORE:                                                                               
0.8490196078431372                                                                   
SCORE:                                                                               
0.8411764705882353                                                                  
SCORE:                                                                              
0.8411764705882353                                                                  
SCORE:                                 

In [30]:
# This is a simple helper function that allows us to fill in `np.nan` when a particular
# hyperparameter is not relevant to a particular trial.
def unpack(x):
    if x:
        return x[0]
    return np.nan


# We'll first turn each trial into a series and then stack those series together as a dataframe.
trials_df_xgb = pd.DataFrame([pd.Series(t["misc"]["vals"]).apply(unpack) for t in trials_xgb])
# Then we'll add other relevant bits of information to the correct rows and perform a couple of
# mappings for convenience
trials_df_xgb["loss"] = [t["result"]["loss"] for t in trials_xgb]
trials_df_xgb["trial_number"] = trials_df_xgb.index



In [31]:
pd.to_pickle(trials_df_xgb, 'results/classifier_opt_trials/stai3030_xgb_opt_30_4_12.pkl')

In [104]:
trials_df_xgb = pd.read_pickle('results/classifier_opt_trials/stai3520_xgb_opt_10_3_12.pkl')

In [105]:
trials_df_xgb[trials_df_xgb['loss'].min() == trials_df_xgb['loss']]

Unnamed: 0,colsample_bytree,eta,gamma,max_depth,min_child_weight,n_estimators,reg_alpha,reg_lambda,loss,trial_number
37,0.858025,0.829672,2.005672,7.0,3.0,81.0,52.0,0.631727,-0.943012,37


In [95]:
# plotly express does not support contour plots so we will use `graph_objects` instead. `go.Contour
# automatically interpolates "z" values for our loss.
fig = go.Figure(
    data=go.Contour(
        z=trials_df_xgb["loss"],
        x=trials_df_xgb["reg_alpha"],
        y=trials_df_xgb["reg_lambda"],
        contours=dict(
            showlabels=True,  # show labels on contours
            labelfont=dict(size=12, color="white",),  # label font properties
        ),
        colorbar=dict(title="loss", titleside="right",),
        hovertemplate="loss: %{z}<br>reg_alpha: %{x}<br>reg_lambda: %{y}<extra></extra>",
    )
)

fig.update_layout(
    xaxis_title="reg_alpha",
    yaxis_title="reg_lambda",
    title={
        "text": "reg_lambda vs. reg_alpha | 100 evaluations",
        "xanchor": "center",
        "yanchor": "top",
        "x": 0.5,
    },
)

pio.write_image(fig, 'results/Classifier_opt/stai3030_alpha_vs_lambda.pdf', format='pdf', width=1000, height=400)

In [96]:
# plotly express does not support contour plots so we will use `graph_objects` instead. `go.Contour
# automatically interpolates "z" values for our loss.
fig = go.Figure(
    data=go.Contour(
        z=trials_df_xgb["loss"],
        x=trials_df_xgb["n_estimators"],
        y=trials_df_xgb["eta"],
        contours=dict(
            showlabels=True,  # show labels on contours
            labelfont=dict(size=12, color="white",),  # label font properties
        ),
        colorbar=dict(title="loss", titleside="right",),
        hovertemplate="loss: %{z}<br>n_estimators: %{x}<br>eta: %{y}<extra></extra>",
    )
)

fig.update_layout(
    xaxis_title="n_estimators",
    yaxis_title=r'learning rate (eta)',
    title={
        "text": r'n_estimators vs. learning rate (eta) | 100 evaluations',
        "xanchor": "center",
        "yanchor": "top",
        "x": 0.5,
    },
)
pio.write_image(fig, 'results/classifier_opt/stai3030_n_estimators_vs_eta.pdf', format='pdf', width=1000, height=400)

In [97]:
# plotly express does not support contour plots so we will use `graph_objects` instead. `go.Contour
# automatically interpolates "z" values for our loss.
fig = go.Figure(
    data=go.Contour(
        z=trials_df_xgb["loss"],
        x=trials_df_xgb["gamma"],
        y=trials_df_xgb["max_depth"],
        contours=dict(
            showlabels=True,  # show labels on contours
            labelfont=dict(size=12, color="white",),  # label font properties
        ),
        colorbar=dict(title="loss", titleside="right",),
        hovertemplate="loss: %{z}<br>gamma: %{x}<br>max_depth: %{y}<extra></extra>",
    )
)

fig.update_layout(
    xaxis_title="gamma",
    yaxis_title="max_depth",
    title={
        "text": "gamma vs. max_depth | 100 evaluations",
        "xanchor": "center",
        "yanchor": "top",
        "x": 0.5,
    },
)
pio.write_image(fig, 'results/classifier_opt/stai3030_gamma_vs_max_depth.pdf', format='pdf', width=1000, height=400)

In [98]:
# plotly express does not support contour plots so we will use `graph_objects` instead. `go.Contour
# automatically interpolates "z" values for our loss.
fig = go.Figure(
    data=go.Contour(
        z=trials_df_xgb["loss"],
        x=trials_df_xgb["colsample_bytree"],
        y=trials_df_xgb["min_child_weight"],
        contours=dict(
            showlabels=True,  # show labels on contours
            labelfont=dict(size=12, color="white",),  # label font properties
        ),
        colorbar=dict(title="loss", titleside="right",),
        hovertemplate="loss: %{z}<br>colsample_bytree: %{x}<br>min_child_weight: %{y}<extra></extra>",
    )
)

fig.update_layout(
    xaxis_title="colsample_bytree",
    yaxis_title="min_child_weight",
    title={
        "text": "colsample_bytree vs. min_child_weight | 100 evaluations",
        "xanchor": "center",
        "yanchor": "top",
        "x": 0.5,
    },
)
pio.write_image(fig, 'results/classifier_opt/stai3030_colsample_bytree_vs_min_child_weight.pdf', format='pdf', width=1000, height=400)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
def objective(space, nsplits=n_splits):
    y_ts = []
    y_ps = []
    for i in range(nsplits):
        clf=KNeighborsClassifier(
                    n_neighbors =int(space['n_neighbors']), p = int(space['d']), n_jobs=-1)

        scaler = StandardScaler()
        x_train_s = scaler.fit_transform(x_train_scatter[i])
        x_val_s = scaler.transform(x_val_scatter[i])
        clf.fit(x_train_s, y_train_scatter[i])
        y_true, y_pred = cvc.L_predictor(clf, L, 2**J, x_val_s, y_val_scatter[i])
        y_ts += list(y_true)
        y_ps += list(y_pred)
        
    print ('SCORE:', accuracy_score(y_ts, y_ps))

    return {'loss': -accuracy_score(y_ts, y_ps), 'status': STATUS_OK }

trials_knn = Trials()

best_hyperparams_knn = fmin(fn = objective,
                        space = knn_space,
                        algo = tpe.suggest,
                        max_evals = 50,
                        trials = trials_knn)

SCORE:                                                
0.8117647058823529                                    
SCORE:                                                                              
0.8117647058823529                                                                  
SCORE:                                                                              
0.8352941176470589                                                                    
SCORE:                                                                                  
0.8117647058823529                                                                      
SCORE:                                                                                  
0.8117647058823529                                                                      
SCORE:                                                                                  
0.8117647058823529                                                                      
SCORE:        

In [8]:
import pandas as pd
# This is a simple helper function that allows us to fill in `np.nan` when a particular
# hyperparameter is not relevant to a particular trial.
def unpack(x):
    if x:
        return x[0]
    return np.nan


# We'll first turn each trial into a series and then stack those series together as a dataframe.
trials_df_knn = pd.DataFrame([pd.Series(t["misc"]["vals"]).apply(unpack) for t in trials_knn])
# Then we'll add other relevant bits of information to the correct rows and perform a couple of
# mappings for convenience
trials_df_knn["loss"] = [t["result"]["loss"] for t in trials_knn]
trials_df_knn["trial_number"] = trials_df_knn.index



In [9]:
pd.to_pickle(trials_df_knn, 'results/classifier_opt_trials/stai3030_knn_opt_30_4_12.pkl')

In [99]:
trials_df_knn = pd.read_pickle('results/classifier_opt_trials/stai3520_knn_opt_10_3_12.pkl')

In [100]:
trials_df_knn[trials_df_knn['loss'].min() == trials_df_knn['loss']]

Unnamed: 0,d,n_neighbors,loss,trial_number
47,2.0,4.0,-0.907734,47


In [101]:
from plotly import graph_objects as go
# plotly express does not support contour plots so we will use `graph_objects` instead. `go.Contour
# automatically interpolates "z" values for our loss.
fig = go.Figure(
    data=go.Contour(
        z=trials_df_knn["loss"],
        x=trials_df_knn["n_neighbors"],
        y=trials_df_knn["d"],
        contours=dict(
            showlabels=True,  # show labels on contours
            labelfont=dict(size=12, color="white",),  # label font properties
        ),
        colorbar=dict(title="loss", titleside="right",),
        hovertemplate="loss: %{z}<br>n_neighbors: %{x}<br>d: %{y}<extra></extra>",
    )
)

fig.update_layout(
    xaxis_title="n_neighbors",
    yaxis_title="p",
    title={
        "text": "n_neighbors vs. p | 50 evaluations",
        "xanchor": "center",
        "yanchor": "top",
        "x": 0.5,
    },
)
pio.write_image(fig, 'results/classifier_opt/stai3520_n_neighbors_vs_p.pdf', format='pdf', width=1000, height=400)

In [16]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams_knn)

The best hyperparameters are :  

{'d': 1.0, 'n_neighbors': 18.0}
