In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
#Used to save and load models. Speculated to be faster for large models in
#https://machinelearningmastery.com/save-gradient-boosting-models-xgboost-python/
import joblib
#XGBoost itself
import xgboost as xgb

Note: classes and tools from sklearn are imported in the cells where they are first needed.

In [None]:
from IML2018_tools import *

In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

In [None]:
# List the current working directory, e.g. to check which input files are available.
!ls

In [None]:
# Name of the training input file; it is passed to loadInputAsDF in the next code cell.
# The commented-out line is an alternative input file (judging by its name a
# smaller sample -- TODO confirm).
#train_file_name = 'train10000.npy'
train_file_name = 'train_full_Nhardest5.pickle'

Read in the training file; `loadInputAsDF` takes care of the different input file formats.

In [None]:
# Load the training sample. loadInputAsDF is not defined in this notebook
# (presumably it comes from the IML2018_tools star import).
# n = None presumably means "read all entries" -- TODO confirm against the helper.
train_df = loadInputAsDF(train_file_name, n = None)

In [None]:
# Display the dimensions of the loaded sample.
train_df.shape

In [None]:
# Print the .info() summary of the loaded sample (assuming train_df is a pandas
# DataFrame, this lists the columns with their dtypes and non-null counts).
train_df.info()

In [None]:
# Heatmap of the pairwise correlations between the columns of the loaded sample.
sns.heatmap(data=train_df.corr())

## EDA

In [None]:
# Plot the distribution of the 'recojet_pt' column.
# NOTE(review): distplot is deprecated in recent seaborn releases -- check the
# installed seaborn version before re-running this cell.
sns.distplot(train_df['recojet_pt'])

In [None]:
#for var in ['recojet_pt', 'recojet_eta', 'recojet_phi', 'recojet_m',
#       'recojet_sd_pt', 'recojet_sd_eta', 'recojet_sd_phi', 'recojet_sd_m',
#       'n_constituents']:
#    print(var)
#    sns.jointplot(x='genjet_sd_m', y=var, data=train_df, kind='hex')

### Feature engineering (done in a dedicated notebook now) and drop some columns

In [None]:
# dropColumns is not defined in this notebook (presumably from IML2018_tools).
# NOTE(review): the return value is not assigned, so the helper presumably
# modifies train_df in place -- TODO confirm.
dropColumns(train_df, printColumns=True)

### Split and normalise

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 'genjet_sd_m' is the regression target; every remaining column is a feature.
# 30% of the rows are held out as the test sample, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(labels='genjet_sd_m', axis=1),
    train_df['genjet_sd_m'],
    test_size=0.30,
    random_state=314,
)

## Optional: run the next two cells only if you want to keep just the jets between 5 and 7 TeV in the test sample (both cells are commented out by default)

In [None]:
#Xtest5to7 = (X_test['recojet_pt'] > 5000) & (X_test['recojet_pt'] < 7000)

In [None]:
#X_test = X_test[Xtest5to7]
#y_test = y_test[Xtest5to7]

In [None]:
# Heatmap of the pairwise correlations between the training features.
sns.heatmap(data=X_train.corr())

In [None]:
from sklearn.decomposition import PCA

# Fit the PCA on the training features only, then project both the training and
# the test features with that same fitted transformation.
pca_trans = PCA().fit(X_train)
X_train_pca, X_test_pca = (pca_trans.transform(part) for part in (X_train, X_test))

In [None]:
# Heatmap of the pairwise correlations between the PCA components of the test sample.
sns.heatmap(data=pd.DataFrame(data=X_test_pca).corr())

# XGBoost regressor

## Build the XGBoost model and define the metric

In [None]:
# Preliminary hyper-parameters; they are fine-tuned by the grid searches below.
# Note that 'random_state' and 'seed' are both set, to the same value.
xgb_params = dict(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    silent=1,
    random_state=314,
    seed=314,
    n_jobs=4,
)

In [None]:
# Base model built from the preliminary parameters. It is named `clf` although it
# is a regressor; the grid-search cells below update this same object through
# clf.set_params(...).
clf = xgb.XGBRegressor(**xgb_params)

## Compare feature importances and extract the optimal number of trees

In [None]:
# modelfit is not defined in this notebook (presumably from IML2018_tools).
# NOTE(review): whether it updates clf in place (e.g. its number of trees) cannot
# be seen from here -- TODO confirm against the helper.
modelfit(clf, X_train, y_train, early_stopping_rounds=10)

## GridSearch to determine the optimal parameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import make_scorer

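The next cell is CPU-intensive: the grid has 3·2·3·3·3·3·3 = 1458 parameter combinations, each evaluated with 5-fold cross-validation (7290 fits). Do not run it on the full dataset!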

In [None]:
# Exhaustive search over the Cartesian product of all values listed below:
# 3*2*3*3*3*3*3 = 1458 parameter combinations, each evaluated with 5-fold CV.
param_test1 = {'max_depth': [3,5,7],
               'min_child_weight': [1,3],
               'gamma': [0,1e-3,1e-1],
               'subsample': [0.6,0.8,1],
               'colsample_bytree':[0.6,0.8,1],
               'reg_alpha':[0, 1e-3, 1e-1],
               'reg_lambda':[1, 1e-1, 1e-3]}
# greater_is_better=False marks evaluate_loss as a quantity to minimise; sklearn
# then sign-flips it, so the reported scores are the negated loss.
gs1 = GridSearchCV(estimator=clf, param_grid=param_test1, 
                   scoring=make_scorer(evaluate_loss, greater_is_better=False),
                   n_jobs=4, cv=5)
gs1.fit(X_train, y_train)
print(gs1.best_params_)
print(gs1.best_score_)
# grid_scores_ was deprecated in scikit-learn 0.18 and removed in 0.20, so the
# per-candidate summary (parameters, mean and std of the test score) is read
# from cv_results_ instead.
print(pd.DataFrame(gs1.cv_results_)[['params', 'mean_test_score', 'std_test_score']])

In [None]:
# Display the full cross-validation record of the most recent gs1 search.
gs1.cv_results_

In [None]:
# Accumulator for the cv_results_ of each stage of the staged grid search below.
# Re-run this cell before re-running the search loop; otherwise the loop keeps
# appending to the results of the previous run.
cv_results=[]

In [None]:
# Staged search: rather than scanning the full Cartesian product, each group of
# parameters is tuned in turn while the others stay fixed at their current values.
param_test_list = [
    {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3]},
    {'gamma': [0, 1e-3, 1e-1]},
    {'subsample': [0.6, 0.8, 1], 'colsample_bytree': [0.6, 0.8, 1]},
    {'reg_alpha': [0, 1e-3, 1e-1], 'reg_lambda': [1, 1e-1, 1e-3]},
]
for param_test in param_test_list:
    gs1 = GridSearchCV(
        estimator=clf,
        param_grid=param_test,
        scoring=make_scorer(evaluate_loss, greater_is_better=False),
        n_jobs=4,
        cv=5,
        verbose=True,
    )
    gs1.fit(X_train, y_train)
    print(gs1.best_params_)
    print(gs1.best_score_)
    # Keep the full CV record of this stage for later inspection.
    cv_results.append(gs1.cv_results_)
    # Carry the best values of this stage over to the next stage.
    clf.set_params(**gs1.best_params_)

## Save the optimised clf object

In [None]:
#joblib.dump(gs1, "gs1_2it.joblib.dat")

In [None]:
# Snapshot of the parameters of the best estimator found by the most recent gs1
# search; the following search cells reset clf to this snapshot before tuning.
xgb_opt_ref1 = gs1.best_estimator_.get_params()
# Display the snapshot.
xgb_opt_ref1

## Second iteration of the grid search

In [None]:
# Second staged pass over a different set of candidate values, using 3-fold CV.
param_test_list = [
    {'max_depth': [7, 9, 11], 'min_child_weight': [1, 3]},
    {'gamma': [0, 1e-3, 1e-1]},
    {'reg_alpha': [0, 1e-5], 'reg_lambda': [1, 0.5, 2]},
]
# Start from the parameter snapshot stored in xgb_opt_ref1.
clf.set_params(**xgb_opt_ref1)
for param_test in param_test_list:
    gs1 = GridSearchCV(
        estimator=clf,
        param_grid=param_test,
        scoring=make_scorer(evaluate_loss, greater_is_better=False),
        n_jobs=4,
        cv=3,
        verbose=True,
    )
    gs1.fit(X_train, y_train)
    print(gs1.best_params_)
    print(gs1.best_score_)
    # Carry the best values of this stage over to the next stage.
    clf.set_params(**gs1.best_params_)

## Grid search on the PCA-transformed features

In [None]:
# Staged search fitted on the PCA-transformed training features, using 3-fold CV.
param_test_list = [
    {'max_depth': [5, 7, 9, 11]},
    {'min_child_weight': [1, 3]},
    {'gamma': [0, 1e-3, 1e-1]},
    {'reg_alpha': [0, 1e-5], 'reg_lambda': [1]},
]
# Reset clf to the parameter snapshot stored in xgb_opt_ref1 before tuning.
clf.set_params(**xgb_opt_ref1)
for param_test in param_test_list:
    gs2 = GridSearchCV(
        estimator=clf,
        param_grid=param_test,
        scoring=make_scorer(evaluate_loss, greater_is_better=False),
        n_jobs=4,
        cv=3,
        verbose=True,
    )
    gs2.fit(X_train_pca, y_train)
    print(gs2.best_params_)
    print(gs2.best_score_)
    # Carry the best values of this stage over to the next stage.
    clf.set_params(**gs2.best_params_)

In [None]:
#joblib.dump(gs2, "gs2.joblib.dat")