### Import Modules

In [2]:
from datetime import datetime
import numpy as np
import pandas as pd
import shap
import pickle
import ppscore as pps
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import shapiro, normaltest
from tpot import TPOTClassifier
from hpsklearn import HyperoptEstimator, pca, min_max_scaler, standard_scaler
from hpsklearn import xgboost_classification, random_forest, ada_boost, gradient_boosting, extra_trees
from hpsklearn import svc, svc_linear, svc_rbf, svc_poly, svc_sigmoid, liblinear_svc
from hpsklearn import any_classifier
from hpsklearn import any_preprocessing
from hyperopt import tpe, hp
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import optuna
from bayes_opt import BayesianOptimization
from skopt import BayesSearchCV
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report,confusion_matrix,plot_confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import RFE, SelectFromModel, RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif
from matplotlib import pyplot

import warnings
warnings.filterwarnings('ignore')



WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


### Load and Clean Data

In [3]:
df_train = pd.read_csv('Data/higgs_boson_training.csv')

In [4]:
df_train.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [6]:
df_train.shape

(250000, 33)

In [7]:
df_train.isnull().sum().sum()

0

In [8]:
# calculate duplicates
dups = df_train.duplicated()
#print(dups)
# report if there are any duplicates
print(dups.any())

False


In [9]:
df_train["Label"].value_counts()

b    164333
s     85667
Name: Label, dtype: int64

In [10]:
df_train['Label'] = df_train['Label'].map({'b':0,'s':1})
df_train["Label"].value_counts()

0    164333
1     85667
Name: Label, dtype: int64

In [11]:
s = df_train[df_train['Label']==1]
b = df_train[df_train['Label']==0]
outlier_fraction = len(s)/float(len(b))
print (outlier_fraction)

0.521301260245964


In [16]:
pps.matrix(df_train)

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,EventId,EventId,1.000000,predict_itself,True,,0.000000,1.000000,
1,EventId,DER_mass_MMC,0.000000,regression,True,mean absolute error,196.062021,321.063250,DecisionTreeRegressor()
2,EventId,DER_mass_transverse_met_lep,0.000000,regression,True,mean absolute error,28.247978,37.901123,DecisionTreeRegressor()
3,EventId,DER_mass_vis,0.000000,regression,True,mean absolute error,24.561323,38.002895,DecisionTreeRegressor()
4,EventId,DER_pt_h,0.000000,regression,True,mean absolute error,41.882694,59.955338,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
1084,Label,PRI_jet_subleading_eta,0.000000,regression,True,mean absolute error,286.913230,398.773803,DecisionTreeRegressor()
1085,Label,PRI_jet_subleading_phi,0.000000,regression,True,mean absolute error,286.943100,398.815250,DecisionTreeRegressor()
1086,Label,PRI_jet_all_pt,0.000000,regression,True,mean absolute error,65.235753,68.575408,DecisionTreeRegressor()
1087,Label,Weight,0.343517,regression,True,mean absolute error,1.529808,1.004294,DecisionTreeRegressor()


In [11]:
X = df_train.drop(['Label'],axis=1)
y = df_train['Label']

In [12]:
#y = LabelEncoder().fit_transform(y)

### Train Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.30, stratify=y_train, random_state=42)

In [15]:
print(f"Train Data (X): {X_train.shape}")
print(f"Test Data (X): {X_test.shape}")
print(f"Validation Data (X): {X_valid.shape}")

Train Data (X): (122500, 32)
Test Data (X): (75000, 32)
Validation Data (X): (52500, 32)


In [16]:
print(f"Train Data (y): {y_train.shape}")
print(f"Test Data (y): {y_test.shape}")
print(f"Validation Data (y): {y_valid.shape}")

Train Data (y): (122500,)
Test Data (y): (75000,)
Validation Data (y): (52500,)


### Find Best Algorithm

#### Find Best Algorithm with Best Params Using HyperoptEstimator AutoML

In [16]:
init_time = datetime.now()
print (f"Job Started at: {init_time}")
accuracy = 0
best_model = None
#for i in range(100):
    #print (i)
model = HyperoptEstimator(  classifier= any_classifier('cla'), 
                                preprocessing= any_preprocessing('pre'), 
                                algo=tpe.suggest, 
                                max_evals=50, 
                                trial_timeout=5000)
    # perform the search
model.fit(X_train, y_train)
acc = model.score(X_test, y_test)
    #if acc > accuracy:
        #accuracy = acc
        #best_model = model
        #print (accuracy)

fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))
print (f"Job Ended at: {fin_time}")

Job Started at: 2020-11-26 22:28:19.114837
100%|██████████| 1/1 [05:46<00:00, 346.25s/trial, best loss: 0.0]
100%|██████████| 2/2 [00:01<00:00,  1.09s/trial, best loss: 0.0]
100%|██████████| 3/3 [01:03<00:00, 63.66s/trial, best loss: 0.0]
100%|██████████| 4/4 [00:12<00:00, 12.31s/trial, best loss: 0.0]
100%|██████████| 5/5 [00:05<00:00,  5.93s/trial, best loss: 0.0]
100%|██████████| 6/6 [07:38<00:00, 458.12s/trial, best loss: 0.0]
100%|██████████| 7/7 [03:38<00:00, 218.94s/trial, best loss: 0.0]
100%|██████████| 8/8 [03:11<00:00, 191.30s/trial, best loss: 0.0]
100%|██████████| 9/9 [00:00<00:00,  1.72trial/s, best loss: 0.0]
100%|██████████| 10/10 [1:23:20<00:00, 5000.12s/trial, best loss: 0.0]
100%|██████████| 11/11 [09:56<00:00, 596.09s/trial, best loss: 0.0]
100%|██████████| 12/12 [16:41<00:00, 1001.33s/trial, best loss: 0.0]
100%|██████████| 13/13 [04:35<00:00, 275.07s/trial, best loss: 0.0]
100%|██████████| 14/14 [01:30<00:00, 90.63s/trial, best loss: 0.0]
100%|██████████| 15/15 [0

job exception: Input contains NaN, infinity or a value too large for dtype('float64').


 98%|█████████▊| 46/47 [00:02<?, ?trial/s, best loss=?]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [18]:
#summarize the best model
print("Accuracy: %.3f" % acc)
print(model.best_model())

{'learner': XGBClassifier(base_score=0.5, booster=None,
              colsample_bylevel=0.6573364597182277, colsample_bynode=None,
              colsample_bytree=0.5102849665204783, gamma=0.0016299786974779509,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.0001052843797478166, max_delta_step=0,
              max_depth=5, min_child_weight=21, missing=nan,
              monotone_constraints=None, n_estimators=5000, n_jobs=None,
              num_parallel_tree=None, random_state=None,
              reg_alpha=0.004292169845548234, reg_lambda=2.788702137746418,
              scale_pos_weight=1, seed=1, subsample=0.9383008964042696,
              tree_method=None, validate_parameters=None, verbosity=None), 'preprocs': (MinMaxScaler(feature_range=(0.0, 1.0)),), 'ex_preprocs': ()}


In [20]:
init_time = datetime.now()
print (f"Job Started at: {init_time}")
xgb_model = XGBClassifier(base_score=0.5, booster=None,
              colsample_bylevel=0.6573364597182277, colsample_bynode=None,
              colsample_bytree=0.5102849665204783, gamma=0.0016299786974779509,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.0001052843797478166, max_delta_step=0,
              max_depth=5, min_child_weight=21, missing=None,
              monotone_constraints=None, n_estimators=5000, n_jobs=None,
              num_parallel_tree=None, random_state=None,
              reg_alpha=0.004292169845548234, reg_lambda=2.788702137746418,
              scale_pos_weight=1, seed=1, subsample=0.9383008964042696,
              tree_method=None, validate_parameters=None, verbosity=None)

pipeline = Pipeline(steps=[('norm', MinMaxScaler(feature_range=(0.0, 1.0))), ('m', xgb_model)])

fit_params = {
    'm__early_stopping_rounds':30,  
    'm__eval_set':[(X_valid, y_valid)],
    'm__verbose':0
}

pipeline.fit(X_train, y_train)# **fit_params)
prediction = pipeline.predict(X_test)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

Job Started at: 2020-11-27 10:53:56.897099
Execution time :  0:07:29.147834


In [22]:
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49300
           1       1.00      1.00      1.00     25700

    accuracy                           1.00     75000
   macro avg       1.00      1.00      1.00     75000
weighted avg       1.00      1.00      1.00     75000



### Save Model

In [27]:
# save the model to disk
filename = 'finalized_higgs_boson_model_1.0.sav'
pickle.dump(pipeline, open(filename, 'wb'))

### Load Model

In [28]:
init_time = datetime.now()
print (f"Job Started at: {init_time}")
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
prediction = loaded_model.predict(X_test)
print(classification_report(y_test, prediction))
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

Job Started at: 2020-11-27 11:12:44.504891
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49300
           1       1.00      1.00      1.00     25700

    accuracy                           1.00     75000
   macro avg       1.00      1.00      1.00     75000
weighted avg       1.00      1.00      1.00     75000

Execution time :  0:00:06.633323
