### Import Modules

In [1]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
import pickle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, recall_score
from sklearn.metrics import classification_report,confusion_matrix,plot_confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.decomposition import PCA, TruncatedSVD
import mlflow
import mlflow.sklearn
#import mlflow.xgboost

import warnings
# Silence every Python warning for the rest of the session; note that this
# also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

### Load and Clean Data

In [2]:
# Read the training CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
df_train = pd.read_csv(filepath_or_buffer='../Data/higgs_boson_training.csv')

In [3]:
# Show (rows, columns) of the freshly loaded frame as a quick sanity check.
df_train.shape

(250000, 33)

In [4]:
# Count the rows per raw class label to see how balanced the target is.
df_train["Label"].value_counts()

b    164333
s     85667
Name: Label, dtype: int64

In [5]:
# Encode the target as integers: 'b' -> 0, 's' -> 1.
# The mapping is guarded so that re-running this cell is a no-op: pushing an
# already-encoded (numeric) column through the dict again would silently turn
# every label into NaN.
if not pd.api.types.is_numeric_dtype(df_train['Label']):
    df_train['Label'] = df_train['Label'].map({'b':0,'s':1})

# Any value outside {'b', 's'} maps to NaN; fail loudly instead of training on it.
assert df_train['Label'].notna().all(), "Label column contains unmapped values"

df_train["Label"].value_counts()

0    164333
1     85667
Name: Label, dtype: int64

In [6]:
# Partition the rows by encoded label (1 vs 0) and print how many label-1
# rows exist for every label-0 row.
s = df_train.loc[df_train['Label'].eq(1)]
b = df_train.loc[df_train['Label'].eq(0)]
outlier_fraction = s.shape[0] / float(b.shape[0])
print(outlier_fraction)

0.521301260245964


In [7]:
# Target vector and feature matrix: every column except 'Label' is a feature.
# NOTE(review): if the CSV carries identifier or per-event weight columns they
# are fed to the model as features too — TODO confirm against the file header.
y = df_train['Label']
X = df_train.drop(columns=['Label'])

### Train Test Split

In [8]:
# Hold out 30% of the rows as the test set, keeping the class proportions of
# `y` in both parts; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

In [9]:
# Carve a validation set (30%) out of the remaining training rows.
# Caution: this overwrites X_train / y_train with a subset of themselves, so
# re-running this cell on its own shrinks the training set again — re-run the
# previous split cell first.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.30,
    stratify=y_train,
    random_state=42,
)

In [10]:
# Report the feature-matrix shape of each split, one per line.
print(
    f"Train Data (X): {X_train.shape}",
    f"Test Data (X): {X_test.shape}",
    f"Validation Data (X): {X_valid.shape}",
    sep="\n",
)

Train Data (X): (122500, 32)
Test Data (X): (75000, 32)
Validation Data (X): (52500, 32)


In [11]:
# Report the target-vector shape of each split, one per line.
print(
    f"Train Data (y): {y_train.shape}",
    f"Test Data (y): {y_test.shape}",
    f"Validation Data (y): {y_valid.shape}",
    sep="\n",
)

Train Data (y): (122500,)
Test Data (y): (75000,)
Validation Data (y): (52500,)


### MLflow PROJECT SETUP

In [12]:
# Setting the MLflow tracking server
#mlflow.set_tracking_uri('http://kubernetes.docker.internal:5000') 

In [13]:
# Setting the required environment variables
#os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://mlflow-minio.local/'
#os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
#os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

### Parameters

In [19]:
params ={
    "base_score":0.5, 
    "booster":None,
    "colsample_bylevel":0.6573364597182277, 
    "colsample_bynode":None,
    "colsample_bytree":0.5102849665204783, 
    "gamma":0.0016299786974779509,
    "gpu_id":None, 
    "importance_type":'gain', 
    "interaction_constraints":None,
    "learning_rate":0.0001052843797478166, 
    "max_delta_step":0,
    "max_depth":5, 
    "min_child_weight":21, 
    "missing":None,
    "monotone_constraints":None, 
    "n_estimators":5000, 
    "n_jobs":None,
    "num_parallel_tree":None, 
    "random_state":None,
    "reg_alpha":0.004292169845548234, 
    "reg_lambda":2.788702137746418,
    "scale_pos_weight":1, 
    "seed":1, 
    "subsample":0.9383008964042696,
    "tree_method":None, 
    "validate_parameters":None, 
    "verbosity":None
}

### Running MLflow script

In [21]:
# Train the scaler + XGBoost pipeline inside a single MLflow run and record
# the hyper-parameters, hold-out metrics, input artifacts and the fitted model.
with mlflow.start_run():
    xgb_model = XGBClassifier(**params)

    # 'norm' rescales every feature to [0, 1] before it reaches the classifier 'm'.
    pipeline = Pipeline(steps=[('norm', MinMaxScaler(feature_range=(0.0, 1.0))), ('m', xgb_model)])

    # No early stopping is used here. Fit parameters routed as `m__...` are
    # handed straight to the classifier and bypass the 'norm' step, so passing
    # `m__eval_set=[(X_valid, y_valid)]` would evaluate on the *unscaled*
    # validation frame. If early stopping is ever enabled, scale X_valid with
    # the fitted scaler first.
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)

    # The classes are imbalanced, so plain accuracy alone is optimistic;
    # log class-sensitive metrics alongside it.
    accuracy = accuracy_score(y_test, prediction)
    metrics = {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_accuracy_score(y_test, prediction),
        'f1': f1_score(y_test, prediction),
        'recall': recall_score(y_test, prediction),
    }

    # Logging params and metrics to MLflow
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)

    # Logging training data
    mlflow.log_artifact(local_path = '../Data/higgs_boson_training.csv')

    # Logging training code
    mlflow.log_artifact(local_path = './higgs_boson_classification_local.ipynb')

    # Logging model to MLflow (pass `registered_model_name=` to also register it)
    mlflow.sklearn.log_model(pipeline, 'model')

### Save Model

In [16]:
# Save the fitted pipeline to disk. The target directory is created first so
# the dump does not fail on a fresh checkout where 'model/' does not exist yet.
os.makedirs('model', exist_ok=True)
joblib.dump(pipeline, 'model/higgs_boson_model.pkl')

['model/higgs_boson_model.pkl']