In [1]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots
import shap

import events_package.utils as utils
from events_package.Experiment import Experiment
from events_package.config import FIVE_LAYERS
from events_package.input_getters import get_Y_1, get_X_1, get_X_2

In [2]:
# Display the version string of the Experiment class this notebook was run with,
# so the results below can be tied to a specific events_package release.
Experiment.__version__

'5.0'

# 1. Importing Single Particle Data

## 1.1 Electrons

In [3]:
# Read the electron sample from parquet and hand it straight to Experiment, so
# no separate reference to the raw DataFrame lingers in the notebook namespace.
# NOTE(review): the path is absolute and specific to one machine.
electrons = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\Electron\Parquet\1m_electron_pq_3"
    ),
    config=FIVE_LAYERS,
)
# Show the Experiment's `length` attribute for the freshly loaded data.
electrons.length

444840

In [4]:
# Run the package's standard preprocessing on the electron sample. The method
# logs each step it performs: duplicate removal, denoising, shuffling, and
# dropping events left with zero calorimeter energy.
electrons.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 434998
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 434998


## 1.2 Photons

In [5]:
# Read the photon sample from parquet and hand it straight to Experiment, so
# no separate reference to the raw DataFrame lingers in the notebook namespace.
# NOTE(review): the path is absolute and specific to one machine.
photons = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\Photon\Parquet\1m_photon_pq"
    ),
    config=FIVE_LAYERS,
)
# Show the Experiment's `length` attribute for the freshly loaded data.
photons.length

444142

In [6]:
# Run the package's standard preprocessing on the photon sample. The method
# logs each step it performs: duplicate removal, denoising, shuffling, and
# dropping events left with zero calorimeter energy.
photons.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 434870
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 434870


## 1.3 Neutral Pions

In [7]:
# Read the neutral-pion sample from parquet and hand it straight to Experiment,
# so no separate reference to the raw DataFrame lingers in the notebook namespace.
# NOTE(review): the path is absolute and specific to one machine.
pi0 = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\PiZero\Parquet\pq_pi0_2"
    ),
    config=FIVE_LAYERS,
)
# Show the Experiment's `length` attribute for the freshly loaded data.
pi0.length

412856

In [8]:
# Run the package's standard preprocessing on the neutral-pion sample. The
# method logs each step it performs: duplicate removal, denoising, shuffling,
# and dropping events left with zero calorimeter energy.
pi0.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 391483
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 391483


## 1.4 Charged Pions

In [9]:
# Read the charged-pion sample from parquet and hand it straight to Experiment,
# so no separate reference to the raw DataFrame lingers in the notebook namespace.
# NOTE(review): the path is absolute and specific to one machine.
pi_char = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\PiPlusMinus\Parquet\pq_piplusminus_2"
    ),
    config=FIVE_LAYERS,
)
# Show the Experiment's `length` attribute for the freshly loaded data.
pi_char.length

357554

In [10]:
# Run the package's standard preprocessing on the charged-pion sample. The
# method logs each step it performs: duplicate removal, denoising, shuffling,
# and dropping events left with zero calorimeter energy. Unlike the other
# samples, the logged counts show the zero-energy cut removing events here.
pi_char.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 330809
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 330803


# 2. Model With Input Features Version 1

In [11]:
# XGBoost settings passed to every `train_xgboost_model(params)` call in the
# training cells that follow (squared-error regression, RMSE as eval metric).
params = dict(
    objective="reg:squarederror",
    max_depth=6,
    learning_rate=0.18,
    colsample_bytree=0.8,
    eval_metric="rmse",
    n_estimators=600,
)

In [12]:
# Print the docstring of the version-1 input getter so the notebook records
# which features the models in this section are trained on.
print(get_X_1.__doc__)


    Creates inputs for model training.
    Inputs include center etas, differences between center etas from different layers, and all energy information.
    All energy info means that all cells are passed as inputs in consistent ordering, without making any normalisations or changes.
    This type of model has all the information necessary to make predictions, but the inputs are not optimised in any way.
    Each single input is an array of 312 values (for single particle events).
    


In [13]:
# split electron data into training and testing, next train XGBoost model
# (inputs built by get_X_1, targets by get_Y_1, test_size=0.2 of the events held out)
electrons.train_test_split(get_X=get_X_1, get_Y=get_Y_1, test_size=0.2)
# sanity check on the held-out inputs: (number of test events, number of features)
print(electrons.X_test.shape)

# train with the shared `params`; the call logs the mean squared error and its
# return value is shown as the cell output
electrons.train_xgboost_model(params)

(87000, 312)
INFO: Trained XGBoost model; mean squared error: 1399.408423379894


1399.408423379894

In [14]:
# split photon data into training and testing, next train XGBoost model
# (inputs built by get_X_1, targets by get_Y_1, test_size=0.2 of the events held out)
photons.train_test_split(get_X=get_X_1, get_Y=get_Y_1, test_size=0.2)
# sanity check on the held-out inputs: (number of test events, number of features)
print(photons.X_test.shape)

# train with the shared `params`; the call logs the mean squared error and its
# return value is shown as the cell output
photons.train_xgboost_model(params)

(86975, 312)
INFO: Trained XGBoost model; mean squared error: 1486.9185939471365


1486.9185939471365

In [15]:
# split neutral pion data into training and testing, next train XGBoost model
# (inputs built by get_X_1, targets by get_Y_1, test_size=0.2 of the events held out)
pi0.train_test_split(get_X=get_X_1, get_Y=get_Y_1, test_size=0.2)
# sanity check on the held-out inputs: (number of test events, number of features)
print(pi0.X_test.shape)

# train with the shared `params`; the call logs the mean squared error and its
# return value is shown as the cell output
pi0.train_xgboost_model(params)

(78297, 312)
INFO: Trained XGBoost model; mean squared error: 3053.5482601338167


3053.5482601338167

In [16]:
# split charged pion data into training and testing, next train XGBoost model
# (inputs built by get_X_1, targets by get_Y_1, test_size=0.2 of the events held out)
pi_char.train_test_split(get_X=get_X_1, get_Y=get_Y_1, test_size=0.2)
# sanity check on the held-out inputs: (number of test events, number of features)
print(pi_char.X_test.shape)

# train with the shared `params`; the call logs the mean squared error and its
# return value is shown as the cell output
pi_char.train_xgboost_model(params)

(66161, 312)
INFO: Trained XGBoost model; mean squared error: 9179.373302859436


9179.373302859436

# 3. Model With Input Features Version 2

In [17]:
# Print the docstring of the version-2 input getter so the notebook records
# which features the models in this section are trained on.
print(get_X_2.__doc__)


    Creates inputs for model training.
    Inputs include center etas, differences between center etas from different layers, and some energy information.
    Compared to get_X_2, this time a sum over phi axis is taken from calorimeter cell windows.
    The reasoning behind this is that the regression problem of finding z is independent of phi.
    This procedure greately decreases number of inputs, without any accuracy loss.
    Each single input is an array of 62 values (for single particle events).
    


In [18]:
# split electron data into training and testing, next train XGBoost model
# (inputs now built by get_X_2, targets by get_Y_1, test_size=0.2 of the events held out)
# NOTE(review): this is called on the same Experiment as the version-1 run, so it
# presumably replaces that split and model — confirm before reusing earlier results.
electrons.train_test_split(get_X=get_X_2, get_Y=get_Y_1, test_size=0.2)
# sanity check on the held-out inputs: (number of test events, number of features)
print(electrons.X_test.shape)

# train with the same `params` as version 1 so the two feature sets are comparable;
# the call logs the mean squared error and its return value is the cell output
electrons.train_xgboost_model(params)

(87000, 62)
INFO: Trained XGBoost model; mean squared error: 1352.5041306973285


1352.5041306973285

In [19]:
# split photon data into training and testing, next train XGBoost model
# (inputs now built by get_X_2, targets by get_Y_1, test_size=0.2 of the events held out)
# NOTE(review): this is called on the same Experiment as the version-1 run, so it
# presumably replaces that split and model — confirm before reusing earlier results.
photons.train_test_split(get_X=get_X_2, get_Y=get_Y_1, test_size=0.2)
# sanity check on the held-out inputs: (number of test events, number of features)
print(photons.X_test.shape)

# train with the same `params` as version 1 so the two feature sets are comparable;
# the call logs the mean squared error and its return value is the cell output
photons.train_xgboost_model(params)

(86975, 62)
INFO: Trained XGBoost model; mean squared error: 1436.7863637333537


1436.7863637333537

In [20]:
# split neutral pion data into training and testing, next train XGBoost model
# (inputs now built by get_X_2, targets by get_Y_1, test_size=0.2 of the events held out)
# NOTE(review): this is called on the same Experiment as the version-1 run, so it
# presumably replaces that split and model — confirm before reusing earlier results.
pi0.train_test_split(get_X=get_X_2, get_Y=get_Y_1, test_size=0.2)
# sanity check on the held-out inputs: (number of test events, number of features)
print(pi0.X_test.shape)

# train with the same `params` as version 1 so the two feature sets are comparable;
# the call logs the mean squared error and its return value is the cell output
pi0.train_xgboost_model(params)

(78297, 62)
INFO: Trained XGBoost model; mean squared error: 2988.789870055645


2988.789870055645

In [21]:
# split charged pion data into training and testing, next train XGBoost model
# (inputs now built by get_X_2, targets by get_Y_1, test_size=0.2 of the events held out)
# NOTE(review): this is called on the same Experiment as the version-1 run, so it
# presumably replaces that split and model — confirm before reusing earlier results.
pi_char.train_test_split(get_X=get_X_2, get_Y=get_Y_1, test_size=0.2)
# sanity check on the held-out inputs: (number of test events, number of features)
print(pi_char.X_test.shape)

# train with the same `params` as version 1 so the two feature sets are comparable;
# the call logs the mean squared error and its return value is the cell output
pi_char.train_xgboost_model(params)

(66161, 62)
INFO: Trained XGBoost model; mean squared error: 9007.713932503315


9007.713932503315

# 4. Hyperparameter Scan For Input Features Version 2

In [22]:
# Candidate values for each scanned hyperparameter.
max_depth_range = [6, 7]
learning_rate_range = [0.18, 0.19]
colsample_bytree_range = [0.8, 0.9, 1.0]
num_rounds_grid = [500, 600, 700, 800]

# Cartesian product of the candidates, as
# (max_depth, learning_rate, colsample_bytree, n_estimators) tuples.
# The last axis varies fastest, which fixes the row order of the scan results.
param_combinations = [
    *itertools.product(
        max_depth_range, learning_rate_range, colsample_bytree_range, num_rounds_grid
    )
]

# One XGBoost parameter dict per combination; the objective is held fixed.
param_grid = [
    dict(
        objective="reg:squarederror",
        max_depth=depth,
        learning_rate=rate,
        colsample_bytree=column_fraction,
        n_estimators=n_trees,
    )
    for depth, rate, column_fraction, n_trees in param_combinations
]


# Perform hyperparameter scan
# Each Experiment is scanned over every entry of `param_grid`; one "Trained
# XGBoost model" line is logged per entry and the returned table is kept for
# the ranking cells that follow.
# NOTE(review): no train/test split is made in this cell, so each scan presumably
# reuses whatever split the Experiment currently holds (the get_X_2 split from
# section 3 when the notebook is run top to bottom) — confirm, and avoid
# re-running section 2 cells before this one.
# NOTE(review): nodes_info=True presumably adds the "Nodes NO" column seen in
# the result tables — confirm against events_package.
electrons_hyperparams = electrons.xgboost_hyperparameter_scan(
    param_grid, nodes_info=True
)

photons_hyperparams = photons.xgboost_hyperparameter_scan(param_grid, nodes_info=True)

pi0_hyperparams = pi0.xgboost_hyperparameter_scan(param_grid, nodes_info=True)

pi_char_hyperparams = pi_char.xgboost_hyperparameter_scan(param_grid, nodes_info=True)

INFO: Trained XGBoost model; mean squared error: 1369.3177678287109
INFO: Trained XGBoost model; mean squared error: 1352.5041306973285
INFO: Trained XGBoost model; mean squared error: 1342.9511925721413
INFO: Trained XGBoost model; mean squared error: 1333.0176377128073
INFO: Trained XGBoost model; mean squared error: 1370.0152532402865
INFO: Trained XGBoost model; mean squared error: 1352.5684490170045
INFO: Trained XGBoost model; mean squared error: 1341.7616150550773
INFO: Trained XGBoost model; mean squared error: 1334.5750847765974
INFO: Trained XGBoost model; mean squared error: 1370.9426941770419
INFO: Trained XGBoost model; mean squared error: 1355.0550787893312
INFO: Trained XGBoost model; mean squared error: 1343.8513777144035
INFO: Trained XGBoost model; mean squared error: 1335.739659951696
INFO: Trained XGBoost model; mean squared error: 1373.8040377965933
INFO: Trained XGBoost model; mean squared error: 1358.181949744477
INFO: Trained XGBoost model; mean squared error: 1

In [24]:
# Rank the electron scan results by the "MSE" column, lowest (best) first.
# The row label is the index of the configuration within `param_grid`.
electrons_hyperparams.sort_values(by="MSE")

Unnamed: 0,MSE,objective,max_depth,learning_rate,colsample_bytree,n_estimators,Nodes NO
3,1333.017638,reg:squarederror,6,0.18,0.8,800,47825
7,1334.575085,reg:squarederror,6,0.18,0.9,800,47824
11,1335.73966,reg:squarederror,6,0.18,1.0,800,47871
39,1337.635752,reg:squarederror,7,0.19,0.8,800,90693
15,1339.829827,reg:squarederror,6,0.19,0.8,800,48011
31,1340.429725,reg:squarederror,7,0.18,0.9,800,91067
38,1340.626789,reg:squarederror,7,0.19,0.8,700,80003
35,1340.884862,reg:squarederror,7,0.18,1.0,800,90109
6,1341.761615,reg:squarederror,6,0.18,0.9,700,41995
2,1342.951193,reg:squarederror,6,0.18,0.8,700,42014


In [25]:
# Rank the photon scan results by the "MSE" column, lowest (best) first.
# The row label is the index of the configuration within `param_grid`.
photons_hyperparams.sort_values(by="MSE")

Unnamed: 0,MSE,objective,max_depth,learning_rate,colsample_bytree,n_estimators,Nodes NO
11,1391.292465,reg:squarederror,6,0.18,1.0,800,47359
7,1398.696472,reg:squarederror,6,0.18,0.9,800,47424
27,1398.904562,reg:squarederror,7,0.18,0.8,800,90642
19,1399.382518,reg:squarederror,6,0.19,0.9,800,47202
15,1402.699328,reg:squarederror,6,0.19,0.8,800,47538
35,1403.071541,reg:squarederror,7,0.18,1.0,800,89585
31,1404.389515,reg:squarederror,7,0.18,0.9,800,89416
26,1404.680308,reg:squarederror,7,0.18,0.8,700,79645
39,1404.714792,reg:squarederror,7,0.19,0.8,800,90643
23,1405.011497,reg:squarederror,6,0.19,1.0,800,47146


In [26]:
# Rank the neutral-pion scan results by the "MSE" column, lowest (best) first.
# The row label is the index of the configuration within `param_grid`.
pi0_hyperparams.sort_values(by="MSE")

Unnamed: 0,MSE,objective,max_depth,learning_rate,colsample_bytree,n_estimators,Nodes NO
27,2874.710473,reg:squarederror,7,0.18,0.8,800,89596
39,2882.107084,reg:squarederror,7,0.19,0.8,800,89850
26,2887.087193,reg:squarederror,7,0.18,0.8,700,78832
47,2888.409479,reg:squarederror,7,0.19,1.0,800,88848
31,2889.616198,reg:squarederror,7,0.18,0.9,800,89221
35,2890.521389,reg:squarederror,7,0.18,1.0,800,88970
38,2893.363886,reg:squarederror,7,0.19,0.8,700,78987
43,2898.066641,reg:squarederror,7,0.19,0.9,800,88740
46,2902.846788,reg:squarederror,7,0.19,1.0,700,78396
34,2903.179431,reg:squarederror,7,0.18,1.0,700,78108


In [27]:
# Rank the charged-pion scan results by the "MSE" column, lowest (best) first.
# The row label is the index of the configuration within `param_grid`.
pi_char_hyperparams.sort_values(by="MSE")

Unnamed: 0,MSE,objective,max_depth,learning_rate,colsample_bytree,n_estimators,Nodes NO
18,8959.147102,reg:squarederror,6,0.19,0.9,700,40788
10,8961.867258,reg:squarederror,6,0.18,1.0,700,40436
11,8962.586405,reg:squarederror,6,0.18,1.0,800,46276
22,8964.976521,reg:squarederror,6,0.19,1.0,700,40530
19,8966.851516,reg:squarederror,6,0.19,0.9,800,46548
23,8969.947562,reg:squarederror,6,0.19,1.0,800,46335
9,8974.537239,reg:squarederror,6,0.18,1.0,600,34844
17,8976.059672,reg:squarederror,6,0.19,0.9,600,34871
32,8976.429122,reg:squarederror,7,0.18,1.0,500,54337
33,8977.140012,reg:squarederror,7,0.18,1.0,600,65203
