In [1]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots
import shap

import events_package.utils as utils
from events_package.Experiment import Experiment
from events_package.config import FIVE_LAYERS
from events_package.input_getters import get_Y_1, get_X_1, get_X_2

In [2]:
# Show which version of the Experiment class this notebook was run with.
Experiment.__version__

'5.0'

# 1. Importing Single Particle Data

## 1.1 Electrons

In [3]:
# Read the electron parquet sample and pass it straight to Experiment, so no
# separate name for the raw DataFrame lingers in the notebook namespace.
electrons = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\Electron\Parquet\1m_electron_pq_3"
    ),
    config=FIVE_LAYERS,
)
# Cell result: the `length` attribute (presumably the number of loaded events - confirm).
electrons.length

444840

In [4]:
# Run the package's standard clean-up on the electron sample. According to the
# INFO lines it logs, this removes duplicates, denoises and shuffles the data,
# and drops events left with 0 energy in the layers.
electrons.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 434998
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 434998


## 1.2 Photons

In [5]:
# Read the photon parquet sample and pass it straight to Experiment, so no
# separate name for the raw DataFrame lingers in the notebook namespace.
photons = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\Photon\Parquet\1m_photon_pq"
    ),
    config=FIVE_LAYERS,
)
# Cell result: the `length` attribute (presumably the number of loaded events - confirm).
photons.length

444142

In [6]:
# Run the package's standard clean-up on the photon sample. According to the
# INFO lines it logs, this removes duplicates, denoises and shuffles the data,
# and drops events left with 0 energy in the layers.
photons.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 434870
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 434870


## 1.3 Neutral Pions

In [7]:
# Read the neutral-pion parquet sample and pass it straight to Experiment, so no
# separate name for the raw DataFrame lingers in the notebook namespace.
pi0 = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\PiZero\Parquet\pq_pi0_2"
    ),
    config=FIVE_LAYERS,
)
# Cell result: the `length` attribute (presumably the number of loaded events - confirm).
pi0.length

412856

In [8]:
# Run the package's standard clean-up on the neutral-pion sample. According to
# the INFO lines it logs, this removes duplicates, denoises and shuffles the
# data, and drops events left with 0 energy in the layers.
pi0.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 391483
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 391483


## 1.4 Charged Pions

In [9]:
# Read the charged-pion parquet sample and pass it straight to Experiment, so no
# separate name for the raw DataFrame lingers in the notebook namespace.
pi_char = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\PiPlusMinus\Parquet\pq_piplusminus_2"
    ),
    config=FIVE_LAYERS,
)
# Cell result: the `length` attribute (presumably the number of loaded events - confirm).
pi_char.length

357554

In [10]:
# Run the package's standard clean-up on the charged-pion sample. According to
# the INFO lines it logs, this removes duplicates, denoises and shuffles the
# data, and drops events left with 0 energy in the layers.
pi_char.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 330809
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 330809


# 2. Model With Input Features Version 1

In [11]:
# Baseline XGBoost hyperparameters, passed unchanged to every
# train_xgboost_model(params) call in this notebook.
params = dict(
    objective="reg:squarederror",
    max_depth=6,
    learning_rate=0.18,
    colsample_bytree=0.8,
    eval_metric="rmse",
    n_estimators=600,
)

In [12]:
# Print the docstring of get_X_1 so the definition of input features version 1
# is recorded next to the results that use it.
print(get_X_1.__doc__)


    Creates inputs for model training.
    Inputs include center etas, differences between center etas from different layers, and all energy information.
    All energy info means that all cells are passed as inputs in consistent ordering, without making any normalisations or changes.
    This type of model has all the information necessary to make predictions, but the inputs are not optimised in any way.
    Each single input is an array of 312 values (for single particle events).
    


In [13]:
# Split the electron data into train and test sets (test_size=0.2) with
# input features version 1 (get_X_1) and targets from get_Y_1.
electrons.train_test_split(get_X=get_X_1, get_Y=get_Y_1, test_size=0.2)
print(electrons.X_test.shape)  # (test rows, features per row)

# Train an XGBoost model with the baseline `params`; the call logs the mean
# squared error and returns it as the cell result.
# NOTE(review): a value recorded here earlier (1399.408423379894) differs from
# this run's output, presumably because the split/training is not seeded - confirm.
electrons.train_xgboost_model(params)

(87000, 312)
INFO: Trained XGBoost model; mean squared error: 1396.6998859688572


1396.6998859688572

In [14]:
# Split the photon data into train and test sets (test_size=0.2) with
# input features version 1 (get_X_1) and targets from get_Y_1.
photons.train_test_split(get_X=get_X_1, get_Y=get_Y_1, test_size=0.2)
print(photons.X_test.shape)  # (test rows, features per row)

# Train an XGBoost model with the baseline `params`; the call logs the mean
# squared error and returns it as the cell result.
photons.train_xgboost_model(params)

(86975, 312)
INFO: Trained XGBoost model; mean squared error: 1492.748128110236


1492.748128110236

In [15]:
# Split the neutral-pion data into train and test sets (test_size=0.2) with
# input features version 1 (get_X_1) and targets from get_Y_1.
pi0.train_test_split(get_X=get_X_1, get_Y=get_Y_1, test_size=0.2)
print(pi0.X_test.shape)  # (test rows, features per row)

# Train an XGBoost model with the baseline `params`; the call logs the mean
# squared error and returns it as the cell result.
pi0.train_xgboost_model(params)

(78297, 312)
INFO: Trained XGBoost model; mean squared error: 3080.561237892581


3080.561237892581

In [16]:
# Split the charged-pion data into train and test sets (test_size=0.2) with
# input features version 1 (get_X_1) and targets from get_Y_1.
pi_char.train_test_split(get_X=get_X_1, get_Y=get_Y_1, test_size=0.2)
print(pi_char.X_test.shape)  # (test rows, features per row)

# Train an XGBoost model with the baseline `params`; the call logs the mean
# squared error and returns it as the cell result.
# NOTE(review): a value recorded here earlier (9179.373302859436) differs from
# this run's output, presumably because the split/training is not seeded - confirm.
pi_char.train_xgboost_model(params)

(66162, 312)
INFO: Trained XGBoost model; mean squared error: 9116.682579837692


9116.682579837692

# 3. Model With Input Features Version 2

In [17]:
# Print the docstring of get_X_2 so the definition of input features version 2
# is recorded next to the results that use it.
# NOTE(review): the printed docstring says "Compared to get_X_2" (presumably
# get_X_1 is meant) and misspells "greatly"; fix it in the package source.
print(get_X_2.__doc__)


    Creates inputs for model training.
    Inputs include center etas, differences between center etas from different layers, and some energy information.
    Compared to get_X_2, this time a sum over phi axis is taken from calorimeter cell windows.
    The reasoning behind this is that the regression problem of finding z is independent of phi.
    This procedure greately decreases number of inputs, without any accuracy loss.
    Each single input is an array of 62 values (for single particle events).
    


In [18]:
# Re-split the electron data (test_size=0.2), this time with input features
# version 2 (get_X_2); targets still come from get_Y_1.
electrons.train_test_split(get_X=get_X_2, get_Y=get_Y_1, test_size=0.2)
print(electrons.X_test.shape)  # (test rows, features per row)

# Train an XGBoost model with the baseline `params`; the call logs the mean
# squared error and returns it as the cell result.
electrons.train_xgboost_model(params)

(87000, 62)
INFO: Trained XGBoost model; mean squared error: 1336.2376481022725


1336.2376481022725

In [19]:
# Re-split the photon data (test_size=0.2), this time with input features
# version 2 (get_X_2); targets still come from get_Y_1.
photons.train_test_split(get_X=get_X_2, get_Y=get_Y_1, test_size=0.2)
print(photons.X_test.shape)  # (test rows, features per row)

# Train an XGBoost model with the baseline `params`; the call logs the mean
# squared error and returns it as the cell result.
photons.train_xgboost_model(params)

(86975, 62)
INFO: Trained XGBoost model; mean squared error: 1417.4439615546664


1417.4439615546664

In [20]:
# Re-split the neutral-pion data (test_size=0.2), this time with input features
# version 2 (get_X_2); targets still come from get_Y_1.
pi0.train_test_split(get_X=get_X_2, get_Y=get_Y_1, test_size=0.2)
print(pi0.X_test.shape)  # (test rows, features per row)

# Train an XGBoost model with the baseline `params`; the call logs the mean
# squared error and returns it as the cell result.
pi0.train_xgboost_model(params)

(78297, 62)
INFO: Trained XGBoost model; mean squared error: 2980.5483081668035


2980.5483081668035

In [21]:
# Re-split the charged-pion data (test_size=0.2), this time with input features
# version 2 (get_X_2); targets still come from get_Y_1.
pi_char.train_test_split(get_X=get_X_2, get_Y=get_Y_1, test_size=0.2)
print(pi_char.X_test.shape)  # (test rows, features per row)

# Train an XGBoost model with the baseline `params`; the call logs the mean
# squared error and returns it as the cell result.
pi_char.train_xgboost_model(params)

(66162, 62)
INFO: Trained XGBoost model; mean squared error: 8885.947316904048


8885.947316904048

# 4. Hyperparameter Scan For Input Features Version 2

In [22]:
# Candidate values for each scanned hyperparameter.
max_depth_range = [6, 7]
learning_rate_range = [0.18, 0.19]
colsample_bytree_range = [0.8, 0.9, 1.0]
num_rounds_grid = [500, 600, 700, 800]

# Cartesian product of the candidates (2 * 2 * 3 * 4 = 48 tuples); the last
# axis, the number of boosting rounds, varies fastest.
param_combinations = list(
    itertools.product(
        max_depth_range,
        learning_rate_range,
        colsample_bytree_range,
        num_rounds_grid,
    )
)

# One XGBoost parameter dict per combination, in the same order as
# param_combinations, so a dict's position identifies its combination.
param_grid = [
    dict(
        objective="reg:squarederror",
        max_depth=depth,
        learning_rate=eta,
        colsample_bytree=column_fraction,
        n_estimators=n_rounds,
    )
    for depth, eta, column_fraction, n_rounds in param_combinations
]


# Run the same grid scan for each particle type, one after another. Each call
# logs one "Trained XGBoost model" INFO line per trained model; the tables
# displayed in the following cells show the result holds, per row, the MSE,
# the parameters and a "Nodes NO" column (presumably enabled by nodes_info=True
# - confirm).
# The logged MSE for (max_depth=6, learning_rate=0.18, colsample_bytree=0.8,
# n_estimators=600) equals the Version 2 electron result above, so the scan
# evidently reuses the train/test split already stored on each Experiment.
# The calls are kept as separate statements so that results of finished scans
# stay available if a later one fails.
electrons_hyperparams = electrons.xgboost_hyperparameter_scan(
    param_grid, nodes_info=True
)

photons_hyperparams = photons.xgboost_hyperparameter_scan(param_grid, nodes_info=True)

pi0_hyperparams = pi0.xgboost_hyperparameter_scan(param_grid, nodes_info=True)

pi_char_hyperparams = pi_char.xgboost_hyperparameter_scan(param_grid, nodes_info=True)

INFO: Trained XGBoost model; mean squared error: 1352.4761014183414
INFO: Trained XGBoost model; mean squared error: 1336.2376481022725
INFO: Trained XGBoost model; mean squared error: 1325.9025317502255
INFO: Trained XGBoost model; mean squared error: 1320.3198177685297
INFO: Trained XGBoost model; mean squared error: 1351.7911469985884
INFO: Trained XGBoost model; mean squared error: 1336.7741900252968
INFO: Trained XGBoost model; mean squared error: 1330.0130813009575
INFO: Trained XGBoost model; mean squared error: 1324.3713109461487
INFO: Trained XGBoost model; mean squared error: 1349.094380702262
INFO: Trained XGBoost model; mean squared error: 1332.8962864524717
INFO: Trained XGBoost model; mean squared error: 1325.718402949834
INFO: Trained XGBoost model; mean squared error: 1320.0297822033626
INFO: Trained XGBoost model; mean squared error: 1356.7184789721396
INFO: Trained XGBoost model; mean squared error: 1341.7746681418969
INFO: Trained XGBoost model; mean squared error: 1

In [23]:
# Display the electron scan results ordered by ascending MSE (best first).
electrons_hyperparams.sort_values(by="MSE")

Unnamed: 0,MSE,objective,max_depth,learning_rate,colsample_bytree,n_estimators,Nodes NO
11,1320.029782,reg:squarederror,6,0.18,1.0,800,48721
3,1320.319818,reg:squarederror,6,0.18,0.8,800,48816
19,1322.863615,reg:squarederror,6,0.19,0.9,800,48691
23,1322.910953,reg:squarederror,6,0.19,1.0,800,48675
7,1324.371311,reg:squarederror,6,0.18,0.9,800,48841
15,1325.378931,reg:squarederror,6,0.19,0.8,800,48976
10,1325.718403,reg:squarederror,6,0.18,1.0,700,42616
2,1325.902532,reg:squarederror,6,0.18,0.8,700,42746
22,1326.590894,reg:squarederror,6,0.19,1.0,700,42605
18,1328.036957,reg:squarederror,6,0.19,0.9,700,42667


In [24]:
# Display the photon scan results ordered by ascending MSE (best first).
photons_hyperparams.sort_values(by="MSE")

Unnamed: 0,MSE,objective,max_depth,learning_rate,colsample_bytree,n_estimators,Nodes NO
11,1391.456087,reg:squarederror,6,0.18,1.0,800,48536
3,1394.895423,reg:squarederror,6,0.18,0.8,800,48571
27,1398.466366,reg:squarederror,7,0.18,0.8,800,94384
7,1398.505271,reg:squarederror,6,0.18,0.9,800,48497
19,1399.972179,reg:squarederror,6,0.19,0.9,800,48611
26,1400.173207,reg:squarederror,7,0.18,0.8,700,82705
15,1401.842966,reg:squarederror,6,0.19,0.8,800,48678
25,1402.992199,reg:squarederror,7,0.18,0.8,600,71143
10,1403.168539,reg:squarederror,6,0.18,1.0,700,42542
2,1403.196762,reg:squarederror,6,0.18,0.8,700,42625


In [25]:
# Display the neutral-pion scan results ordered by ascending MSE (best first).
pi0_hyperparams.sort_values(by="MSE")

Unnamed: 0,MSE,objective,max_depth,learning_rate,colsample_bytree,n_estimators,Nodes NO
31,2892.679394,reg:squarederror,7,0.18,0.9,800,93274
27,2897.641006,reg:squarederror,7,0.18,0.8,800,93188
43,2903.345755,reg:squarederror,7,0.19,0.9,800,93277
30,2905.565329,reg:squarederror,7,0.18,0.9,700,81843
26,2909.283377,reg:squarederror,7,0.18,0.8,700,81474
35,2910.025408,reg:squarederror,7,0.18,1.0,800,93415
39,2913.77764,reg:squarederror,7,0.19,0.8,800,93553
42,2916.102536,reg:squarederror,7,0.19,0.9,700,81629
29,2919.275644,reg:squarederror,7,0.18,0.9,600,70328
34,2920.975817,reg:squarederror,7,0.18,1.0,700,81835


In [26]:
# Display the charged-pion scan results ordered by ascending MSE (best first).
pi_char_hyperparams.sort_values(by="MSE")

Unnamed: 0,MSE,objective,max_depth,learning_rate,colsample_bytree,n_estimators,Nodes NO
1,8885.947317,reg:squarederror,6,0.18,0.8,600,36019
2,8887.299902,reg:squarederror,6,0.18,0.8,700,42106
0,8891.00772,reg:squarederror,6,0.18,0.8,500,30031
3,8892.868615,reg:squarederror,6,0.18,0.8,800,48131
14,8920.102867,reg:squarederror,6,0.19,0.8,700,42046
6,8920.971214,reg:squarederror,6,0.18,0.9,700,42141
5,8922.502803,reg:squarederror,6,0.18,0.9,600,36073
13,8929.499214,reg:squarederror,6,0.19,0.8,600,35973
15,8933.966574,reg:squarederror,6,0.19,0.8,800,48135
7,8934.119881,reg:squarederror,6,0.18,0.9,800,48175


In [31]:
# Save the MSE-sorted electron scan table through utils.save_table_df under the
# given filename (the destination directory is decided by the helper and is
# not visible here).
# NOTE(review): this cell's execution count (31) is out of sequence with the
# following cells (28-30), so it was re-run after them; re-run the notebook
# top to bottom before sharing.
utils.save_table_df(
    dataframe=electrons_hyperparams.sort_values(by="MSE"),
    filename="2.0-electrons-hyperparams.csv",
)

In [28]:
# Save the MSE-sorted photon scan table through utils.save_table_df under the
# given filename (the destination directory is decided by the helper and is
# not visible here).
utils.save_table_df(
    dataframe=photons_hyperparams.sort_values(by="MSE"),
    filename="2.0-photons-hyperparams.csv",
)

In [29]:
# Save the MSE-sorted neutral-pion scan table through utils.save_table_df under
# the given filename (the destination directory is decided by the helper and
# is not visible here).
utils.save_table_df(
    dataframe=pi0_hyperparams.sort_values(by="MSE"), filename="2.0-pi0-hyperparams.csv"
)

In [30]:
# Save the MSE-sorted charged-pion scan table through utils.save_table_df under
# the given filename (the destination directory is decided by the helper and
# is not visible here).
utils.save_table_df(
    dataframe=pi_char_hyperparams.sort_values(by="MSE"),
    filename="2.0-pi_char-hyperparams.csv",
)