In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots
import shap

import events_package.utils as utils
from events_package.Experiment import Experiment
from events_package.config import FIVE_LAYERS
from events_package.input_getters import get_Y_1, get_X_3, get_X_4, get_X_5

In [2]:
# Show which version of the project's Experiment class this notebook was run against.
Experiment.__version__

'5.0'

# 1. Importing Single Particle Data

## 1.1 Electrons

In [3]:
# Electron sample: read the Parquet data and pass the frame directly to Experiment,
# so no temporary DataFrame name lingers in the notebook namespace.
electrons = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\Electron\Parquet\1m_electron_pq_3"
    ),
    config=FIVE_LAYERS,
)
# Number of events before any cleaning.
electrons.length

444840

In [4]:
# Run the Experiment's standard cleaning pipeline. Per the log printed below, it removes
# duplicates, denoises, shuffles, and drops events with zero energy in the calorimeter layers.
electrons.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 434998
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 434998


## 1.2 Photons

In [5]:
# Photon sample: read the Parquet data and pass the frame directly to Experiment,
# so no temporary DataFrame name lingers in the notebook namespace.
photons = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\Photon\Parquet\1m_photon_pq"
    ),
    config=FIVE_LAYERS,
)
# Number of events before any cleaning.
photons.length

444142

In [6]:
# Run the Experiment's standard cleaning pipeline. Per the log printed below, it removes
# duplicates, denoises, shuffles, and drops events with zero energy in the calorimeter layers.
photons.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 434870
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 434870


## 1.3 Neutral Pions

In [7]:
# Neutral-pion sample: read the Parquet data and pass the frame directly to Experiment,
# so no temporary DataFrame name lingers in the notebook namespace.
pi0 = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\PiZero\Parquet\pq_pi0_2"
    ),
    config=FIVE_LAYERS,
)
# Number of events before any cleaning.
pi0.length

412856

In [8]:
# Run the Experiment's standard cleaning pipeline. Per the log printed below, it removes
# duplicates, denoises, shuffles, and drops events with zero energy in the calorimeter layers.
pi0.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 391483
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 391483


## 1.4 Charged Pions

In [9]:
# Charged-pion sample: read the Parquet data and pass the frame directly to Experiment,
# so no temporary DataFrame name lingers in the notebook namespace.
pi_char = Experiment(
    pd.read_parquet(
        r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\PiPlusMinus\Parquet\pq_piplusminus_2"
    ),
    config=FIVE_LAYERS,
)
# Number of events before any cleaning.
pi_char.length

357554

In [10]:
# Run the Experiment's standard cleaning pipeline. Per the log printed below, it removes
# duplicates, denoises, shuffles, and drops events with zero energy in the calorimeter layers.
pi_char.standard_procedure()

INFO: Removed duplicates
INFO: Denoisified the dataset
INFO: Shuffled dataset
INFO: Number of events after removing duplicates: 330809
INFO: Removed events with 0 energy in layers after denoisifying
INFO: Number of events after removing 0 energy (in calorimeters) events: 330809


# AutoGluon Model Training and Evaluation

In [11]:
# Tag every single-particle sample with its particle type so events can still be
# identified after the samples are merged.
for labelled_sample, particle_type in (
    (electrons, "electron"),
    (photons, "photon"),
    (pi0, "pi0"),
    (pi_char, "pi_char"),
):
    labelled_sample.add_physics_object_type(typ=particle_type)

# Merge the four samples into one combined Experiment (same left-to-right order as before).
experiment = electrons + photons + pi0 + pi_char

# Each input sample was already de-duplicated and denoised individually, so the standard
# procedure is deliberately NOT re-run here: doing so would delete some good events.
# Only the event order is shuffled.
experiment.shuffle_dataset(repeats=11)
print(experiment.length)

1592160


In [12]:
# Split the combined sample into train/test sets, holding out 20% for testing. get_X_5 and
# get_Y_1 are the project's feature/target getters (their definitions are not shown here).
experiment.train_test_split(get_X=get_X_5, get_Y=get_Y_1, test_size=0.2)
# Sanity check: the recorded output shows 22 feature columns in the test matrix.
experiment.X_test.shape

(318433, 22)

In [13]:
# Column labels for the 22 model inputs, in the order used for the DataFrame columns.
# Regular name families are generated; the irregular ones are spelled out.
# NOTE(review): the order is assumed to match what get_X_5 returns - confirm against it.
feature_names = [
    # one "<layer> eta" entry per layer
    *(f"{layer} eta" for layer in ("psb", "emb1", "emb2", "emb3", "hab1")),
    # "<layer> - <next layer> eta" entries for adjacent layers
    *(
        f"{first} - {second} eta"
        for first, second in (
            ("psb", "emb1"),
            ("emb1", "emb2"),
            ("emb2", "emb3"),
            ("emb3", "hab1"),
        )
    ),
    # psb entries
    "psb(4)",
    "psb(5) - psb(3)",
    # emb1 entries: index 8, then pairs at offsets 1..4 either side of it
    "emb1(8)",
    *(f"emb1({8 + offset}) - emb1({8 - offset})" for offset in range(1, 5)),
    # emb2 / emb3 / hab1 entries: index 4 and the pair either side of it
    *(
        name
        for layer in ("emb2", "emb3", "hab1")
        for name in (f"{layer}(4)", f"{layer}(5) - {layer}(3)")
    ),
]

In [15]:
# Wrap the train/test feature matrices in DataFrames labelled with feature_names.
X_train_df, X_test_df = (
    pd.DataFrame(features, columns=feature_names)
    for features in (experiment.X_train, experiment.X_test)
)

# Wrap the train/test targets in single-column DataFrames named "target z".
y_train_df, y_test_df = (
    pd.DataFrame({"target z": targets})
    for targets in (experiment.y_train, experiment.y_test)
)

In [16]:
# AutoGluon expects target and features in one table: put the target column first and the
# feature columns after it, side by side.
df_train = pd.concat((y_train_df, X_train_df), axis="columns")
df_test = pd.concat((y_test_df, X_test_df), axis="columns")

# Preview the assembled training table.
df_train

Unnamed: 0,target z,psb eta,emb1 eta,emb2 eta,emb3 eta,hab1 eta,psb - emb1 eta,emb1 - emb2 eta,emb2 - emb3 eta,emb3 - hab1 eta,...,emb1(9) - emb1(7),emb1(10) - emb1(6),emb1(11) - emb1(5),emb1(12) - emb1(4),emb2(4),emb2(5) - emb2(3),emb3(4),emb3(5) - emb3(3),hab1(4),hab1(5) - hab1(3)
0,-45.091316,0.037769,0.045334,0.037769,0.125896,0.050358,-0.007565,0.007565,-0.088127,0.075538,...,-0.029315,-0.044543,-0.043268,-0.015717,0.475063,0.082339,0.002248,-0.001212,0.000811,-0.000026
1,176.684330,0.541353,0.520556,0.516173,0.478405,0.453225,0.020797,0.004383,0.037769,0.025179,...,0.046334,0.012686,0.002812,0.003339,0.486293,0.009775,0.006022,0.002735,0.002877,0.001288
2,-63.061062,1.120474,1.136469,1.145653,1.133063,1.258959,-0.015995,-0.009184,0.012590,-0.125896,...,-0.000214,0.000065,-0.000709,0.000276,0.315953,0.085330,0.326566,0.121505,0.036563,-0.024485
3,-173.374100,1.170832,1.170860,1.095295,1.233780,0.956809,-0.000028,0.075565,-0.138486,0.276971,...,-0.016119,0.009665,0.000286,0.000133,0.107941,0.029455,0.014408,-0.018446,0.006411,0.000000
4,120.654420,-0.062948,-0.057840,-0.062948,-0.075538,0.050358,-0.005108,0.005108,0.012590,-0.125896,...,-0.032556,-0.005342,0.000229,-0.000055,0.603596,-0.077539,0.009042,-0.000526,0.000779,-0.001150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273723,127.737980,0.415457,0.458027,0.465815,0.377688,0.453225,-0.042570,-0.007788,0.088127,-0.075538,...,0.010701,0.003224,0.002452,0.002756,0.445937,-0.187873,0.010550,0.000100,0.033475,0.003628
1273724,7.522919,1.170832,1.161481,1.170832,1.133063,1.258959,0.009351,-0.009351,0.037769,-0.125896,...,-0.025936,-0.011580,-0.011892,-0.001611,0.265940,-0.047849,0.014482,0.007318,0.007462,-0.004059
1273725,-177.516710,0.088127,0.104736,0.113306,0.125896,0.151075,-0.016609,-0.008570,-0.012590,-0.025179,...,0.018396,-0.006275,-0.008383,-0.003583,0.386705,0.006417,0.002460,0.002405,0.002829,-0.000077
1273726,-94.686325,-0.390277,-0.386118,-0.390277,-0.377688,-0.352509,-0.004159,0.004159,-0.012590,-0.025179,...,0.037750,0.007424,0.005210,-0.000592,0.459585,0.013276,0.006277,0.003087,0.004095,-0.000278


In [17]:
# Name of the target column in df_train / df_test.
label_column = "target z"

# No `path` is passed, so AutoGluon chooses a timestamped folder under "AutogluonModels"
# (see the message recorded below this cell).
predictor = TabularPredictor(label=label_column)

No path specified. Models will be saved in: "AutogluonModels\ag-20240402_082649"


In [18]:
# Train with AutoGluon's defaults (no presets, no time limit). The recorded run reports a
# total runtime of 5214.76s, so expect this cell to be slow.
predictor.fit(train_data=df_train)

# Predict on the held-out features only (target column removed).
predictions = predictor.predict(df_test.drop(columns=label_column))

# Score on the held-out set. In the recorded output the error metrics are printed with a
# negative sign (e.g. root_mean_squared_error: -57.87).
performance = predictor.evaluate(df_test)
print(performance)

No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240402_082649"
AutoGluon Version:  1.0.0
Python Version:     3.9.15
Operating System:   Windows
Platform Machine:   AMD64


[1000]	valid_set's rmse: 63.616
[2000]	valid_set's rmse: 61.9395
[3000]	valid_set's rmse: 61.1373
[4000]	valid_set's rmse: 60.6378
[5000]	valid_set's rmse: 60.2765
[6000]	valid_set's rmse: 60.0311
[7000]	valid_set's rmse: 59.834
[8000]	valid_set's rmse: 59.6617
[9000]	valid_set's rmse: 59.5157
[10000]	valid_set's rmse: 59.3915


	-59.3892	 = Validation score   (-root_mean_squared_error)
	224.72s	 = Training   runtime
	1.44s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 62.4409
[2000]	valid_set's rmse: 61.1779
[3000]	valid_set's rmse: 60.6695
[4000]	valid_set's rmse: 60.2331
[5000]	valid_set's rmse: 59.9747
[6000]	valid_set's rmse: 59.7677
[7000]	valid_set's rmse: 59.6149
[8000]	valid_set's rmse: 59.5335
[9000]	valid_set's rmse: 59.4965
[10000]	valid_set's rmse: 59.3991


	-59.3982	 = Validation score   (-root_mean_squared_error)
	209.85s	 = Training   runtime
	1.29s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-64.4766	 = Validation score   (-root_mean_squared_error)
	1426.44s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: CatBoost ...
	-59.3255	 = Validation score   (-root_mean_squared_error)
	1164.46s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-68.2971	 = Validation score   (-root_mean_squared_error)
	309.59s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-60.3972	 = Validation score   (-root_mean_squared_error)
	646.55s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: XGBoost ...
	-59.2478	 = Validation score   (-root_mean_squared_error)
	216.69s	 = Training   runtime
	0.48s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-60.3996	 = Validation score   (-root_mean_squared_error)
	715.71s	 = Training   runtime
	0

[1000]	valid_set's rmse: 60.7801
[2000]	valid_set's rmse: 60.0487
[3000]	valid_set's rmse: 59.7223
[4000]	valid_set's rmse: 59.5201
[5000]	valid_set's rmse: 59.3718
[6000]	valid_set's rmse: 59.2744
[7000]	valid_set's rmse: 59.2222
[8000]	valid_set's rmse: 59.1775
[9000]	valid_set's rmse: 59.1106
[10000]	valid_set's rmse: 59.0737


	-59.0728	 = Validation score   (-root_mean_squared_error)
	246.35s	 = Training   runtime
	1.79s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'XGBoost': 0.316, 'LightGBMLarge': 0.266, 'NeuralNetTorch': 0.241, 'NeuralNetFastAI': 0.114, 'LightGBMXT': 0.051, 'LightGBM': 0.013}
	-58.1339	 = Validation score   (-root_mean_squared_error)
	0.36s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 5214.76s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240402_082649")


{'root_mean_squared_error': -57.87115462716966, 'mean_squared_error': -3349.07053788178, 'mean_absolute_error': -38.31708831373893, 'r2': 0.7873698805270859, 'pearsonr': 0.8878195738314227, 'median_absolute_error': -23.72511707861328}


In [19]:
# Rank every trained model on the held-out test set.
leaderboard = predictor.leaderboard(data=df_test)
print(leaderboard)

# Also show the rich table rendering of the same frame.
leaderboard

                  model  score_test   score_val              eval_metric  \
0   WeightedEnsemble_L2  -57.871155  -58.133866  root_mean_squared_error   
1         LightGBMLarge  -58.730404  -59.072841  root_mean_squared_error   
2              CatBoost  -58.932564  -59.325504  root_mean_squared_error   
3              LightGBM  -59.090490  -59.398212  root_mean_squared_error   
4            LightGBMXT  -59.174211  -59.389178  root_mean_squared_error   
5               XGBoost  -59.208393  -59.247755  root_mean_squared_error   
6        NeuralNetTorch  -59.903863  -60.399632  root_mean_squared_error   
7       NeuralNetFastAI  -60.082299  -60.397175  root_mean_squared_error   
8       RandomForestMSE  -64.240513  -64.476603  root_mean_squared_error   
9         ExtraTreesMSE  -68.020690  -68.297059  root_mean_squared_error   
10       KNeighborsDist -109.734522 -109.710246  root_mean_squared_error   
11       KNeighborsUnif -110.762539 -110.878569  root_mean_squared_error   

    pred_ti

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-57.871155,-58.133866,root_mean_squared_error,138.785624,5.13722,2260.220623,0.027658,0.0,0.360862,2,True,12
1,LightGBMLarge,-58.730404,-59.072841,root_mean_squared_error,50.214984,1.790067,246.350291,50.214984,1.790067,246.350291,1,True,11
2,CatBoost,-58.932564,-59.325504,root_mean_squared_error,0.639843,0.019101,1164.459247,0.639843,0.019101,1164.459247,1,True,6
3,LightGBM,-59.09049,-59.398212,root_mean_squared_error,34.394555,1.285251,209.847011,34.394555,1.285251,209.847011,1,True,4
4,LightGBMXT,-59.174211,-59.389178,root_mean_squared_error,38.791308,1.442738,224.715218,38.791308,1.442738,224.715218,1,True,3
5,XGBoost,-59.208393,-59.247755,root_mean_squared_error,11.426803,0.475997,216.691715,11.426803,0.475997,216.691715,1,True,9
6,NeuralNetTorch,-59.903863,-60.399632,root_mean_squared_error,1.473105,0.063981,715.709211,1.473105,0.063981,715.709211,1,True,10
7,NeuralNetFastAI,-60.082299,-60.397175,root_mean_squared_error,2.457211,0.079187,646.546315,2.457211,0.079187,646.546315,1,True,8
8,RandomForestMSE,-64.240513,-64.476603,root_mean_squared_error,3.96062,0.152567,1426.44183,3.96062,0.152567,1426.44183,1,True,5
9,ExtraTreesMSE,-68.02069,-68.297059,root_mean_squared_error,3.930281,0.154314,309.593023,3.930281,0.154314,309.593023,1,True,7


In [20]:
# Persist the leaderboard table through the project helper; where the file ends up is
# decided inside utils.save_table_df (not shown here).
utils.save_table_df(leaderboard, filename="9.0-autogluon.csv")

In [29]:
# List the input features the predictor uses; the recorded output matches feature_names.
predictor.features()

['psb eta',
 'emb1 eta',
 'emb2 eta',
 'emb3 eta',
 'hab1 eta',
 'psb - emb1 eta',
 'emb1 - emb2 eta',
 'emb2 - emb3 eta',
 'emb3 - hab1 eta',
 'psb(4)',
 'psb(5) - psb(3)',
 'emb1(8)',
 'emb1(9) - emb1(7)',
 'emb1(10) - emb1(6)',
 'emb1(11) - emb1(5)',
 'emb1(12) - emb1(4)',
 'emb2(4)',
 'emb2(5) - emb2(3)',
 'emb3(4)',
 'emb3(5) - emb3(3)',
 'hab1(4)',
 'hab1(5) - hab1(3)']

In [30]:
# List the names of all models trained by the predictor.
predictor.model_names()

['KNeighborsUnif',
 'KNeighborsDist',
 'LightGBMXT',
 'LightGBM',
 'RandomForestMSE',
 'CatBoost',
 'ExtraTreesMSE',
 'NeuralNetFastAI',
 'XGBoost',
 'NeuralNetTorch',
 'LightGBMLarge',
 'WeightedEnsemble_L2']

In [40]:
# Inspect one trained model. Select it by name instead of by a hard-coded position in
# predictor.model_names(): the position depends on which models AutoGluon fitted and in
# what order, so a fixed index can silently pick a different model on another run.
model_to_use = "XGBoost"
all_models = predictor.model_names()
# Position of the chosen model, kept for any code that still indexes into all_models.
# list.index raises ValueError if the model was not trained, instead of picking another one.
i = all_models.index(model_to_use)

# NOTE(review): `_trainer` is a private AutoGluon attribute and may change between
# releases - confirm it still exists when upgrading AutoGluon.
specific_model = predictor._trainer.load_model(model_to_use)

# Summary of the model (name, type, hyperparameters, fit time, validation score, ...).
model_info = specific_model.get_info()
model_info

{'name': 'XGBoost',
 'model_type': 'XGBoostModel',
 'problem_type': 'regression',
 'eval_metric': 'root_mean_squared_error',
 'stopping_metric': 'root_mean_squared_error',
 'fit_time': 216.69171476364136,
 'num_classes': None,
 'quantile_levels': None,
 'predict_time': 0.4759969711303711,
 'val_score': -59.247754859463306,
 'hyperparameters': {'n_estimators': 10000,
  'learning_rate': 0.1,
  'n_jobs': -1,
  'proc.max_category_levels': 100,
  'objective': 'reg:squarederror',
  'booster': 'gbtree'},
 'hyperparameters_fit': {'n_estimators': 3832},
 'hyperparameters_nondefault': [],
 'ag_args_fit': {'max_memory_usage_ratio': 1.0,
  'max_time_limit_ratio': 1.0,
  'max_time_limit': None,
  'min_time_limit': 0,
  'valid_raw_types': ['bool', 'int', 'float', 'category'],
  'valid_special_types': None,
  'ignored_type_group_special': None,
  'ignored_type_group_raw': None,
  'get_features_kwargs': None,
  'get_features_kwargs_extra': None,
  'predict_1_batch_size': None,
  'temperature_scalar': 