In [1]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots
import shap

import events_package.utils as utils
from events_package.Experiment import Experiment
from events_package.config import FIVE_LAYERS
from events_package.input_getters import get_Y_1, get_X_5

In [2]:
# Display the version string of the Experiment class in use, so the
# recorded results can be tied to a specific events_package release.
Experiment.__version__

'5.0'

In [3]:
# Hyperparameters handed to Experiment.train_xgboost_model for every
# threshold setting in the sweep below.
params = dict(
    objective="reg:squarederror",
    max_depth=6,
    learning_rate=0.18,
    colsample_bytree=0.8,
    eval_metric="rmse",
    n_estimators=600,
)

In [4]:
# Uniform sweep: every layer of the FIVE_LAYERS config gets the same noise
# threshold, stepping from 0 to 160 (inclusive) in increments of 20.
layers = FIVE_LAYERS.layers
threshold_range = range(0, 161, 20)

threshold_permutations = [
    dict.fromkeys(layers, uniform_cut) for uniform_cut in threshold_range
]

threshold_permutations

[{'psb': 0, 'emb1': 0, 'emb2': 0, 'emb3': 0, 'hab1': 0},
 {'psb': 20, 'emb1': 20, 'emb2': 20, 'emb3': 20, 'hab1': 20},
 {'psb': 40, 'emb1': 40, 'emb2': 40, 'emb3': 40, 'hab1': 40},
 {'psb': 60, 'emb1': 60, 'emb2': 60, 'emb3': 60, 'hab1': 60},
 {'psb': 80, 'emb1': 80, 'emb2': 80, 'emb3': 80, 'hab1': 80},
 {'psb': 100, 'emb1': 100, 'emb2': 100, 'emb3': 100, 'hab1': 100},
 {'psb': 120, 'emb1': 120, 'emb2': 120, 'emb3': 120, 'hab1': 120},
 {'psb': 140, 'emb1': 140, 'emb2': 140, 'emb3': 140, 'hab1': 140},
 {'psb': 160, 'emb1': 160, 'emb2': 160, 'emb3': 160, 'hab1': 160}]

In [5]:
# Non-uniform and negative threshold settings evaluated in addition to the
# uniform sweep built in the previous cell.
extra_threshold_permutations = [
    {"psb": 100, "emb1": 50, "emb2": 100, "emb3": 50, "hab1": 100},
    {"psb": 50, "emb1": 50, "emb2": 50, "emb3": 50, "hab1": 100},
    {"psb": 80, "emb1": 50, "emb2": 80, "emb3": 50, "hab1": 100},
    {"psb": 70, "emb1": 50, "emb2": 70, "emb3": 70, "hab1": 100},
    {"psb": 46, "emb1": 10, "emb2": 30, "emb3": 21, "hab1": 17},
    {"psb": -10, "emb1": -10, "emb2": -10, "emb3": -10, "hab1": -10},
    {"psb": -20, "emb1": -20, "emb2": -20, "emb3": -20, "hab1": -20},
]

# Extend the sweep idempotently: re-running this cell must not append the
# extra settings a second time (which would silently train duplicate models).
threshold_permutations = threshold_permutations + [
    extra
    for extra in extra_threshold_permutations
    if extra not in threshold_permutations
]

# Parquet inputs, one per particle sample, in the order they are combined.
# NOTE(review): these are absolute, machine-specific paths; consider moving
# them behind a configurable data directory.
DATASET_PATHS = {
    "electrons": r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\Electron\Parquet\1m_electron_pq_3",
    "photons": r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\Photon\Parquet\1m_photon_pq",
    "pi0": r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\PiZero\Parquet\pq_pi0_2",
    "pi_char": r"C:\Users\User1\Desktop\MSci_Project\Data\6_data\PiPlusMinus\Parquet\pq_piplusminus_2",
}


def _load_clean_experiment(path, thresholds):
    """Load one particle sample and clean it with the given noise thresholds.

    Parameters
    ----------
    path : str
        Location of the parquet dataset to read.
    thresholds : dict
        Per-layer noise thresholds passed to ``Experiment.set_noise_thresholds``.

    Returns
    -------
    Experiment
        The experiment after ``remove_duplicates``, ``denoisify``, a single
        ``shuffle_dataset`` pass and the ``tot_layers_et() > 0`` event filter.
    """
    sample = Experiment(pd.read_parquet(path), config=FIVE_LAYERS)

    # Thresholds must be set before denoisify() so the new values are applied.
    sample.set_noise_thresholds(thresholds)
    sample.remove_duplicates()
    sample.denoisify()
    sample.shuffle_dataset(repeats=1)

    # Mask is True where tot_layers_et() is positive; how remove_events
    # interprets the mask is defined by Experiment, not here.
    mask = sample.tot_layers_et() > 0
    sample.remove_events(mask=mask)
    return sample


MSE_results = []

# Train one model per threshold setting and record its mean squared error.
for thresholds in threshold_permutations:
    # The data is re-read from disk for every setting so each run starts from
    # the raw samples rather than from an already-denoised copy.
    electrons, photons, pi0, pi_char = (
        _load_clean_experiment(path, thresholds) for path in DATASET_PATHS.values()
    )

    # Combine the four samples, then drop the per-sample objects to free
    # memory before shuffling and training.
    experiment = electrons + photons + pi0 + pi_char
    del electrons, photons, pi0, pi_char
    experiment.shuffle_dataset(repeats=11)

    experiment.train_test_split(get_X=get_X_5, get_Y=get_Y_1, test_size=0.2)
    current_mse = experiment.train_xgboost_model(params)

    MSE_results.append({**thresholds, "Mean Squared Error": current_mse})
    del experiment


df = pd.DataFrame(MSE_results)

# Reference value: in the recorded run, the all-zero threshold setting gave a
# mean squared error of 3640.2703203573874.
df

INFO: Trained XGBoost model; mean squared error: 3640.2703203573874
INFO: Trained XGBoost model; mean squared error: 3646.260860731691
INFO: Trained XGBoost model; mean squared error: 3696.036006763415
INFO: Trained XGBoost model; mean squared error: 3714.6482954978474
INFO: Trained XGBoost model; mean squared error: 3714.5872337896135
INFO: Trained XGBoost model; mean squared error: 3744.618811289878
INFO: Trained XGBoost model; mean squared error: 3788.693036106011
INFO: Trained XGBoost model; mean squared error: 3829.029310177168
INFO: Trained XGBoost model; mean squared error: 3847.734493575052
INFO: Trained XGBoost model; mean squared error: 3742.001768528324
INFO: Trained XGBoost model; mean squared error: 3713.885751239036
INFO: Trained XGBoost model; mean squared error: 3716.322014148381
INFO: Trained XGBoost model; mean squared error: 3714.1205303163015
INFO: Trained XGBoost model; mean squared error: 3649.1019274940604
INFO: Trained XGBoost model; mean squared error: 3650.568

Unnamed: 0,psb,emb1,emb2,emb3,hab1,Mean Squared Error
0,0,0,0,0,0,3640.27032
1,20,20,20,20,20,3646.260861
2,40,40,40,40,40,3696.036007
3,60,60,60,60,60,3714.648295
4,80,80,80,80,80,3714.587234
5,100,100,100,100,100,3744.618811
6,120,120,120,120,120,3788.693036
7,140,140,140,140,140,3829.02931
8,160,160,160,160,160,3847.734494
9,100,50,100,50,100,3742.001769


In [7]:
# Hand the results table to the project's table-saving helper under the
# file name used for this noise-threshold study.
utils.save_table_df(df, filename="6.0-noise-thresholds.csv")