In [None]:
# required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
import xgboost as xgb

**Loading Data**

In [None]:
# Grant access to Google Drive (Colab only); force_remount refreshes an existing mount
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

# Headers of the dataset: the class label, followed by 21 low-level features
# (lepton, missing energy, jets 1-4) and 7 high-level features (the m_* masses)
column_names = ['class_label', 'lepton_pt', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi',
                'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_btag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_btag',
                'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_btag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_btag',
                'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

# Load the dataset located in a folder named "ML" in the root of your Google Drive.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type inference on columns
# 8 and 21. The earlier dtype={'8': float, '21': float} mapping was dropped:
# its string keys match no column label, so it had no effect (the cleaning
# step below still has to convert those two columns from object to numeric).
# NOTE(review): read_csv consumes the first line of the file as a header row.
# If HIGGS_train.csv has no header line, the first sample is silently lost;
# in that case pass header=None, names=column_names instead -- confirm
# against the actual file.
data = pd.read_csv('/content/drive/MyDrive/ML/HIGGS_train.csv', low_memory=False)

# Assign the headers to the data
data.columns = column_names

# Work on a copy so the raw frame stays available for inspection
cleaned_data = data.copy()

**Exploring Data**

In [None]:
# Check the first few rows of the data
print("The first few rows:\n")
print(data.head())
print("\n")

# Retrieve the information of the data (column names, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Data Information:\n")
data.info()
print("\n")

# Retrieve basic statistics about the data
print("Data Statistics:\n")
print(data.describe())
print("\n")

# Retrieve the shape of the data as (rows, columns)
print("Data Shape:\n")
print(data.shape)
print("\n")

**Data Cleaning**

In [None]:
# Columns 8 and 21 were loaded with dtype object (strings). Their unique
# values are printed to look for the unexpected entries that cause the
# mixed data types.
#
# Findings from inspecting the unique values:
#   column 8  (jet_1_phi):
#     1. float64 numeric values in string objects
#     2. float64 numeric values in string objects and stored in a string object
#   column 21 (jet_4_btag):
#     1. float64 numeric values in string objects instead of float64 type
#     2. alphabetical values in string objects
#
# Solution for both columns: convert to numeric with errors='coerce', which
# turns every entry that cannot be parsed as a number into NaN. The number of
# distinct values after conversion is printed as a quick sanity check.
for label, column in (("Column 8: ", 'jet_1_phi'), ("Column 21: ", 'jet_4_btag')):
    print(label)
    print(cleaned_data[column].unique())
    cleaned_data[column] = pd.to_numeric(cleaned_data[column], errors='coerce')
    print(cleaned_data[column].unique().size)


In [None]:
# Remove the training examples with NaN values from the dataset.
# Reassigning (rather than inplace=True) keeps the step explicit about which
# frame it produces.
cleaned_data = cleaned_data.dropna()

# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(cleaned_data.info()) would append a stray "None" line.
cleaned_data.info()


For **regression and decision trees**, we will use the complete set of features (low-level and high-level combined) to take advantage of the manually constructed high-level features.

**test_size** determines the proportion of the data that will be allocated for the testing set. In this case, test_size=0.2 means that **20%** of the data will be used **for testing**, and the remaining **80%** will be used for **training**.

**random_state** is an optional parameter that sets the random seed used by the random number generator. This **ensures** that the **random splitting of the data is reproducible**, meaning that if you run the same code multiple times with the same random_state value, you will get the same split of data into training and testing sets.

**Usage of all Features: Low-Level and High-Level**

In [None]:
# Target variable: the first column of the cleaned frame
y = cleaned_data.iloc[:, 0]

# Features: every remaining column (low-level and high-level combined)
X = cleaned_data.iloc[:, 1:]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


**Usage of High-Level Features**

In [None]:
# # Select the last 7 columns as the high-level features
# X = cleaned_data.iloc[:, 22:]

# # Select the first column as the target variable
# y = cleaned_data.iloc[:, 0]

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Linear Regression Model**

In [None]:
# Fit an ordinary least-squares model on the training set
# (fit() returns the estimator itself, so construction and fitting are chained)
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Report the mean squared error between the true labels and the predictions
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error: {:.2f}'.format(mse))

**Logistic Regression**

In [None]:
# Fit a logistic regression classifier (default settings) on the training set
# (fit() returns the estimator itself, so construction and fitting are chained)
model = LogisticRegression().fit(X_train, y_train)

# Predict class labels for the held-out test set
y_pred = model.predict(X_test)

# Report the fraction of test samples that were classified correctly
accuracy_logistic = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy_logistic))

**Decision Trees**

In [None]:
# Fit a decision tree classifier on the training set; the fixed seed keeps
# the tree construction reproducible
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# score() returns the mean accuracy of the classifier on the given test data
accuracy_decisionTrees = clf.score(X_test, y_test)
print('Accuracy: {:.2f}'.format(accuracy_decisionTrees))

**XGBoost**

**n_estimators:** This parameter sets the number of trees to be built in the ensemble. Increasing the number of trees can improve the model's accuracy, but it can also make the model slower to train and predict.

**max_depth:** This parameter sets the maximum depth of each tree in the ensemble. Increasing the maximum depth can improve the model's accuracy, but it can also make the model more prone to overfitting.

**learning_rate:** This parameter sets the step-size shrinkage applied to each tree's contribution, which helps prevent overfitting. Lower values make learning slower (more trees may be needed to reach the same accuracy), but they also make the model more stable and less prone to overfitting.

In [None]:
# Hyperparameters of the boosted ensemble: number of trees, maximum depth of
# each tree, and the step-size shrinkage
xgb_params = {'n_estimators': 150, 'max_depth': 7, 'learning_rate': 0.5}

# Build the XGB classifier and train it on the training data
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Predict class labels for the testing data
y_pred = xgb_model.predict(X_test)

# Report the fraction of test samples that were classified correctly
accuracy_xgb = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy_xgb))

**Neural Network**

According to the paper, for the *Higgs boson benchmarks*, we can use a neural network to classify the data by training it on a set of input features and corresponding labels. For the Higgs boson benchmark, we can use *either* the **low-level features**, the **high-level features**, **or the complete set of features** (low-level and high-level combined) as **inputs** to the neural network.

The paper mentions that standard techniques in high-energy physics data analyses include **feed-forward neural networks with a single hidden layer**, which is an example of a *traditional shallow network*.

However, recent advances in deep-learning techniques may lift the limitations of shallow networks by automatically discovering powerful nonlinear feature combinations and providing better discrimination power than current classifiers.

To train a neural network, we need to choose its architecture (number of layers and units per layer), activation function, and other hyperparameters such as learning rate.

**The Rectified Linear Unit (ReLU)** activation function is a commonly used activation function in neural networks. It is a simple function that returns the input if it is positive, and zero otherwise. In mathematical notation, the ReLU function is defined as:

f(x) = max(0, x)

where x is the input to the function and f(x) is the output.

The ReLU activation function is known for its **simplicity and effectiveness** in training deep neural networks. It has been shown to **improve the speed and accuracy of the training process**, as well as help to prevent the vanishing gradient problem that can occur with other activation functions such as the sigmoid function.

**Momentum** is a technique used in the *optimization of gradient descent* algorithms to *accelerate the convergence* of the training process.
**Momentum** adds a fraction of the previous update vector to the current update vector, which helps to smooth the optimization trajectory and avoid getting stuck in local minima.

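**Validation split** sets aside part of the training data as a validation set; the model is not fitted on it, so it can be used to monitor performance after each epoch and to pick the best checkpoint.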

**Stochastic gradient descent (SGD)** is an optimization algorithm used to minimize the loss function during training. In contrast to batch gradient descent, which computes the gradient of the loss function using the entire dataset, SGD updates the model parameters using only a small subset of the data at a time. This makes it computationally efficient and able to handle large datasets.

In [None]:
# Training hyperparameters
batch = 32        # mini-batch size
act = 'relu'      # activation of the hidden layers
nb_epoch = 50     # number of passes over the training data

# Seed TensorFlow so that weight initialisation and shuffling are repeatable
tf.random.set_seed(42)

# Hold out 10% of the *training* data for validation. The checkpoint below
# selects the best epoch on this validation set; using the test set for that
# selection (as validation_data) would leak test information into the model
# choice and make the final test accuracy optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Define the neural network architecture: four ReLU hidden layers of shrinking
# width and a single sigmoid output unit for binary classification.
# The input shape is declared with tf.keras.Input rather than the deprecated
# input_dim argument of the first Dense layer.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(100, activation=act, kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.1)),
    tf.keras.layers.Dense(75, activation=act, kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05)),
    tf.keras.layers.Dense(50, activation=act, kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05)),
    tf.keras.layers.Dense(25, activation=act, kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05)),
    tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.001))
])

# Compile the model: SGD with momentum, binary cross-entropy loss
opt = tf.keras.optimizers.SGD(learning_rate=0.003, momentum=0.9)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

# Callback that keeps only the model with the highest validation accuracy.
# Taken from tf.keras so the callback comes from the same Keras implementation
# as the model itself.
checkpoint = tf.keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, mode='max')

# Train on the fitting subset and monitor the held-out validation subset
history = model.fit(X_fit, y_fit, validation_data=(X_val, y_val), batch_size=batch, epochs=nb_epoch, verbose=1, callbacks=[checkpoint])

# Load the weights of the best model (by validation accuracy)
model.load_weights('best_model.h5')

# Evaluate the model on the untouched test set.
# predict() returns an (n_samples, 1) array of probabilities; flatten it and
# threshold at 0.5 in one vectorised step to obtain binary predictions.
y_prob = model.predict(X_test)
y_pred = (y_prob.ravel() >= 0.5).astype(int)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}'.format(accuracy))