In [29]:
# needed librairies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, precision_recall_curve
from sklearn.metrics import mean_squared_error


**Loading Data**

In [30]:
data = pd.read_csv('HIGGS_train.csv', dtype={'8': float, '21': float})

# define the headers of the dataset
column_names = ['class_label', 'lepton_pt', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi',
                'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_btag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_btag',
                'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_btag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_btag',
                'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

# assign the headers to the data
data.columns=column_names

cleaned_data = data.copy();

  data = pd.read_csv('HIGGS_train.csv', dtype={'8': float, '21': float})


**Exploring Data**

In [31]:
# check the first few rows of the data
print("The first few rows:\n")
print(data.head())
print("\n")

# retrive the information of the data
print("Data Information:\n")
print(data.info())
print("\n")

# retrive basic statistics about the data
print("Data Statistics:\n")
print(data.describe())
print("\n")

# retrive the shape of the data
print("Data Shape:\n")
print(data.shape)
print("\n")

The first few rows:

   class_label  lepton_pt  lepton_eta  lepton_phi  missing_energy_magnitude   
0          1.0      0.908       0.329     0.35900                     1.500  \
1          1.0      0.799       1.470    -1.64000                     0.454   
2          0.0      1.340      -0.877     0.93600                     1.990   
3          1.0      1.110       0.321     1.52000                     0.883   
4          0.0      1.600      -0.608     0.00707                     1.820   

   missing_energy_phi  jet_1_pt  jet_1_eta jet_1_phi  jet_1_btag  ...   
0              -0.313     1.100     -0.558     -1.59        2.17  ...  \
1               0.426     1.100      1.280      1.38        0.00  ...   
2               0.882     1.790     -1.650    -0.942        0.00  ...   
3              -1.210     0.681     -1.070    -0.922        0.00  ...   
4              -0.112     0.848     -0.566      1.58        2.17  ...   

   jet_4_eta  jet_4_phi  jet_4_btag   m_jj  m_jjj   m_lv  m_jlv  

In [32]:
# the columns 8 and 21 are of type objects: string
# the values of these columns will be checked, 
# looking for unexpected values that lead to having mixed data types

# column 8
print("Column 8: ")
print(cleaned_data['jet_1_phi'].unique())
# result: 
  # 1. float64 numeric values in string objects
  # 2. float64 numeric values in string objects and stored in a string object

# solution:
cleaned_data['jet_1_phi']=pd.to_numeric(cleaned_data['jet_1_phi'],errors='coerce')
print(cleaned_data['jet_1_phi'].unique().size)

# column 21
print("Column 21: ")
print(cleaned_data['jet_4_btag'].unique())
# result:
  # 1. float64 numeric values in string objects instead of float64 type
  # 2. alphabetical values in string objects

#solution:
cleaned_data['jet_4_btag']=pd.to_numeric(cleaned_data['jet_4_btag'], errors='coerce')
print(cleaned_data['jet_4_btag'].unique().size)


Column 8: 
[-1.59 1.38 -0.942 ... '3.50E-01' '-6.59E-03' '"1.01"']
2312
Column 21: 
[0.0 3.1 1.55 '0.00E+00' '3.10E+00' '1.55E+00' 'error' 's']
4


**Data Cleaning**

In [33]:
# remove the training examples with NaN values from the dataset
cleaned_data.dropna(inplace=True)

# testing the realtime

# the cleaned dataset will be saved to a new CSV file
cleaned_data.to_csv('HIGGS_train_cleaned.csv', index=False)

For **regression and decision trees**, we will use the complete set of features (low-level and high-level combined) to take advantage of the manually constructed high-level features.

For **neural networks**, since we will be using a deep-learning technique, we will use either the low-level features or the complete set of features.

According to the paper, deep-learning techniques show nearly equivalent performance using these two feature sets and can automatically discover the insight contained in the high-level features even if you only use the low-level features.

**Usage of the complete set of features**

**test_size** determines the proportion of the data that will be allocated for the testing set. In this case, test_size=0.2 means that **20%** of the data will be used **for testing**, and the remaining **80%** will be used for **training**.

**random_state** is an optional parameter that sets the random seed used by the random number generator. This **ensures** that the **random splitting of the data is reproducible**, meaning that if you run the same code multiple times with the same random_state value, you will get the same split of data into training and testing sets.

In [None]:
# Split the dataset into training and testing sets
X = cleaned_data.iloc[:, 1:]  # Select all columns except the first one as features
y = cleaned_data.iloc[:, 0]   # Select the first column as the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Linear Regression Model**

In [None]:
# Create a linear regression model and fit it to the training set
model = LinearRegression()
model.fit(X_train, y_train)

# Use the model to make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error: {:.2f}'.format(mse))

**Logistic Regression**

In [None]:
# Create a logistic regression model and fit it to the training set
model = LogisticRegression()
model.fit(X_train, y_train)

# Use the model to make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's accuracy
accuracy_logistic = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy_logistic))

**Decision Trees**

In [None]:
# Create a decision tree classifier and fit it to the training set
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the classifier's accuracy on the test set
accuracy_decisionTrees = clf.score(X_test, y_test)
print('Accuracy: {:.2f}'.format(accuracy_decisionTrees))