In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from tensorflow.keras.layers import Dense
from keras import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow as tf


from sklearn.metrics import confusion_matrix,classification_report
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Column names for the header-less CSV: the label column first, then the 28 feature columns.
HIGGS_COLUMNS = [
    "Prediction",
    "lepton pT", "lepton eta", "lepton phi",
    "missing energy magnitude", "missing energy phi",
    "jet 1 pt", "jet 1 eta", "jet 1 phi", "jet 1 b-tag",
    "jet 2 pt", "jet 2 eta", "jet 2 phi", "jet 2 b-tag",
    "jet 3 pt", "jet 3 eta", "jet 3 phi", "jet 3 b-tag",
    "jet 4 pt", "jet 4 eta", "jet 4 phi", "jet 4 b-tag",
    "m jj", "m jjj", "m lv", "m jlv", "m bb", "m wbb", "m wwbb",
]

# The file contains a few non-numeric entries (the cleaning cell further down reports them).
# With the default chunked parsing pandas infers dtypes chunk by chunk, which can leave a single
# column holding a mix of floats and strings. low_memory=False parses each column in one pass so
# its dtype is inferred consistently over the whole file.
data = pd.read_csv("HIGGS_train.csv", header=None, names=HIGGS_COLUMNS, low_memory=False)

print(data)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


        Prediction  lepton pT  lepton eta  lepton phi  \
0              1.0      0.869      -0.635       0.226   
1              1.0      0.908       0.329       0.359   
2              1.0      0.799       1.470      -1.640   
3              0.0      1.340      -0.877       0.936   
4              1.0      1.110       0.321       1.520   
...            ...        ...         ...         ...   
599995         0.0      0.680       0.223      -0.757   
599996         1.0      1.610      -1.620       0.212   
599997         1.0      1.070       0.364       0.344   
599998         1.0      1.180      -0.173      -1.460   
599999         0.0      0.771      -0.133      -1.020   

        missing energy magnitude  missing energy phi  jet 1 pt  jet 1 eta  \
0                          0.327              -0.690     0.754     -0.249   
1                          1.500              -0.313     1.100     -0.558   
2                          0.454               0.426     1.100      1.280   
3      

In [3]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how="any")
print(data)

        Prediction  lepton pT  lepton eta  lepton phi  \
0              1.0      0.869      -0.635       0.226   
1              1.0      0.908       0.329       0.359   
2              1.0      0.799       1.470      -1.640   
3              0.0      1.340      -0.877       0.936   
4              1.0      1.110       0.321       1.520   
...            ...        ...         ...         ...   
599995         0.0      0.680       0.223      -0.757   
599996         1.0      1.610      -1.620       0.212   
599997         1.0      1.070       0.364       0.344   
599998         1.0      1.180      -0.173      -1.460   
599999         0.0      0.771      -0.133      -1.020   

        missing energy magnitude  missing energy phi  jet 1 pt  jet 1 eta  \
0                          0.327              -0.690     0.754     -0.249   
1                          1.500              -0.313     1.100     -0.558   
2                          0.454               0.426     1.100      1.280   
3      

In [4]:
# Keep only the first occurrence of each fully identical row.
data = data.drop_duplicates(subset=None, keep="first")

print(data)

        Prediction  lepton pT  lepton eta  lepton phi  \
0              1.0      0.869      -0.635       0.226   
1              1.0      0.908       0.329       0.359   
2              1.0      0.799       1.470      -1.640   
3              0.0      1.340      -0.877       0.936   
4              1.0      1.110       0.321       1.520   
...            ...        ...         ...         ...   
599995         0.0      0.680       0.223      -0.757   
599996         1.0      1.610      -1.620       0.212   
599997         1.0      1.070       0.364       0.344   
599998         1.0      1.180      -0.173      -1.460   
599999         0.0      0.771      -0.133      -1.020   

        missing energy magnitude  missing energy phi  jet 1 pt  jet 1 eta  \
0                          0.327              -0.690     0.754     -0.249   
1                          1.500              -0.313     1.100     -0.558   
2                          0.454               0.426     1.100      1.280   
3      

In [5]:
# Columns that were not parsed as floats (e.g. object columns that contain stray text)
non_float_columns = data.select_dtypes(exclude=['float']).columns.tolist()

# Work on an independent frame so the column replacement below never writes into a view
# of an earlier result.
data = data.copy()

# Convert those columns to numeric; anything unparseable becomes NaN.
# Plain `data[col] = ...` replaces the whole column, so its dtype really becomes numeric.
# `data.loc[:, col] = ...` instead writes into the existing object column in place on
# recent pandas versions (2.0+), which leaves the dtype as object.
for col in non_float_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Rows in which the coercion produced at least one NaN
nonfloat_rows = data[data.isnull().any(axis=1)]

# Report each offending row together with the columns that failed to parse
for index, row in nonfloat_rows.iterrows():
    bad_columns = row.index[row.isnull()].tolist()
    print(f"Row {index} contains non-float values in columns: {bad_columns}")

# Remove the offending rows (non-in-place, so re-running the cell is harmless)
data = data.drop(index=nonfloat_rows.index)

# Print the cleaned data
print(data)

Row 261026 contains non-float values in columns: ['jet 1 phi']
Row 490959 contains non-float values in columns: ['jet 4 b-tag']
Row 490960 contains non-float values in columns: ['jet 4 b-tag']
        Prediction  lepton pT  lepton eta  lepton phi  \
0              1.0      0.869      -0.635       0.226   
1              1.0      0.908       0.329       0.359   
2              1.0      0.799       1.470      -1.640   
3              0.0      1.340      -0.877       0.936   
4              1.0      1.110       0.321       1.520   
...            ...        ...         ...         ...   
599995         0.0      0.680       0.223      -0.757   
599996         1.0      1.610      -1.620       0.212   
599997         1.0      1.070       0.364       0.344   
599998         1.0      1.180      -0.173      -1.460   
599999         0.0      0.771      -0.133      -1.020   

        missing energy magnitude  missing energy phi  jet 1 pt  jet 1 eta  \
0                          0.327             

In [6]:
# Engineered features, all derived from the columns already present in `data`.

# Absolute difference between the phi columns of jet 1 and jet 2
phi_gap = data['jet 1 phi'] - data['jet 2 phi']
data['Delta phi jet 1 jet 2'] = abs(phi_gap)

# Ratios between pairs of mass columns: (new column, numerator, denominator)
ratio_specs = (
    ('m_bb_m_wwbb_ratio', 'm bb', 'm wwbb'),
    ('m_wbb_m_jlv_ratio', 'm wbb', 'm jlv'),
)
for ratio_name, numerator, denominator in ratio_specs:
    data[ratio_name] = data[numerator] / data[denominator]

# Row-wise standard deviation across three mass columns
mass_columns = ['m bb', 'm wwbb', 'm wbb']
data['m_bb_m_wwbb_std'] = data[mass_columns].std(axis=1)

print(data)

        Prediction  lepton pT  lepton eta  lepton phi  \
0              1.0      0.869      -0.635       0.226   
1              1.0      0.908       0.329       0.359   
2              1.0      0.799       1.470      -1.640   
3              0.0      1.340      -0.877       0.936   
4              1.0      1.110       0.321       1.520   
...            ...        ...         ...         ...   
599995         0.0      0.680       0.223      -0.757   
599996         1.0      1.610      -1.620       0.212   
599997         1.0      1.070       0.364       0.344   
599998         1.0      1.180      -0.173      -1.460   
599999         0.0      0.771      -0.133      -1.020   

        missing energy magnitude  missing energy phi  jet 1 pt  jet 1 eta  \
0                          0.327              -0.690     0.754     -0.249   
1                          1.500              -0.313     1.100     -0.558   
2                          0.454               0.426     1.100      1.280   
3      

In [7]:
# Bootstrap augmentation: the original rows followed by `n_bootstraps` resamples, each drawn
# with replacement and each the same size as the original frame.
#
# NOTE(review): this resampling happens before the train/test split performed later in the
# notebook, so copies of the same row can land on both sides of the split and inflate the
# reported test accuracy. Consider resampling only the training partition.
n_bootstraps = 10  # Number of bootstraps
n_samples = len(data)  # Number of samples in the original dataset

# Seeded generator so the augmented frame is identical on every Restart & Run All
bootstrap_rng = np.random.default_rng(42)

# Collect the pieces and concatenate once; concatenating inside the loop would re-copy the
# ever-growing frame on every iteration.
frames = [data]
for _ in range(n_bootstraps):
    bootstrap_indices = bootstrap_rng.integers(0, n_samples, size=n_samples)  # positions in [0, n_samples)
    frames.append(data.iloc[bootstrap_indices])

augmented_data = pd.concat(frames, ignore_index=True)

print(augmented_data)

         Prediction  lepton pT  lepton eta  lepton phi  \
0               1.0      0.869      -0.635       0.226   
1               1.0      0.908       0.329       0.359   
2               1.0      0.799       1.470      -1.640   
3               0.0      1.340      -0.877       0.936   
4               1.0      1.110       0.321       1.520   
...             ...        ...         ...         ...   
6592933         0.0      0.429      -0.695       1.640   
6592934         0.0      1.370      -1.150       1.570   
6592935         1.0      0.842       0.661       0.936   
6592936         1.0      0.279      -1.890      -1.500   
6592937         0.0      0.503      -1.350      -0.919   

         missing energy magnitude  missing energy phi  jet 1 pt  jet 1 eta  \
0                           0.327              -0.690     0.754     -0.249   
1                           1.500              -0.313     1.100     -0.558   
2                           0.454               0.426     1.100      

In [8]:
# Define a function to remove outliers using a quantile-range rule for each column separately
def remove_outliers_iqr_per_column(augmented_data, iqr_multiplier=1.5,
                                   lower_quantile=0.15, upper_quantile=0.85):
    """Return a copy of ``augmented_data`` without rows that are outliers in any column.

    For every column the ``lower_quantile`` and ``upper_quantile`` values are computed and a
    row is treated as an outlier in that column when its value lies more than
    ``iqr_multiplier`` times the distance between the two quantiles below the lower one or
    above the upper one. A row is dropped if it is an outlier in at least one column.
    NaN values compare as False and are therefore never flagged.

    Note that the defaults are the 15th and 85th percentiles, not the 25th/75th of a
    textbook IQR; pass ``lower_quantile=0.25, upper_quantile=0.75`` for the classic rule.

    Parameters
    ----------
    augmented_data : pandas.DataFrame
        Input frame; every column is checked, so all columns must support ``quantile``.
        The frame itself is not modified.
    iqr_multiplier : float, default 1.5
        Width of the accepted band beyond each quantile, in units of the quantile range.
    lower_quantile, upper_quantile : float, default 0.15 and 0.85
        Quantiles that delimit the reference range; must satisfy
        ``0 <= lower_quantile < upper_quantile <= 1``.

    Returns
    -------
    pandas.DataFrame
        New frame with the surviving rows, keeping their original index labels.

    Raises
    ------
    ValueError
        If the quantiles are out of range or not in increasing order.
    """
    if not 0 <= lower_quantile < upper_quantile <= 1:
        raise ValueError("quantiles must satisfy 0 <= lower_quantile < upper_quantile <= 1")

    # One positional keep-mask for the whole frame; filtering once at the end avoids
    # re-slicing the frame for every column.
    keep = np.ones(len(augmented_data), dtype=bool)
    for col in augmented_data.columns:
        values = augmented_data[col]
        low_q = values.quantile(lower_quantile)
        high_q = values.quantile(upper_quantile)
        spread = high_q - low_q
        lower_bound = low_q - iqr_multiplier * spread
        upper_bound = high_q + iqr_multiplier * spread
        is_outlier = ((values < lower_bound) | (values > upper_bound)).to_numpy()
        keep &= ~is_outlier
    return augmented_data.loc[keep].copy()

# Apply the per-column outlier filter to the bootstrapped frame. The function returns a
# filtered copy and checks every column of the frame, including the "Prediction" label column.
data_no_outliers_iqr_per_column = remove_outliers_iqr_per_column(augmented_data)

In [9]:
# Column 0 holds the label; every remaining column is used as a model input.
cleaned = data_no_outliers_iqr_per_column
y = cleaned.iloc[:, 0].values
X = cleaned.iloc[:, 1:].values
print(X.shape)
print(y)

(5513692, 32)
[1. 1. 1. ... 1. 1. 0.]


In [10]:
# Shuffle features and labels together, with a fixed seed
X, y = shuffle(X, y, random_state=0)

# Hold out 20% of the rows as the test partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the per-feature min/max from the training rows only, then apply the same
# transformation to both partitions
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier


# Create an XGBClassifier model
model = XGBClassifier(objective='binary:logistic',  # Specify the binary classification task
                      learning_rate=0.01,  # Learning rate
                      max_depth=7,  # Maximum depth of the trees
                      n_estimators=200,  # Number of trees
                      reg_alpha=0.01,  # L1 regularization (alpha)
                      reg_lambda=0.01,  # L2 regularization (lambda)
                      seed=42)  # Random seed for reproducibility


# Perform k-fold cross-validation on the full (unscaled) X / y arrays
cv = KFold(n_splits=5, shuffle=True, random_state=42)
train_accuracies = []
test_accuracies = []

# Out-of-fold predictions: KFold puts every row in exactly one test fold, so after the loop
# each row carries the prediction of the model that was fitted without it.
oof_pred = np.empty_like(y)

for train_index, test_index in cv.split(X):
    # Fold-local names: these slices are unscaled, so they must not overwrite the
    # X_train_scaled / X_test_scaled / y_train / y_test produced by the hold-out split cell.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    model.fit(X_fold_train, y_fold_train)  # Train the model on this fold

    fold_train_pred = model.predict(X_fold_train)  # Predictions on the fold's training rows
    fold_test_pred = model.predict(X_fold_test)  # Predictions on the fold's held-out rows
    oof_pred[test_index] = fold_test_pred

    train_accuracies.append(accuracy_score(y_fold_train, fold_train_pred))
    test_accuracies.append(accuracy_score(y_fold_test, fold_test_pred))

# Calculate and print the mean and standard deviation of training and testing accuracies
print("Mean Training Accuracy:", round(sum(train_accuracies)/len(train_accuracies), 4))
print("Mean Testing Accuracy:", round(sum(test_accuracies)/len(test_accuracies), 4))
print("Standard Deviation of Training Accuracy:", round(np.std(train_accuracies), 4))
print("Standard Deviation of Testing Accuracy:", round(np.std(test_accuracies), 4))

# Confusion matrix over the out-of-fold predictions, i.e. over all five test folds rather
# than only the last one
cm = confusion_matrix(y, oof_pred)
print("Confusion Matrix:")
print(cm)



# Classification report over the same out-of-fold predictions
report = classification_report(y, oof_pred)
print("Classification Report:")
print(report)


