In [15]:
import pandas as pd
import numpy as np

In [16]:
# Load the dataset
data = pd.read_csv('../HIGGS_train.csv')

#label the data columns with the following names: label, lepton pT, lepton eta, lepton phi, missing energy magnitude, missing energy phi, jet 1 pt, jet 1 eta, jet 1 phi, jet 1 b-tag, jet 2 pt, jet 2 eta, jet 2 phi, jet 2 b-tag, jet 3 pt, jet 3 eta, jet 3 phi, jet 3 b-tag, jet 4 pt, jet 4 eta, jet 4 phi, jet 4 b-tag, m_jj, m_jjj, m_lv, m_jlv, m_bb, m_wbb, m_wwbb
data.columns = ['label', 'lepton pT', 'lepton eta', 'lepton phi', 'missing energy magnitude', 'missing energy phi', 'jet 1 pt', 'jet 1 eta', 'jet 1 phi', 'jet 1 b-tag', 'jet 2 pt', 'jet 2 eta', 'jet 2 phi', 'jet 2 b-tag', 'jet 3 pt', 'jet 3 eta', 'jet 3 phi', 'jet 3 b-tag', 'jet 4 pt', 'jet 4 eta', 'jet 4 phi', 'jet 4 b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

# Print the first five rows of the data
print(data.head())

   label  lepton pT  lepton eta  lepton phi  missing energy magnitude  \
0    1.0      0.908       0.329     0.35900                     1.500   
1    1.0      0.799       1.470    -1.64000                     0.454   
2    0.0      1.340      -0.877     0.93600                     1.990   
3    1.0      1.110       0.321     1.52000                     0.883   
4    0.0      1.600      -0.608     0.00707                     1.820   

   missing energy phi  jet 1 pt  jet 1 eta jet 1 phi  jet 1 b-tag  ...  \
0              -0.313     1.100     -0.558     -1.59         2.17  ...   
1               0.426     1.100      1.280      1.38         0.00  ...   
2               0.882     1.790     -1.650    -0.942         0.00  ...   
3              -1.210     0.681     -1.070    -0.922         0.00  ...   
4              -0.112     0.848     -0.566      1.58         2.17  ...   

   jet 4 eta  jet 4 phi  jet 4 b-tag   m_jj  m_jjj   m_lv  m_jlv   m_bb  \
0     -1.140  -0.000819          0.0  0.3

  data = pd.read_csv('../HIGGS_train.csv')


In [17]:
# Find columns with non-float data types
non_float_columns = data.select_dtypes(exclude=['float']).columns.tolist()

# Convert non-float columns to numeric data type
for col in non_float_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Find rows with non-float values
nonfloat_rows = data[data.isnull().any(axis=1)]

# Print the row numbers and column names with non-float values
for index, row in nonfloat_rows.iterrows():
    nonfloat_columns = row.index[row.isnull()].tolist()
    print(f"Row {index} contains non-float values in columns: {nonfloat_columns}")

Row 260691 contains non-float values in columns: ['jet 3 b-tag']
Row 261025 contains non-float values in columns: ['jet 1 phi']
Row 490958 contains non-float values in columns: ['jet 4 b-tag']
Row 490959 contains non-float values in columns: ['jet 4 b-tag']


In [18]:
# Delete rows with non-float values
data.drop(nonfloat_rows.index, inplace=True)

In [19]:
#add a column to the dataset that is the sum of the b-tags for the four jets
data['b-tag sum'] = data['jet 1 b-tag'] + data['jet 2 b-tag'] + data['jet 3 b-tag'] + data['jet 4 b-tag']

#add a column to the dataset that is the sum of the pt values for the four jets
data['jet pt sum'] = data['jet 1 pt'] + data['jet 2 pt'] + data['jet 3 pt'] + data['jet 4 pt']

#add a column to the dataset that is the sum of the eta values for the four jets
data['jet eta sum'] = data['jet 1 eta'] + data['jet 2 eta'] + data['jet 3 eta'] + data['jet 4 eta']

#add a column to the dataset that is the sum of the phi values for the four jets
data['jet phi sum'] = data['jet 1 phi'] + data['jet 2 phi'] + data['jet 3 phi'] + data['jet 4 phi']

#add a column to the dataset that is the sum of the pt values for the two leptons
data['lepton pt sum'] = data['lepton pT'] + data['lepton pT']

#add a column to the dataset that is the sum of the eta values for the two leptons
data['lepton eta sum'] = data['lepton eta'] + data['lepton eta']

#add a column to the dataset that is the sum of the phi values for the two leptons
data['lepton phi sum'] = data['lepton phi'] + data['lepton phi']

#add a column containg the sqaure of the m_bb column
data['m_bb^2'] = data['m_bb']**2

print(data.head())

   label  lepton pT  lepton eta  lepton phi  missing energy magnitude  \
0    1.0      0.908       0.329     0.35900                     1.500   
1    1.0      0.799       1.470    -1.64000                     0.454   
2    0.0      1.340      -0.877     0.93600                     1.990   
3    1.0      1.110       0.321     1.52000                     0.883   
4    0.0      1.600      -0.608     0.00707                     1.820   

   missing energy phi  jet 1 pt  jet 1 eta  jet 1 phi  jet 1 b-tag  ...  \
0              -0.313     1.100     -0.558     -1.590         2.17  ...   
1               0.426     1.100      1.280      1.380         0.00  ...   
2               0.882     1.790     -1.650     -0.942         0.00  ...   
3              -1.210     0.681     -1.070     -0.922         0.00  ...   
4              -0.112     0.848     -0.566      1.580         2.17  ...   

   m_wbb  m_wwbb  b-tag sum  jet pt sum  jet eta sum  jet phi sum  \
0  0.992   0.798       4.38       2.812  

In [20]:
# Calculate z-scores for each column
z_scores = np.abs((data - data.mean()) / data.std())

# Define threshold for z-scores
z_threshold = 2

# Remove rows with z-score greater than threshold in any column
outlier_rows = data[(z_scores > z_threshold).any(axis=1)]
data = data[(z_scores <= z_threshold).all(axis=1)]

# Print number of outlier rows removed
print(f"{len(outlier_rows)} outlier rows removed.")

289558 outlier rows removed.


In [21]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into features and target variable
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

#shuffle the data
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



# Create the logistic regression model
logistic_regression_model = LogisticRegression()

# Fit the model on the training data
logistic_regression_model.fit(X_train, y_train)

# Make predictions on the testing data
predictions = logistic_regression_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.6621462010909247


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
from sklearn.model_selection import KFold

# apply k-fold cross validation
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = []

for train_indices, test_indices in k_fold.split(X):
    # split data into training and testing sets
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    # create and train the model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    
    # evaluate the model on the testing set
    score = model.score(X_test, y_test)
    scores.append(score)

# print the accuracy of each fold
print(scores)

# print the average accuracy of all 10 folds
print(np.mean(scores))


[0.663574281664734, 0.661899239788687, 0.663606494008504, 0.6642185285401366, 0.6666666666666666, 0.6656358716660224, 0.6671176394794486, 0.6592146377605257, 0.6660116612440808, 0.6652385400895532]
0.6643183560908359


In [23]:
#perform regularized logistic regression on different values of C
from sklearn.linear_model import LogisticRegression

# create and train the model
model = LogisticRegression(C=1)
model.fit(X_train, y_train)

# print the accuracy for the model
print(model.score(X_test, y_test))

0.6641754985020778


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
#perform xgboost on the dataset
import xgboost as xgb

# create the DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)

# create the parameter dictionary
params = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}

# train the model
num_rounds = 2
bst = xgb.train(params, dtrain, num_rounds)

# create the DMatrix for the testing data
dtest = xgb.DMatrix(X_test)

# make predictions
predictions = bst.predict(dtest)

# print the first 4 predictions
print(predictions[:4])

# print the first 4 actual labels
print(y_test[:4])

# print the accuracy
accuracy = accuracy_score(y_test, predictions.round())
print("Accuracy:", accuracy)

[0.28494623 0.7900954  0.63004136 0.42988688]
416164    1.0
405934    0.0
585712    1.0
52731     0.0
Name: label, dtype: float64
Accuracy: 0.6616950681313017


In [25]:
#implementing a decision tree
from sklearn.tree import DecisionTreeClassifier

# create the model
model = DecisionTreeClassifier()

# train the model
model.fit(X_train, y_train)

# make predictions
predictions = model.predict(X_test)

# print the accuracy
accuracy = accuracy_score(y_test, predictions)
print("Tree accuracy:", accuracy)

#implementing a random forest
from sklearn.ensemble import RandomForestClassifier

# create the model
model = RandomForestClassifier()

# train the model
model.fit(X_train, y_train)

# make predictions
predictions = model.predict(X_test)

# print the accuracy
accuracy = accuracy_score(y_test, predictions)
print("Forest accuracy:", accuracy)

Tree accuracy: 0.6335727861353606
Forest accuracy: 0.7269271655445673


In [None]:
#implementing a gradient boosted tree
from sklearn.ensemble import GradientBoostingClassifier

# create the model
model = GradientBoostingClassifier()

# train the model
model.fit(X_train, y_train)

# make predictions
predictions = model.predict(X_test)

# print the accuracy
accuracy = accuracy_score(y_test, predictions)
print("Gradient Boosted Tree accuracy:", accuracy)