In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
data = pd.read_csv('HIGGS_train.csv', header=None)

# Drop rows with non-float values
data = data.apply(pd.to_numeric, errors='coerce')
data = data.dropna()

# Split the data into features (X) and target variable (y)
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the logistic regression model and perform 5-fold-cross-validation
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)

# Calculate the performance metrics using cross-validation
acc_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
precision_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='precision')
recall_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='recall')
f1_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')

print('Mean cross-validation accuracy:', np.mean(acc_scores))
print("Mean Cross-validation precision:", np.mean(precision_scores))
print("Mean Cross-validation recall:", np.mean(recall_scores))
print("Mean Cross-validation F1-score:", np.mean(f1_scores))

# Test the logistic regression model on the testing set
y_pred = clf.predict(X_test)

# Calculate the performance metrics on the testing set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy on testing set:", accuracy)
print("Precision on testing set:", precision)
print("Recall on testing set:", recall)
print("F1-score on testing set:", f1)
print("Confusion matrix on testing set:\n", conf_matrix)


  data = pd.read_csv('HIGGS_train.csv', header=None)


Mean cross-validation accuracy: 0.6409449268351752
Mean Cross-validation precision: 0.6379443865791645
Mean Cross-validation recall: 0.7412748541064336
Mean Cross-validation F1-score: 0.6857382646290245
Accuracy on testing set: 0.6432416666666667
Precision on testing set: 0.6434107473077342
Recall on testing set: 0.7404584930756591
F1-score on testing set: 0.6885317463204533
Confusion matrix on testing set:
 [[29870 26225]
 [16586 47319]]


In [3]:
# save the model using pickle 
import pickle
pickle.dump(clf, open('model.pkl','wb'))

# load the model 
model = pickle.load(open('model.pkl','rb'))

# predict the output on the HIGGS_train.csv
data = pd.read_csv('HIGGS_train.csv', header=None)
data = data.apply(pd.to_numeric, errors='coerce')
data = data.dropna()
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values
y_pred = model.predict(X)

# calculate the performance metrics on the HIGGS_train.csv
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)

print("Accuracy on HIGGS_train.csv:", accuracy)
print("Precision on HIGGS_train.csv:", precision)
print("Recall on HIGGS_train.csv:", recall)
print("F1-score on HIGGS_train.csv:", f1)
print("Confusion matrix on HIGGS_train.csv:\n", conf_matrix)

  data = pd.read_csv('HIGGS_train.csv', header=None)


Accuracy on HIGGS_train.csv: 0.5411436076240508
Precision on HIGGS_train.csv: 0.7138158560527861
Recall on HIGGS_train.csv: 0.22210991031841998
F1-score on HIGGS_train.csv: 0.33879946779639847
Confusion matrix on HIGGS_train.csv:
 [[254149  28279]
 [247033  70535]]
