In [1]:
''' Train a GaussianNB + RandomForest soft-voting ensemble on the HIGGS classification problem and submit to the competition.
Author: Peter Sadowski
Date: Feb 12 2024
'''
import ast

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, ParameterGrid
from sklearn.naive_bayes import GaussianNB

# Do not truncate long cell contents when DataFrames are printed.
pd.set_option("display.max_colwidth", None)

# Competition inputs live under /kaggle/input; print the path of every file
# found there so the available inputs show up in the cell output.
import os

input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Parse the training CSV into a numeric array, skipping the header row.
data = np.loadtxt('/kaggle/input/higgs-spring-2024/train.csv', delimiter=',', skiprows=1)

# Anything written to the current directory is saved as notebook output.

/kaggle/input/higgs-spring-2024/sample_submission.csv.zip
/kaggle/input/higgs-spring-2024/test.csv
/kaggle/input/higgs-spring-2024/train.csv.zip
/kaggle/input/higgs-spring-2024/sample_submission.csv
/kaggle/input/higgs-spring-2024/train.csv
/kaggle/input/higgs-spring-2024/test.csv.zip


# Train and Assess Model

In [2]:
# Hold out every row after the first 40000 as a validation set.
# Column 0 is used as the label (kept 2-D by the 0:1 slice); the remaining
# columns are the features.
train_rows, valid_rows = data[:40000], data[40000:]
Xtrain, Ytrain = train_rows[:, 1:], train_rows[:, 0:1]
Xvalid, Yvalid = valid_rows[:, 1:], valid_rows[:, 0:1]

In [3]:
# Baseline: Gaussian naive Bayes fitted on the training split.
# Only the first (smallest) value of the smoothing range is used; apart from
# that, `var_smoothing_range` and `AUROC` are not referenced in the visible notebook.
var_smoothing_range = np.logspace(-10, -7, 20)
AUROC = []

var_smoothing = var_smoothing_range[0]
model = GaussianNB(var_smoothing=var_smoothing)
model.fit(Xtrain, Ytrain.flatten())

# Smaller random-forest grid, kept for quick experiments:
# parameters = {
#     "n_estimators": [100, 200],
#     "max_depth": [None, 10, 20],
#     "min_samples_split": [2,5],
#     "min_samples_leaf": [1,2]
# }

# Full random-forest search space: 22 * 16 * 6 * 3 = 6336 combinations.
# `n_jobs` and `random_state` are single-valued, so they are applied to every
# candidate without enlarging the grid.
parameters = {
    "n_estimators": [1,2,3,4,5,6,7,8,10,100,200,300,400,500,600,650,675,700,715,725,750,800],
    "max_depth": [1,2,3,4,5,6,10,20,23,24,25,26,27,28,30, None],
    "min_samples_split": [2,3,4,5,7,10],
    "min_samples_leaf": [1,2,3],
    "n_jobs": [-1],
    "random_state": [42]
}

# Deliberately not fitted here: the forest is fitted for each candidate inside
# the grid search and refitted with the best parameters afterwards, so a
# default-parameter fit at this point would only cost time.
rf = RandomForestClassifier(random_state=42)

print("searching")
def grid_search(model, parameters, cv=5, scoring="roc_auc", results_path="grid_search_results.csv"):
    """Resumable exhaustive parameter search scored by AUROC on the validation split.

    Every combination in ``ParameterGrid(parameters)`` is applied to ``model``,
    which is then fitted on the notebook-level ``Xtrain``/``Ytrain`` and scored
    on ``Xvalid``/``Yvalid``.  Results are written to ``results_path`` after each
    evaluation, so an interrupted run can be resumed: combinations whose
    ``str(params)`` is already present in the file are skipped.

    Parameters
    ----------
    model : estimator
        Object exposing ``set_params``, ``fit`` and ``predict_proba``.  It is
        mutated in place and left configured with the last evaluated candidate.
    parameters : dict
        Mapping of parameter name to the list of candidate values.
    cv, scoring
        Accepted for call compatibility but not used: no cross-validation is
        performed and the score is always the validation-split AUROC.
    results_path : str
        CSV file with the columns ``params`` (``str`` of the parameter dict)
        and ``score``.

    Returns
    -------
    pandas.DataFrame
        All known results, sorted by ``score`` with the best row first.
    """
    columns = ["params", "score"]
    if os.path.exists(results_path):
        records = pd.read_csv(results_path).to_dict("records")
    else:
        records = []
    # Set of already-evaluated candidates for O(1) membership tests.
    seen = {record["params"] for record in records}

    grid = ParameterGrid(parameters)
    N = len(grid)
    for c, params in enumerate(grid, start=1):
        key = str(params)
        if key in seen:
            # Only report skipped candidates occasionally to keep the log short.
            if (c % 15 == 0) or (c == N):
                print(f"Skipping: {c:5} / {N:5}")
            continue
        print(f"Evaluating: {c:5} / {N}")
        model.set_params(**params)
        model.fit(Xtrain, Ytrain.flatten())
        predictions = model.predict_proba(Xvalid)
        auroc = sklearn.metrics.roc_auc_score(Yvalid[:,0], predictions[:,1])
        records.append({"params": key, "score": auroc})
        seen.add(key)
        # Checkpoint after every evaluation so an interrupt loses no finished work.
        pd.DataFrame(records, columns=columns).to_csv(results_path, index=False)

    results_df = pd.DataFrame(records, columns=columns)
    # sort_values returns a new frame; the result must be assigned to take effect.
    results_df = results_df.sort_values(by="score", ascending=False).reset_index(drop=True)
    results_df.to_csv(results_path, index=False)
    return results_df
# Run (or resume) the random-forest search and pick the best-scoring row.
results_df = grid_search(rf, parameters, cv=5, scoring="roc_auc", results_path="grid_search_results.csv")
i = results_df["score"].idxmax()
row = results_df.loc[i]
params_string = row["params"]
# Parameters are stored as the str() of a dict; parse them back without eval.
params = ast.literal_eval(params_string)
print("Results shape:", results_df.shape)
print(params, row["score"])

# Refit the forest with the winning configuration.
rf.set_params(**params)
rf.fit(Xtrain, Ytrain.flatten())

# Soft-voting ensemble of the naive Bayes baseline (`model`) and the tuned forest.
ensemble = VotingClassifier(estimators=[("nb", model), ("rf", rf)], voting="soft")
ensemble.fit(Xtrain, Ytrain.flatten())

#     grid_search = GridSearchCV(RandomForestClassifier(), parameters, cv=5, scoring="roc_auc", verbose=2)
#     grid_search.fit(Xtrain, Ytrain.flatten())
#     dump(grid_search, "grid_search_results.joblib")

# Make hard predictions.
hard_predictions = model.predict(Xvalid)

# Make probabilistic predictions.
predictions = model.predict_proba(Xvalid)
rf_predictions = rf.predict_proba(Xvalid)
predictions_ensemble = ensemble.predict_proba(Xvalid)

# Compute AUROC for each model from its own predicted probabilities.
val = sklearn.metrics.roc_auc_score(Yvalid[:,0], predictions[:,1])
rf_auroc = sklearn.metrics.roc_auc_score(Yvalid[:,0], rf_predictions[:,1])
auroc_ensemble = sklearn.metrics.roc_auc_score(Yvalid[:,0], predictions_ensemble[:,1])
print(f'Validation AUROC: {val}, {rf_auroc}, {auroc_ensemble} ' )

# Plot ROC curves.
fpr, tpr, thresholds = sklearn.metrics.roc_curve(Yvalid[:,0], predictions[:,1])
fpr_rf, tpr_rf, thresholds_rf = sklearn.metrics.roc_curve(Yvalid[:,0], rf_predictions[:,1])
fpr_ensemble, tpr_ensemble, thresholds_ensemble = sklearn.metrics.roc_curve(Yvalid[:,0], predictions_ensemble[:,1])
# Curve names must be passed as `label=`; a third positional argument is
# interpreted as a format string.
plt.plot(fpr, tpr, label="Naive Bayes")
plt.plot(fpr_rf, tpr_rf, label="Random Forest")
plt.plot(fpr_ensemble, tpr_ensemble, label="Ensemble")
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], color='k', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(frameon=False, fontsize="small", title="Legend Title")

searching
Skipping:   15 /  6336
Evaluating: 19 / 6336
Skipping:   30 /  6336
Evaluating: 41 / 6336
Evaluating: 45 / 6336
Evaluating: 46 / 6336
Evaluating: 47 / 6336
Evaluating: 48 / 6336
Evaluating: 49 / 6336
Evaluating: 50 / 6336
Evaluating: 51 / 6336
Evaluating: 52 / 6336
Evaluating: 53 / 6336
Evaluating: 54 / 6336
Evaluating: 55 / 6336
Evaluating: 56 / 6336
Evaluating: 57 / 6336
Evaluating: 58 / 6336
Evaluating: 59 / 6336
Evaluating: 60 / 6336
Evaluating: 61 / 6336
Evaluating: 62 / 6336
Evaluating: 63 / 6336
Evaluating: 64 / 6336
Evaluating: 65 / 6336
Evaluating: 66 / 6336
Skipping:   75 /  6336
Evaluating: 85 / 6336
Skipping:   90 /  6336
Skipping:  105 /  6336
Evaluating: 107 / 6336
Skipping:  120 /  6336
Evaluating: 129 / 6336
Skipping:  135 /  6336
Skipping:  150 /  6336
Evaluating: 151 / 6336
Skipping:  165 /  6336
Evaluating: 173 / 6336
Evaluating: 177 / 6336
Evaluating: 178 / 6336
Evaluating: 179 / 6336
Evaluating: 180 / 6336
Evaluating: 181 / 6336
Evaluating: 182 / 6336
Eva

KeyboardInterrupt: 

# Make Predictions on Test Set

In [None]:
results_df = results_df.sort_values(by="score", ascending=False)
print(results_df)

# Persist the model used for the submission and reload it, so the predictions
# below come from exactly the artifact that was saved.
# M = model
M = ensemble
fn = "ensemble.joblib"
dump(M, fn)
M = model_loaded = load(fn)

# Make probabilistic predictions.
# NOTE(review): every column of test.csv is passed to the model - confirm the
# file holds only the feature columns (no Id/label column).
Xtest1 = np.loadtxt('/kaggle/input/higgs-spring-2024/test.csv', skiprows=1, delimiter=',')
predictions = M.predict_proba(Xtest1)
predictions = predictions[:,1:2] # Probability that label=1
N = predictions.shape[0]
assert N == 50000, "Predictions should have length 50000."
submission = np.hstack((np.arange(N).reshape(-1,1), predictions)) # Add Id column.
np.savetxt(fname='submission.csv', X=submission, header='Id,Predicted', delimiter=',', comments='')

# Submission can be downloaded from this Kaggle Notebook under Sessions->Data->output->/kaggle/working.