In [2]:
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

In [39]:
# you don't need to unzip the data - pandas can still recognise it
trainp = './Kaggle/training.csv.zip'
testp = './Kaggle/test.csv.zip'

traindataset = pd.read_csv(trainp)
testdataset = pd.read_csv(testp)

features = traindataset.drop(columns=['signal', 'min_ANNmuon', 'mass', 'production'])
labels = traindataset['signal']

In [27]:
gnb = LogisticRegression()
model = gnb.fit(features, labels)  

# you don't need to split up into train/test sets because
# cross-validation does it for you
scores = cross_val_score(gnb, features, labels, scoring='accuracy', cv = 5)

print('Accuracy = {}% +/- {}'.format(round(100*np.mean(scores), 3), round(100*np.std(scores), 3)))

Accuracy = 61.834% +/- 0.064


### Try all the models

In [31]:
models = [LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier(),
          LinearDiscriminantAnalysis(), GaussianNB()]

for m in models:
    model = m.fit(features, labels)
    scores = cross_val_score(gnb, features, labels, scoring='accuracy', cv = 5)
    print('{}: Accuracy = {}% +/- {}'.format(m.__class__.__name__, 
                                             round(100*np.mean(scores), 3),
                                             round(100*np.std(scores), 3)))

LogisticRegression: Accuracy = 61.826% +/- 0.075
DecisionTreeClassifier: Accuracy = 61.794% +/- 0.061
KNeighborsClassifier: Accuracy = 61.824% +/- 0.073




LinearDiscriminantAnalysis: Accuracy = 61.817% +/- 0.079
GaussianNB: Accuracy = 61.848% +/- 0.086


#### you will probably need to do some hyper-parameter tuning to improve performance

### Use the Gaussian Naive Bayes model to predict on the test set

In [45]:
model = GaussianNB().fit(features, labels)
# output probabilities not raw labels
model.predict_proba(testdataset)[:, 0]

array([0.71196485, 0.50571252, 0.29329722, ..., 0.5780373 , 0.40262902,
       0.37115184])