# Supervised Machine Learning Solution

In [11]:
# import the necessary libraries

import os 
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [12]:
# import sklearn models

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, roc_auc_score, plot_confusion_matrix,
                             precision_recall_curve, roc_curve)

In [13]:
# import imblearn to deal with unbalanced classes
from imblearn.ensemble import BalancedBaggingClassifier

In [14]:
warnings.filterwarnings("ignore")

## Import the Dataset

In [15]:
# Load the event table from a CSV path relative to the working directory.
data = pd.read_csv(filepath_or_buffer='new_physics_signal.csv')

In [16]:
# Preview the first five rows to get a quick look at the columns and values.
data.head(n=5)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,Class
0,78.482022,52.871985,-64.917465,-78.448475,-57.987246,-92.520585,27.806597,46.774201,-57.51291,22.899098,-38.790567,-83.959697,11.666977,846.710033,0
1,78.379086,55.05818,-68.153619,-82.693368,-59.561586,-93.335494,19.354007,47.128549,-60.470478,23.100267,-36.209469,-84.888323,11.836796,25.025335,0
2,79.609922,55.395154,-61.506139,-99.031856,-56.643424,-91.474574,32.361097,48.48081,-63.011139,20.523559,-38.380731,-85.190411,11.483291,2127.582986,0
3,80.592326,55.085738,-73.077999,-91.084996,-60.24416,-92.874068,25.292852,46.640591,-66.546879,25.600912,-39.059436,-84.421174,12.058619,700.637724,0
4,75.686606,56.323331,-69.943208,-75.363929,-57.215389,-92.485721,32.60641,46.729047,-56.972408,21.156952,-33.125372,-83.400313,12.788145,401.390816,0


## Data Preprocessing

In [17]:
# Number of missing entries in each column.
data.isnull().sum(axis=0)

p1       5
p2       5
p3       5
p4       5
p5       5
p6       5
p7       5
p8       5
p9       5
p10      5
p11      5
p12      5
p13      5
p14      5
Class    0
dtype: int64

In [18]:
# Drop every row that contains at least one missing value,
# then re-count the missing entries per column to confirm none are left.
data = data.dropna(axis=0, how='any')
data.isnull().sum(axis=0)

p1       0
p2       0
p3       0
p4       0
p5       0
p6       0
p7       0
p8       0
p9       0
p10      0
p11      0
p12      0
p13      0
p14      0
Class    0
dtype: int64

In [19]:
# Class distribution: number of rows carrying each label.
data.loc[:, 'Class'].value_counts()

0    284245
1       492
Name: Class, dtype: int64

In [20]:
"""
As the counts above show, only 492 datapoints have class value 1 while the other 284245 have class value 0.
The dataset is therefore heavily unbalanced, which tends to make the model overfitted and biased towards class 0.

We balance the data by randomly under-sampling the class with more datapoints: we take 600 randomly sampled
datapoints from class 0 and keep all 492 datapoints of class 1, so that the two classes are roughly balanced.
(However, I didn't use the same technique in the previous notebook, in order to see the model performance.)

This step is to deal with the bias-variance trade-off.
"""
# Shuffle once, keep 600 random class-0 rows plus every class-1 row, then
# shuffle the combined frame again and renumber its index from zero.
# Note: `data` is rebound to the reduced frame at the end of this cell.
shuffled = data.sample(frac=1, random_state=4)
data_zero = shuffled[shuffled['Class'] == 0].sample(n=600, random_state=123)
data_one = shuffled[shuffled['Class'] == 1]

data_sampled_concated = (
    pd.concat([data_zero, data_one])
    .sample(frac=1, random_state=4)
    .reset_index(drop=True)
)
data = data_sampled_concated

In [21]:
"""
x = Taking all the rows and columns (not the last)
y = Taking all the rows but just the last column
"""

# data_x: all rows, every column except the last.
# data_y: all rows, only the last column (kept as a one-column DataFrame).
data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]

## Scaling and splitting the dataset into the training set and test set

In [None]:
# NOTE(review): this re-reads the CSV and rebinds `data`, so the balanced
# sample assigned to `data` earlier in the notebook is no longer what `data`
# refers to from this cell onwards. Confirm that this is intended.
data = pd.read_csv('new_physics_signal.csv')

In [None]:
# Preview the first five rows of the reloaded frame.
data.head(n=5)

In [None]:
# Dimensions of the frame as (number of rows, number of columns).
data.shape

In [None]:
# Number of rows per class label in the reloaded frame.
data.loc[:, 'Class'].value_counts()

In [None]:
# Missing-value count per column.
data.isnull().sum(axis=0)

In [None]:
# Discard rows with any missing value, then verify that no nulls remain.
data = data.dropna(axis=0, how='any')
data.isnull().sum(axis=0)

In [None]:
# data_x: every column except the last; data_y: the last column only,
# kept as a one-column DataFrame.
data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]

In [None]:
# NumPy arrays for scikit-learn: `x` holds every column except the last,
# `y` holds the last column as a 1-D array.
# The frame loaded in this notebook is named `data`; the previous reference to
# `dataset` pointed at a name that is not defined in the visible notebook and
# would raise NameError on a fresh kernel.
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [None]:
# Show the feature array.
print(x)

In [None]:
# Show the label array.
print(y)

## Take care of missing data

In [None]:
# Fill NaN entries in columns 0-13 of `x` with the mean of their column.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean', missing_values=np.nan).fit(x[:, :14])
x[:, :14] = imputer.transform(x[:, :14])

In [None]:
# Show the feature array after imputation.
print(x)

## Splitting the dataset into the training set and test set

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# The split is stratified on `y` because the positive class is very rare
# (492 of roughly 284k rows in the class counts shown earlier). An unstratified
# draw can give the train and test sets noticeably different class ratios.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1, stratify=y
)

In [None]:
# Show the training features.
print(x_train)

In [None]:
# Show the test features.
print(x_test)

In [None]:
# Show the training labels.
print(y_train)

In [None]:
# Show the test labels.
print(y_test)

## Feature Scaling

In [None]:
# Standardise every feature column. The scaler is fitted on the training rows
# only and the same statistics are then applied to the test rows, so nothing
# about the test set leaks into the fit.
# `x` is built from the 14 columns p1..p14, so the earlier slice `[:, 14:]`
# selected zero columns and StandardScaler rejects an input with no features.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Show the training features after the scaling cell.
print(x_train)

In [None]:
# Show the test features after the scaling cell.
print(x_test)