# Data Preprocessing

In [26]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import the dataset

In [27]:
dataset = pd.read_csv('new_physics_signal.csv')

In [28]:
dataset

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,Class
0,78.482022,52.871985,-64.917465,-78.448475,-57.987246,-92.520585,27.806597,46.774201,-57.512910,22.899098,-38.790567,-83.959697,11.666977,846.710033,0
1,78.379086,55.058180,-68.153619,-82.693368,-59.561586,-93.335494,19.354007,47.128549,-60.470478,23.100267,-36.209469,-84.888323,11.836796,25.025335,0
2,79.609922,55.395154,-61.506139,-99.031856,-56.643424,-91.474574,32.361097,48.480810,-63.011139,20.523559,-38.380731,-85.190411,11.483291,2127.582986,0
3,80.592326,55.085738,-73.077999,-91.084996,-60.244160,-92.874068,25.292852,46.640591,-66.546879,25.600912,-39.059436,-84.421174,12.058619,700.637724,0
4,75.686606,56.323331,-69.943208,-75.363929,-57.215389,-92.485721,32.606410,46.729047,-56.972408,21.156952,-33.125372,-83.400313,12.788145,401.390816,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,133.042906,53.318139,-74.216965,-86.840671,-51.972651,-91.610258,26.275889,48.656623,-61.702890,29.711831,-35.192303,-78.682124,15.676796,14.288014,0
284803,79.967216,56.129745,-66.096096,-93.501201,-58.929389,-91.607308,33.769425,46.979915,-65.388282,19.070774,-40.479622,-84.383717,11.512836,148.616377,0
284804,83.098159,54.064683,-65.457304,-86.024796,-59.215377,-91.537236,30.576984,46.896309,-53.345257,23.613591,-37.956903,-84.800778,11.640835,389.590948,0
284805,82.876545,52.797130,-62.995925,-59.193084,-58.596254,-91.406831,32.622736,46.685808,-57.103731,19.265872,-32.761766,-84.120857,12.263078,65.905449,0


In [29]:
dataset.shape

(284807, 15)

In [30]:
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [31]:
print(x)

[[  78.48202188   52.87198514  -64.91746453 ...  -83.95969688
    11.66697749  846.71003275]
 [  78.37908576   55.05818019  -68.15361935 ...  -84.88832263
    11.83679625   25.02533541]
 [  79.60992192   55.39515406  -61.50613884 ...  -85.1904108
    11.48329135 2127.5829864 ]
 ...
 [  83.09815938   54.06468252  -65.45730425 ...  -84.80077792
    11.6408345   389.59094774]
 [  82.87654542   52.79712981  -62.99592497 ...  -84.12085707
    12.26307847   65.90544932]
 [  74.59550158   54.05819369  -70.46320297 ...  -84.84553495
    11.83169249 1223.52290219]]


In [32]:
print(y)

[0 0 0 ... 0 0 0]


# Take care of missing data

In [33]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 0:14])
x[:, 0:14] = imputer.transform(x[:, 0:14])

In [34]:
print(x)

[[  78.48202188   52.87198514  -64.91746453 ...  -83.95969688
    11.66697749  846.71003275]
 [  78.37908576   55.05818019  -68.15361935 ...  -84.88832263
    11.83679625   25.02533541]
 [  79.60992192   55.39515406  -61.50613884 ...  -85.1904108
    11.48329135 2127.5829864 ]
 ...
 [  83.09815938   54.06468252  -65.45730425 ...  -84.80077792
    11.6408345   389.59094774]
 [  82.87654542   52.79712981  -62.99592497 ...  -84.12085707
    12.26307847   65.90544932]
 [  74.59550158   54.05819369  -70.46320297 ...  -84.84553495
    11.83169249 1223.52290219]]


# Splitting the dataset into the training set and test set

In [35]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [36]:
print(x_train)

[[ 84.51078462  53.71413034 -71.23976259 ... -84.8984747   11.87227673
   90.34403999]
 [ 76.40268396  54.8867908  -73.66229813 ... -85.45376052  11.9463948
   60.31309448]
 [ 77.42637309  53.97042892 -72.2777221  ... -85.28083181  11.54771745
   21.05476347]
 ...
 [ 82.19219775  53.90071672 -71.97743022 ... -85.8854966   11.91496555
   52.3719506 ]
 [ 79.27103989  56.20499161 -70.64158461 ... -84.86793014  11.67376971
  540.13714016]
 [ 78.29922243  54.81845865 -77.14287241 ... -87.34182308  10.79106674
  333.2200109 ]]


In [37]:
print(x_test)

[[ 73.77504132  55.24623973 -72.98889052 ... -84.39655824  12.81674929
   19.99221605]
 [ 81.03552054  55.48616274 -66.91962378 ... -83.77162212  12.04911976
   21.05476347]
 [ 77.24293229  56.01031117 -67.27725348 ... -83.2072968   12.20603802
   14.95909669]
 ...
 [ 76.66595275  57.31624259 -67.20815491 ... -84.64667571  11.55071863
   95.04161806]
 [ 78.03936632  56.29932157 -69.1561846  ... -84.95284708  11.56441404
  177.69662267]
 [ 78.56836876  51.72154476 -64.29463077 ... -84.9924216   11.8834892
  423.64838874]]


In [38]:
print(y_train)

[0 0 0 ... 0 0 0]


In [39]:
print(y_test)

[0 0 0 ... 0 0 0]


# Feature Scaling

In [40]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:, 14:] = sc.fit_transform(x_train[:, 14:])
x_test[:, 14:] = sc.transform(x_test[:, 14:])

ValueError: Found array with 0 feature(s) (shape=(199364, 0)) while a minimum of 1 is required by StandardScaler.

In [41]:
print(x_train)

[[ 84.51078462  53.71413034 -71.23976259 ... -84.8984747   11.87227673
   90.34403999]
 [ 76.40268396  54.8867908  -73.66229813 ... -85.45376052  11.9463948
   60.31309448]
 [ 77.42637309  53.97042892 -72.2777221  ... -85.28083181  11.54771745
   21.05476347]
 ...
 [ 82.19219775  53.90071672 -71.97743022 ... -85.8854966   11.91496555
   52.3719506 ]
 [ 79.27103989  56.20499161 -70.64158461 ... -84.86793014  11.67376971
  540.13714016]
 [ 78.29922243  54.81845865 -77.14287241 ... -87.34182308  10.79106674
  333.2200109 ]]


In [42]:
print(x_test)

[[ 73.77504132  55.24623973 -72.98889052 ... -84.39655824  12.81674929
   19.99221605]
 [ 81.03552054  55.48616274 -66.91962378 ... -83.77162212  12.04911976
   21.05476347]
 [ 77.24293229  56.01031117 -67.27725348 ... -83.2072968   12.20603802
   14.95909669]
 ...
 [ 76.66595275  57.31624259 -67.20815491 ... -84.64667571  11.55071863
   95.04161806]
 [ 78.03936632  56.29932157 -69.1561846  ... -84.95284708  11.56441404
  177.69662267]
 [ 78.56836876  51.72154476 -64.29463077 ... -84.9924216   11.8834892
  423.64838874]]
