# Case Study 6

## Description


Build a dense neural network to accurately detect the particle. The goal is to maximize your accuracy.
Include a discussion of how you know your model has finished training as well as what design decisions
you made while building the network.

Submit your assignment to the Assignments section of the online campus. For more information regarding
case studies, see the syllabus.

In [1]:
# Import general libraries
import joblib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns

# Import sklearn libraries
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Resolve the notebook's working directory and the directory that contains it
import os
cwd = os.getcwd()
d = os.path.split(cwd)[0]

# Import tensorflow libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Raise the pandas display limits so frames are rendered without row/column truncation
pd.set_option("display.max_rows", 99999)
pd.set_option("display.max_columns", 99999)

In [2]:
# %%time

# NOTE: one-time conversion, kept commented out for reference. It reads the raw
# CSV ('all_train.csv') and caches it as "data.pkl", the file that is loaded
# with pd.read_pickle in the next cell.
# # Read in data and save it as a pickle file
# data = pd.read_csv('all_train.csv')
# data.to_pickle("data.pkl")

In [3]:
# Load the dataset from the cached pickle when it exists. On a fresh checkout the
# pickle has not been created yet, so fall back to the raw CSV and write the cache
# once; this keeps the notebook runnable from a clean kernel (Restart & Run All).
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling, so
# only load a "data.pkl" that was produced locally from the trusted CSV.
if os.path.exists("data.pkl"):
    df = pd.read_pickle("data.pkl")
else:
    df = pd.read_csv('all_train.csv')
    df.to_pickle("data.pkl")

## Exploratory Data Analysis

In [5]:
# (rows, columns) of the loaded frame
df.shape

(7000000, 29)

In [6]:
# Preview the first rows to see the column layout and value scales
df.head()

Unnamed: 0,# label,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,mass
0,1.0,-0.346368,0.416306,0.999236,0.475342,0.427493,-0.005984,1.989833,0.34453,1.566297,0.754261,2.269852,-1.300984,-1.19942,-1.161915,3.357043,0.460796,-0.404118,-0.81544,4.105282,0.267826,0.378718,1.743123,3.406367,4.350537,-0.352571,1.130032,2.227706,1000.0
1,1.0,1.708236,-0.319394,-1.241873,-0.887231,-0.871906,-0.005984,-0.001047,-1.038225,0.655748,0.754261,0.459217,-0.69564,0.359856,-1.161915,-0.502048,0.505149,-1.500441,1.226331,-1.178141,-0.877361,-1.483769,-0.573682,-1.693781,-0.545062,-0.299118,-0.662942,-0.193019,750.0
2,0.0,-0.360693,1.794174,0.264738,-0.472273,-0.292344,-1.054221,-1.150495,1.423404,1.270098,0.754261,-1.031239,1.912465,-1.321417,-1.161915,-0.406757,1.346497,-0.298099,1.226331,-1.199511,0.53902,-1.590629,-0.573682,-0.543636,-0.937456,-0.300344,-0.523262,-1.506304,750.0
3,1.0,-0.377914,-0.103932,-0.649434,-2.125015,-1.643797,-0.005984,1.011112,-1.04034,-0.541991,0.754261,1.142262,0.551081,1.016731,0.860649,0.454005,0.624624,1.439072,-0.81544,0.463763,-0.006583,1.089122,-0.573682,-0.276348,-0.409272,-0.349926,-0.307123,0.529698,1250.0
4,0.0,-0.067436,-0.636762,-0.620166,-0.062551,1.588715,-0.005984,-0.595304,-1.238987,0.336844,-1.325801,-0.181512,-1.329374,0.879087,0.860649,-0.143564,-1.293935,-1.2228,-0.81544,-0.552837,-1.418494,-0.562982,1.743123,0.881802,0.002516,1.56095,-0.15076,-1.023889,750.0


In [7]:
# Preview the last rows as a sanity check that the file was read to the end
df.tail()

Unnamed: 0,# label,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,mass
6999995,0.0,1.617264,-0.537084,-1.275867,0.650799,-1.511621,0.850488,0.596391,-0.054678,0.728849,0.754261,0.642273,-0.830321,-0.135583,-1.161915,-0.080789,-0.38723,0.395537,-0.81544,0.664328,-0.960709,-0.894011,1.743123,-0.426198,-0.324286,-0.432739,1.340297,0.267774,750.0
6999996,0.0,-0.511357,0.270927,0.085989,-0.243802,-1.035668,-0.005984,-0.127219,0.721426,1.404479,-1.325801,-0.116903,1.924123,-0.324101,0.860649,0.435727,-0.307681,0.520873,1.226331,-1.015462,1.367217,-1.053815,-0.573682,-1.907798,0.194661,-0.190621,0.027776,-0.316018,1250.0
6999997,1.0,0.062408,-0.987203,0.570667,1.517195,0.639548,-1.054221,1.115239,1.261928,-1.009308,0.754261,0.606688,-0.671159,0.910398,0.860649,1.012521,1.395015,-1.292659,-0.81544,0.790842,0.892545,-0.192816,-0.573682,0.973622,1.034964,-0.340661,-0.181193,1.877042,1500.0
6999998,1.0,1.659131,1.096223,0.562821,1.627193,0.767236,-1.054221,1.079999,0.155488,-1.412207,-1.325801,1.426934,0.010469,-0.912317,0.860649,1.287494,0.457677,0.222345,1.226331,-0.671423,-0.308908,-0.568336,-0.573682,1.043119,1.27035,0.217405,0.120213,1.07302,1500.0
6999999,1.0,0.002034,0.744152,-0.908839,-0.770454,1.008405,-1.054221,-0.370155,0.296837,-1.492524,0.754261,-0.060033,0.480335,0.600689,0.860649,-0.310234,1.44498,-0.900647,-0.81544,0.352893,0.671047,0.176512,-0.573682,0.314739,0.304983,0.425471,-0.612085,-0.925097,499.999969


In [8]:
# Column dtypes and overall memory footprint of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000000 entries, 0 to 6999999
Data columns (total 29 columns):
 #   Column   Dtype  
---  ------   -----  
 0   # label  float64
 1   f0       float64
 2   f1       float64
 3   f2       float64
 4   f3       float64
 5   f4       float64
 6   f5       float64
 7   f6       float64
 8   f7       float64
 9   f8       float64
 10  f9       float64
 11  f10      float64
 12  f11      float64
 13  f12      float64
 14  f13      float64
 15  f14      float64
 16  f15      float64
 17  f16      float64
 18  f17      float64
 19  f18      float64
 20  f19      float64
 21  f20      float64
 22  f21      float64
 23  f22      float64
 24  f23      float64
 25  f24      float64
 26  f25      float64
 27  f26      float64
 28  mass     float64
dtypes: float64(29)
memory usage: 1.5 GB


In [9]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,# label,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,mass
count,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0
mean,0.5001256,0.01612528,0.0004770022,2.686578e-05,0.01056081,-0.0001050026,0.002765919,0.01815953,2.510948e-05,0.000434587,-0.006869956,0.01754267,-0.0001610605,-0.0003289412,0.001738506,0.01724634,0.0004830252,-0.0005536178,0.004960189,0.01164789,-0.0001127097,7.686731e-05,0.0002909202,0.01228774,0.009778378,0.005269844,-0.001760961,0.01533136,1000.107
std,0.5,1.004417,0.9974864,1.00008,0.9956003,0.999867,1.000957,0.9867746,0.9965867,1.000007,1.001938,0.9941511,0.9984497,1.000078,0.9997368,0.9994654,0.9984289,0.9998608,1.001006,1.002725,1.000038,1.000033,1.00017,1.010477,1.005418,1.00999,0.9844511,0.9822799,353.4255
min,0.0,-1.960549,-2.365355,-1.732165,-9.980274,-1.732137,-1.054221,-3.034787,-2.757853,-1.732359,-1.325801,-2.835563,-2.602091,-1.732216,-1.161915,-2.454879,-2.437812,-1.732145,-0.8154401,-1.728284,-2.281867,-1.731758,-0.5736825,-3.631608,-4.729473,-20.62223,-3.452634,-2.632761,500.0
25%,0.0,-0.7288206,-0.7332548,-0.8656704,-0.6092291,-0.8658025,-1.054221,-0.7566092,-0.7014146,-0.8656543,-1.325801,-0.7237266,-0.7032926,-0.8665987,-1.161915,-0.6996179,-0.7070255,-0.866247,-0.8154401,-0.742363,-0.7206846,-0.8656855,-0.5736825,-0.5417942,-0.5115522,-0.354387,-0.6925097,-0.7943804,750.0
50%,1.0,-0.03930319,0.0008523957,0.0003199154,0.01963316,-0.0005070131,-0.005983562,-0.1499527,-0.0001067553,0.001384781,0.7542607,-0.1285732,-0.000575542,-0.001282098,0.8606486,-0.09749269,0.0002975658,-0.001376716,-0.8154401,-0.08992496,-6.735953e-05,-0.0004424527,-0.5736825,-0.160276,-0.3144032,-0.3265228,-0.3570301,-0.0882864,1000.0
75%,1.0,0.6900799,0.7347832,0.8659464,0.6798818,0.8657646,0.8504885,0.768669,0.7013194,0.8665976,0.7542607,0.6478635,0.7041004,0.8658323,0.8606486,0.6347052,0.7083709,0.8649424,1.226331,0.6423185,0.7204921,0.8659566,-0.5736825,0.4812194,0.1634892,-0.2337671,0.4753128,0.7610846,1250.0
max,1.0,4.378282,2.365287,1.73237,4.148023,1.731978,4.482618,3.720345,2.75859,1.73145,0.7542607,4.639335,2.602294,1.732007,0.8606486,5.535799,2.438369,1.732738,1.226331,5.866367,2.282217,1.73274,1.743123,7.29342,9.333287,14.99064,5.277313,4.44469,1500.0


In [10]:
# Count the missing entries in every column
df.isna().sum()

# label    0
f0         0
f1         0
f2         0
f3         0
f4         0
f5         0
f6         0
f7         0
f8         0
f9         0
f10        0
f11        0
f12        0
f13        0
f14        0
f15        0
f16        0
f17        0
f18        0
f19        0
f20        0
f21        0
f22        0
f23        0
f24        0
f25        0
f26        0
mass       0
dtype: int64

In [11]:
# Number of distinct values per column; very small counts flag discrete-valued columns
df.nunique()

# label          2
f0         1142885
f1         2311224
f2         1849439
f3         1359060
f4         1849442
f5              10
f6         1129730
f7         2383592
f8         1850813
f9               2
f10        1063674
f11        2358787
f12        1850048
f13              2
f14         979658
f15        2320116
f16        1851285
f17              2
f18         882518
f19        2263637
f20        1850152
f21              2
f22        1142839
f23         568614
f24         393987
f25         862612
f26         931270
mass             5
dtype: int64

In [12]:
# Class balance of the target column
df['# label'].value_counts()

1.0    3500879
0.0    3499121
Name: # label, dtype: int64

In [21]:
# Cast the target column from float to integer class labels
target = '# label'
df[target] = df[target].astype(int)

# Confirm the per-class counts after the cast
df[target].value_counts()

1    3500879
0    3499121
Name: # label, dtype: int64

## Set up Train/Test Data

In [20]:
# Target vector, and the feature matrix made of every column except the label
y = df['# label']
X = df.drop(columns = '# label')

# Hold out 10% of the rows as the test set. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.10,
    random_state = 123,
    shuffle = True,
    stratify = y,
)

# Min-max scale the features. The scaler's parameters are learned from the
# training rows only and then reused, unchanged, on the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Build Initial Neural Network

In [22]:
# Seed TensorFlow so the random weight initialisation is repeatable between runs
# (123 is the same seed value used for the train/test split).
tf.random.set_seed(123)

# Build a sequential dense network with TensorFlow/Keras:
#   input -> Dense(100, relu) -> Dense(100, relu) -> Dense(1, sigmoid)
# The input width is read from the scaled training matrix rather than hard-coded,
# so the network stays in sync with the feature set if columns are added/removed.
model = Sequential()
model.add(InputLayer(input_shape = (X_train.shape[1],)))
model.add(Dense(units = 100, activation = 'relu'))
model.add(Dense(units = 100, activation = 'relu'))
# Single sigmoid unit: one output in (0, 1) for the binary label
model.add(Dense(units = 1,   activation = 'sigmoid'))
# Report accuracy alongside the loss: accuracy is the quantity this case study
# asks to maximise, and without `metrics` fit() only logs the cross-entropy loss.
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

2022-07-05 20:19:19.066612: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
%%time

model.fit(x = X_train, 
          y = y_train, 
          batch_size = 100000,
          epochs = 1000,
          validation_data = (X_test, y_test), verbose = 1,
          callbacks = [EarlyStopping(monitor = 'val_loss', 
                                     mode = 'min', 
                                     verbose = 1, 
                                     patience = 15)]
          )