In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn import metrics

import keras
from keras.models import Sequential
from keras.layers import Dense

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


### 1. Read data

In [2]:
# Load the advertising dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('advertising-1.csv')
# Show the loaded frame as the cell output.
df

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.90,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.50,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0
...,...,...,...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,Fundamental modular algorithm,Duffystad,1,Lebanon,2016-02-11 21:49:00,1
996,51.30,45,67782.17,134.42,Grass-roots cohesive monitoring,New Darlene,1,Bosnia and Herzegovina,2016-04-22 02:07:01,1
997,51.63,51,42415.72,120.37,Expanded intangible solution,South Jessica,1,Mongolia,2016-02-01 17:24:57,1
998,55.55,19,41920.79,187.95,Proactive bandwidth-monitored policy,West Steven,0,Guatemala,2016-03-24 02:35:54,0


### 2. Data exploration

In [3]:
# Print column dtypes and non-null counts to spot missing data and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.2+ KB


In [4]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,65.0002,36.009,55000.00008,180.0001,0.481,0.5
std,15.853615,8.785562,13414.634022,43.902339,0.499889,0.50025
min,32.6,19.0,13996.5,104.78,0.0,0.0
25%,51.36,29.0,47031.8025,138.83,0.0,0.0
50%,68.215,35.0,57012.3,183.13,0.0,0.5
75%,78.5475,42.0,65470.635,218.7925,1.0,1.0
max,91.43,61.0,79484.8,269.96,1.0,1.0


In [5]:
df.isna().sum() # => no missing values found

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [6]:
# Frequency of each distinct ad topic line, to gauge the column's cardinality.
df['Ad Topic Line'].value_counts()

Open-source global strategy                     1
Reactive demand-driven strategy                 1
Realigned tangible collaboration                1
Reverse-engineered dynamic function             1
Ameliorated contextually-based collaboration    1
                                               ..
Triple-buffered scalable groupware              1
Cross-platform multimedia algorithm             1
Universal multi-state system engine             1
Sharable grid-enabled matrix                    1
Multi-tiered interactive neural-net             1
Name: Ad Topic Line, Length: 1000, dtype: int64

In [7]:
# Frequency of each distinct city, to gauge the column's cardinality.
df['City'].value_counts()

Lisamouth             3
Williamsport          3
West Amanda           2
Millerbury            2
Michelleside          2
                     ..
North Virginia        1
South Pamela          1
Port Brittanyville    1
South Davidmouth      1
Adamsbury             1
Name: City, Length: 969, dtype: int64

In [8]:
# Frequency of each distinct country, to gauge the column's cardinality.
df['Country'].value_counts()

France              9
Czech Republic      9
South Africa        8
Liberia             8
Peru                8
                   ..
Mozambique          1
Bermuda             1
Marshall Islands    1
Kiribati            1
Jordan              1
Name: Country, Length: 237, dtype: int64

### 3. Data transformation

In [9]:
# Create bag-of-words features from the ad topic line: one integer indicator
# column per distinct token (1 if the token occurs in that row's line, else 0).
#
# `Series.str.get_dummies` replaces the earlier split -> get_dummies ->
# groupby(axis=1) chain, which relied on the deprecated column-wise groupby and
# recovered token names by splitting the dummy column labels on '_' (so any token
# containing '_' would have been grouped under the wrong name).
#
# Whitespace is normalised first so tokenisation matches `str.split()` (any run
# of whitespace separates tokens).
# NOTE(review): a token repeated within one line is now encoded as 1 rather than
# as its count; the downstream steps only test `> 0` and min-max scale.
normalized_topics = df['Ad Topic Line'].str.split().str.join(' ')
desc_features = normalized_topics.str.get_dummies(sep=' ')
desc_features

Unnamed: 0,24/7,24hour,3rdgeneration,4thgeneration,5thgeneration,6thgeneration,Adaptive,Advanced,Ameliorated,Area,...,user,user-facing,utilization,value-added,web-enabled,website,well-modulated,workforce,zero,zero-defect
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Drop token columns that are present in at most one row: a token seen only once
# cannot generalise and only adds dimensionality.
# The row counts are computed for all columns in one vectorised pass instead of
# building a filtered copy of the whole frame per column and dropping in place.
rows_per_token = (desc_features > 0).sum()
desc_features = desc_features.loc[:, rows_per_token > 1]

desc_features

Unnamed: 0,24/7,24hour,3rdgeneration,4thgeneration,5thgeneration,6thgeneration,Adaptive,Advanced,Ameliorated,Area,...,user,user-facing,utilization,value-added,web-enabled,website,well-modulated,workforce,zero,zero-defect
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Spot check: list the rows whose topic line contains the token 'Adaptive'.
desc_features.loc[desc_features['Adaptive'].gt(0)]

Unnamed: 0,24/7,24hour,3rdgeneration,4thgeneration,5thgeneration,6thgeneration,Adaptive,Advanced,Ameliorated,Area,...,user,user-facing,utilization,value-added,web-enabled,website,well-modulated,workforce,zero,zero-defect
153,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
178,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
319,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
363,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
543,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
556,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Attach the token indicator columns to the main frame (aligned on the row index)
# and remove the raw text column they were derived from.
df = df.join(desc_features).drop(columns=['Ad Topic Line'])
df

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Male,Country,Timestamp,Clicked on Ad,24/7,...,user,user-facing,utilization,value-added,web-enabled,website,well-modulated,workforce,zero,zero-defect
0,68.95,35,61833.90,256.09,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0,0,...,0,0,0,0,0,0,0,0,0,0
1,80.23,31,68441.85,193.77,West Jodi,1,Nauru,2016-04-04 01:39:02,0,0,...,0,0,0,0,0,0,0,0,0,0
2,69.47,26,59785.94,236.50,Davidton,0,San Marino,2016-03-13 20:35:42,0,0,...,0,0,0,0,0,0,0,0,0,0
3,74.15,29,54806.18,245.89,West Terrifurt,1,Italy,2016-01-10 02:31:19,0,0,...,0,0,0,0,0,0,0,0,0,0
4,68.37,35,73889.99,225.58,South Manuel,0,Iceland,2016-06-03 03:36:18,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,Duffystad,1,Lebanon,2016-02-11 21:49:00,1,0,...,0,0,0,0,0,0,0,0,0,0
996,51.30,45,67782.17,134.42,New Darlene,1,Bosnia and Herzegovina,2016-04-22 02:07:01,1,0,...,0,0,0,0,0,0,0,0,0,0
997,51.63,51,42415.72,120.37,South Jessica,1,Mongolia,2016-02-01 17:24:57,1,0,...,0,0,0,0,0,0,0,0,0,0
998,55.55,19,41920.79,187.95,West Steven,0,Guatemala,2016-03-24 02:35:54,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
def encode_frequent_categories(values, prefix, min_rows=2):
    """One-hot encode a categorical column, keeping only sufficiently frequent categories.

    Args:
        values: pandas Series of category labels.
        prefix: string prepended (followed by '_') to each label to name its column.
        min_rows: minimum number of rows a category must occur in for its
            indicator column to be kept. The default of 2 drops categories that
            appear in at most one row.

    Returns:
        DataFrame of integer 0/1 indicator columns, indexed like ``values``.
    """
    # dtype=int keeps the indicators numeric on every pandas version: since
    # pandas 2.0 the default dummy dtype is bool, and boolean columns do not
    # support the subtraction used by the min-max scaling step later on.
    dummies = pd.get_dummies(values, prefix=prefix, dtype=int)
    # Each indicator is 0/1, so the column sum is the number of rows in the category.
    return dummies.loc[:, dummies.sum() >= min_rows]


# encoding for city attribute
encoded_columns_city = encode_frequent_categories(df['City'], prefix='city')
df = df.join(encoded_columns_city)

# encoding for country attribute
encoded_columns_country = encode_frequent_categories(df['Country'], prefix='country')
df = df.join(encoded_columns_country)

# drop the raw categorical columns now that they are encoded
df = df.drop(columns=['City', 'Country'])

df

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Timestamp,Clicked on Ad,24/7,24hour,3rdgeneration,...,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Vietnam,country_Wallis and Futuna,country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe
0,68.95,35,61833.90,256.09,0,2016-03-27 00:53:11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,80.23,31,68441.85,193.77,1,2016-04-04 01:39:02,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,69.47,26,59785.94,236.50,0,2016-03-13 20:35:42,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,74.15,29,54806.18,245.89,1,2016-01-10 02:31:19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,68.37,35,73889.99,225.58,0,2016-06-03 03:36:18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,1,2016-02-11 21:49:00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,51.30,45,67782.17,134.42,1,2016-04-22 02:07:01,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,51.63,51,42415.72,120.37,1,2016-02-01 17:24:57,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,55.55,19,41920.79,187.95,0,2016-03-24 02:35:54,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Transform timestamp
# Reference date used to turn each timestamp into an age in days.
# Pinned to the value recorded in this cell's output when the notebook was last
# executed, instead of calling datetime.today(): with a moving reference date the
# whole-day differences (and everything derived from them) change from run to run.
todaydate = datetime(2021, 4, 21, 12, 39, 43, 519245)
todaydate

datetime.datetime(2021, 4, 21, 12, 39, 43, 519245)

In [15]:
# Replace the raw timestamp with its age in whole days relative to `todaydate`.
event_times = pd.to_datetime(df['Timestamp'])
df = (
    df.assign(timestamp_delta=(todaydate - event_times).dt.days)
      .drop(columns=['Timestamp'])
)
df

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,24/7,24hour,3rdgeneration,4thgeneration,...,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Vietnam,country_Wallis and Futuna,country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe,timestamp_delta
0,68.95,35,61833.90,256.09,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1851
1,80.23,31,68441.85,193.77,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1843
2,69.47,26,59785.94,236.50,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1864
3,74.15,29,54806.18,245.89,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1928
4,68.37,35,73889.99,225.58,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1895
996,51.30,45,67782.17,134.42,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1825
997,51.63,51,42415.72,120.37,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1905
998,55.55,19,41920.79,187.95,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1854


In [16]:
# Scaling & Normalization
# Min-max scale every column (features and the 0/1 target alike) into [0, 1].
# NOTE(review): the min/max are computed on the full dataset, before the
# train/test split, so test-set statistics influence the training features.

# Work in float so integer and boolean indicator columns all support the arithmetic.
df = df.astype(float)
col_min = df.min()
col_range = df.max() - col_min
# A constant column has zero range; dividing by it would fill the column with NaN.
# Use a divisor of 1 there so the column simply becomes all zeros.
col_range = col_range.where(col_range != 0, 1.0)
df = (df - col_min) / col_range

df.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,24/7,24hour,3rdgeneration,4thgeneration,...,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Vietnam,country_Wallis and Futuna,country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe,timestamp_delta
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.550743,0.404976,0.626119,0.455383,0.481,0.5,0.011,0.008,0.008,0.008,...,0.002,0.006,0.007,0.003,0.004,0.007,0.003,0.004,0.006,0.510205
std,0.269482,0.20918,0.20484,0.265785,0.499889,0.50025,0.104355,0.089129,0.089129,0.089129,...,0.044699,0.077266,0.083414,0.054717,0.063151,0.083414,0.054717,0.063151,0.077266,0.287349
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.318885,0.238095,0.504446,0.206139,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.263415
50%,0.605388,0.380952,0.656847,0.474331,0.0,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.521951
75%,0.781022,0.547619,0.786005,0.690232,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.765854
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
df

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,24/7,24hour,3rdgeneration,4thgeneration,...,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Vietnam,country_Wallis and Futuna,country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe,timestamp_delta
0,0.617882,0.380952,0.730472,0.916031,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.580488
1,0.809621,0.285714,0.831375,0.538746,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541463
2,0.626721,0.166667,0.699200,0.797433,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.643902
3,0.706272,0.238095,0.623160,0.854280,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.956098
4,0.608023,0.380952,0.914568,0.731323,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.686215,0.261905,0.876310,0.628405,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.795122
996,0.317865,0.619048,0.821302,0.179441,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.453659
997,0.323474,0.761905,0.433959,0.094382,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.843902
998,0.390107,0.000000,0.426401,0.503511,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.595122


In [18]:
# Keep only the features whose absolute Pearson correlation with the target
# column ('Clicked on Ad') reaches the threshold.
# NOTE(review): correlations are computed on the full dataset, before the
# train/test split, so the selection has seen the test rows.
threshold  =  0.05
corr_matrix = df.corr().abs()
target_corr = corr_matrix['Clicked on Ad']
print(target_corr)

# Undefined (NaN) correlations -- e.g. from a constant column -- are treated as
# uncorrelated and dropped; a bare `< threshold` comparison is False for NaN and
# would silently keep such columns.
is_weak = target_corr.isna() | (target_corr < threshold)
# Never drop the target itself, whatever its correlation entry is.
weak_columns = target_corr.index[is_weak].drop('Clicked on Ad', errors='ignore')
df = df.drop(columns=weak_columns)  # Reduce dimensionality

print()
df.info()

Daily Time Spent on Site    0.748117
Age                         0.492531
Area Income                 0.476255
Daily Internet Usage        0.786539
Male                        0.038027
                              ...   
country_Western Sahara      0.011994
country_Yemen               0.018285
country_Zambia              0.031686
country_Zimbabwe            0.025898
timestamp_delta             0.014980
Name: Clicked on Ad, Length: 578, dtype: float64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 67 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   float64
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Clicked on Ad             1000 non-null   float64
 5   Adaptive                  1000 non-null   float

### 4. Train & test split

In [19]:
# Split the frame into the label vector (Y) and the feature matrix (X) as NumPy arrays.
target_column = 'Clicked on Ad'
Y = df[target_column].to_numpy()
X = df.drop(columns=[target_column]).to_numpy()

In [20]:
# Inspect the feature matrix.
X

array([[0.61788203, 0.38095238, 0.73047247, ..., 0.        , 0.        ,
        0.        ],
       [0.80962094, 0.28571429, 0.83137522, ..., 0.        , 0.        ,
        0.        ],
       [0.62672106, 0.16666667, 0.69920032, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.32347442, 0.76190476, 0.43395874, ..., 0.        , 0.        ,
        0.        ],
       [0.39010709, 0.        , 0.4264012 , ..., 0.        , 0.        ,
        0.        ],
       [0.2109468 , 0.16666667, 0.24247537, ..., 0.        , 0.        ,
        0.        ]])

In [21]:
# Inspect the label vector.
Y

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1.,
       0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1.,
       1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
       0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1.,
       0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1.,
       1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1.,
       0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0.,
       1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1.

In [22]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.3, random_state = 323)

### 5. Create NN classifier

In [23]:
# define dimensions
inp_dimension = len(df.columns) - 1            # every column except the target is an input
out_dimension = (inp_dimension + 1) // 2       # hidden width: half of (inputs + 1 output)

# initialize classifier: two ReLU hidden layers of equal width, then a single
# sigmoid unit since it is a binary classification
classifier = Sequential([
    Dense(out_dimension, activation='relu', input_dim=inp_dimension),  # first hidden layer
    Dense(out_dimension, activation='relu'),                           # second hidden layer
    Dense(1, activation='sigmoid'),                                    # output layer
])

In [24]:
# compiling ANN
# Adam optimizer with binary cross-entropy loss; accuracy is reported during training.
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

### 6. Train the model

In [25]:
# Train on the training split for 100 epochs in mini-batches of 10 samples.
classifier.fit(train_X, train_Y, batch_size = 10,epochs = 100)

Epoch 1/100
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmpypez040m.py, line 48)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmpypez040m.py, line 48)
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46

Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7fde414f5790>

### 7. Prediction

In [26]:
# Score the held-out rows, then turn the sigmoid outputs into hard 0.0/1.0 labels
# using a 0.5 decision threshold.
predicted_scores = classifier.predict(test_X)
predict_Y = np.where(predicted_scores > 0.5, 1.0, 0.0)
predict_Y

array([[0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],

### 8. Evaluation

In [27]:
# Confusion matrix on the test split (scikit-learn convention: rows are true labels, columns are predicted labels).
conf_matrix = confusion_matrix(test_Y, predict_Y)
conf_matrix

array([[146,  13],
       [  8, 133]])

In [28]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(test_Y, predict_Y))

              precision    recall  f1-score   support

         0.0       0.95      0.92      0.93       159
         1.0       0.91      0.94      0.93       141

    accuracy                           0.93       300
   macro avg       0.93      0.93      0.93       300
weighted avg       0.93      0.93      0.93       300

