In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import torch 
from torch import nn
from torch import optim 

pd.options.mode.chained_assignment = None



In [2]:
df = pd.read_csv("./bank-additional-full.csv", delimiter = ";")
df.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [4]:
# do not need to consider economic factors because most people are not aware lol 

df = df[df.columns[~df.columns.isin(['emp.var.rate', 'cons.price.idx','cons.conf.idx', 'euribor3m', 'nr.employed'])]]
df.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,no


In [5]:
df['default'].value_counts()

no         32588
unknown     8597
yes            3
Name: default, dtype: int64

In [6]:
df['housing'].value_counts()

yes        21576
no         18622
unknown      990
Name: housing, dtype: int64

In [7]:
df['loan'].value_counts()

no         33950
yes         6248
unknown      990
Name: loan, dtype: int64

In [8]:
# since default has barely any yes answers, we will make the simplifying assumption that most people do not default
# and will remove the column to avoid any issues 

df = df.drop(columns = ['default'])

# we will also drop the unknowns from the housing and loans columns as we have sufficient numbers without them 

df = df[(df['housing'] != 'unknown') & (df['loan'] != 'unknown')]

In [9]:
# in order to run a neural network, need to make all variables numeric. Will do so by turning categorical into dummy variables
# and turning binary y/n into 1/0 

# getting necessary dummy variables

df_dummy = pd.get_dummies(df, columns = ['job', 'marital', 'education', 'contact', 'month', 'day_of_week', 'poutcome'], prefix_sep = ': ')

# turning y/n variables into 1/0

df_dummy['housing'] = np.where(df['housing'].values == 'yes', 1, 0)
df_dummy['loan'] = np.where(df['loan'].values == 'yes', 1, 0)
df_dummy['y'] = np.where(df['y'].values == 'yes', 1, 0)

df_dummy.head(5)

Unnamed: 0,age,housing,loan,duration,campaign,pdays,previous,y,job: admin.,job: blue-collar,...,month: oct,month: sep,day_of_week: fri,day_of_week: mon,day_of_week: thu,day_of_week: tue,day_of_week: wed,poutcome: failure,poutcome: nonexistent,poutcome: success
0,56,0,0,261,1,999,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,57,0,0,149,1,999,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,37,1,0,226,1,999,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,40,0,0,151,1,999,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
4,56,0,1,307,1,999,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [51]:
# splitting data into training and test split using scikit-learn 

X = df_dummy[df_dummy.columns[~df_dummy.columns.isin(['y'])]]
y = df_dummy['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .70)

In [64]:
# converting DF into Tensor

X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

X_train_tensor = torch.from_numpy(X_train_np).to(torch.float32)
y_train_tensor = torch.from_numpy(y_train_np).reshape(-1, 1).to(torch.float32)

In [65]:
# now we are ready to build the model using PyTorch 

num_inputs = X_train_tensor.shape[1]

model = nn.Sequential(
    nn.Linear(num_inputs, 60),
    nn.ReLU(),
    nn.Linear(60, num_inputs),
    nn.ReLU(),
    nn.Linear(num_inputs, 1),
    nn.Sigmoid())

In [66]:
# loss calculation with binary cross entropy, optimizing using Adam package in Pytorch

loss_fn = nn.BCELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [67]:
# training the model using 100 epochs, each with batch size of 10 

n_epochs = 100
batch_size = 10
 
for epoch in range(n_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        Xbatch = X_train_tensor[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = y_train_tensor[i:i+batch_size]
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Finished epoch {epoch}, latest loss {loss}')

Finished epoch 0, latest loss 0.150199756026268
Finished epoch 1, latest loss 0.13368821144104004
Finished epoch 2, latest loss 0.11794660985469818
Finished epoch 3, latest loss 0.12237422168254852
Finished epoch 4, latest loss 0.11047623306512833
Finished epoch 5, latest loss 0.10534490644931793
Finished epoch 6, latest loss 0.09541436284780502
Finished epoch 7, latest loss 0.10601639747619629
Finished epoch 8, latest loss 0.09795502573251724
Finished epoch 9, latest loss 0.13862000405788422
Finished epoch 10, latest loss 0.10899769514799118
Finished epoch 11, latest loss 0.10173819214105606
Finished epoch 12, latest loss 0.09609313309192657
Finished epoch 13, latest loss 0.09202457219362259
Finished epoch 14, latest loss 0.09316756576299667
Finished epoch 15, latest loss 0.08885981142520905
Finished epoch 16, latest loss 0.09467796981334686
Finished epoch 17, latest loss 0.10972697287797928
Finished epoch 18, latest loss 0.09124083071947098
Finished epoch 19, latest loss 0.0876029506

In [68]:
# computing training accuracy 

with torch.no_grad():
    y_pred = model(X_train_tensor)

accuracy = (y_pred.round() == y_train_tensor).float().mean()
print(f"Accuracy {accuracy}")

Accuracy 0.9072784185409546


In [69]:
# Converting DF to Tensor 

X_test_np = X_test.to_numpy()
y_test_np = y_test.to_numpy()

X_test_tensor = torch.from_numpy(X_test_np).to(torch.float32)
y_test_tensor = torch.from_numpy(y_test_np).reshape(-1, 1).to(torch.float32)

In [70]:
# computing test accuracy 

with torch.no_grad():
    y_pred = model(X_test_tensor)

accuracy = (y_pred.round() == y_test_tensor).float().mean()
print(f"Accuracy {accuracy}")

Accuracy 0.909121036529541
