In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm


import random
import torch
from torch import nn

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [11]:
cd content/sample_data

/content/sample_data


In [12]:
data = pd.read_csv('preprocessed_data.csv')

In [13]:
data.head()

Unnamed: 0,Age,BusinessTravel,DistanceFromHome,Education,Gender,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Attrition
0,0.785714,0.5,0.178571,0.25,0.0,0.0,0.637546,0.111111,0.0,0.0,0.025,1.0,0.025,0.0,0.0,0.666667,1.0,0.333333,0.666667,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,0.309524,1.0,0.321429,0.0,0.0,0.0,0.167457,0.0,0.857143,0.333333,0.15,0.5,0.125,0.066667,0.235294,0.666667,0.333333,1.0,0.333333,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
2,0.333333,1.0,0.571429,0.75,1.0,0.75,0.964666,0.111111,0.285714,1.0,0.125,0.333333,0.125,0.0,0.176471,0.333333,0.333333,0.0,0.666667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3,0.47619,0.0,0.035714,1.0,1.0,0.5,0.385045,0.333333,0.0,1.0,0.325,0.833333,0.2,0.466667,0.294118,1.0,1.0,0.666667,0.333333,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,0.333333,0.5,0.321429,0.0,1.0,0.0,0.070195,0.444444,0.071429,0.666667,0.225,0.333333,0.15,0.0,0.235294,1.0,0.0,0.666667,0.666667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0


In [14]:
from torch.utils.data import DataLoader, Dataset

In [15]:
class DataSet(Dataset):
    def __init__(self, X, y=None, mode='train'):
        self.X = X
        self.y = y
        self.mode=mode
        
    def __len__(self,):
        return len(self.X)
    
    def __getitem__(self, idx):
        xi = torch.tensor(self.X[idx]).float()
        if self.mode!='test':
            yi = torch.tensor(self.y[idx]).long()
            return xi, yi
        else:
            return xi

In [16]:
X = data.drop('Attrition', 1).to_numpy()
y = data['Attrition'].to_numpy()

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from torchvision.transforms import ToTensor

In [20]:
trainset = DataSet(X_train, y_train, mode='train')
valset = DataSet(X_test, y_test, mode='val')

traingen = DataLoader(trainset, batch_size=32, shuffle=False)
valgen = DataLoader(valset, batch_size=32, shuffle=False)

In [21]:
from collections import OrderedDict

class SimpleNN_v1(nn.Module):
    def __init__(self,):
        super().__init__()
        self.block0=nn.Sequential(
                nn.Linear(41, 100),
                nn.ReLU())
        self.dropout=nn.Dropout(0.2)
        self.block1=nn.Sequential(
                nn.Linear(100, 50),
                nn.BatchNorm1d(50),
                nn.ReLU())
        self.fc=nn.Linear(50, 1)
        
    def forward(self, x):
        x = self.block0(x)
        x = self.dropout(x)
        x = self.block1(x)
        return self.fc(x)

In [30]:
class SimpleNN_v2(nn.Module):
    def __init__(self,):
        super().__init__()
        self.block0=nn.Sequential(
                nn.Linear(41, 100),
                nn.ReLU())
        self.dropout=nn.Dropout(0.2)
        self.block1=nn.Sequential(
                nn.Linear(100, 50),
                nn.BatchNorm1d(50),
                nn.ReLU())
        self.block2=nn.Sequential(
                nn.Linear(50, 50),
                nn.BatchNorm1d(50),
                nn.ReLU())
        self.fc=nn.Linear(50, 1)
        
    def forward(self, x):
        x = self.block0(x)
        x = self.dropout(x)
        x = self.block1(x)
        x = self.block2(x)
        return self.fc(x)

In [22]:
def train(model, criterion, optimizer, scheduler, metric, tgen, vgen, epochs=10, device='cuda:0'):
    cache={'loss':[], 'acc': [], 'v_loss':[], 'v_acc':[], 'epoch':[]}
    for epoch in range(1, epochs+1):
        log = f'::: Epoch {epoch}/{epochs} :::'
        model.train()
        r_loss, r_acc = 0, 0
        for x, y in tgen:
            x, y = x.to(device), y.to(device)
            y_ = model(x)     
            loss = criterion(y_.view(-1), y.float())
            acc = metric(y_.view(-1), y)
            loss.backward()
            
            # Weights update
            optimizer.step()
            optimizer.zero_grad()
            # Weights update
            r_loss += loss.item()
            r_acc += acc.item()

        e_loss, e_acc = r_loss/len(tgen), r_acc/len(tgen)
        cache['loss'].append(e_loss)
        cache['acc'].append(e_acc)
        cache['epoch'].append(epoch)
        template = f' Train ::: Loss: {e_loss:.4f} ::: F1 score: {e_acc:.4f}'
        print(log + template)
        model.eval()
        r_loss, r_acc = 0, 0
        for x, y in vgen:
            x, y = x.to(device), y.to(device)
            y_ = model(x)
            loss, acc = criterion(y_.view(-1), y.float()), metric(y_.view(-1), y)

            r_loss += loss.item()
            r_acc += acc.item()

        e_loss, e_acc = r_loss/len(vgen), r_acc/len(vgen)
        cache['v_loss'].append(e_loss)
        cache['v_acc'].append(e_acc)
        template = f' Validation ::: Loss: {e_loss:.4f} ::: F1 score: {e_acc:.4f}'
        print(log + template)
    return cache, model

In [23]:
def f1_score(pred, target, epsilon=1e-8):
    from sklearn.metrics import f1_score
    pred = (pred > 0).long().cpu().numpy()
    target = target.cpu().numpy()
    return f1_score(pred, target)

In [24]:
set_seed()
model = SimpleNN_v1().to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler=torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.9)

In [25]:
%%time
history, model = train(model, criterion, optimizer, scheduler, f1_score, traingen, valgen, epochs=30)

::: Epoch 1/30 ::: Train ::: Loss: 0.4782 ::: F1 score: 0.1514
::: Epoch 1/30 ::: Validation ::: Loss: 0.4231 ::: F1 score: 0.0854
::: Epoch 2/30 ::: Train ::: Loss: 0.3771 ::: F1 score: 0.1133
::: Epoch 2/30 ::: Validation ::: Loss: 0.3834 ::: F1 score: 0.2210
::: Epoch 3/30 ::: Train ::: Loss: 0.3463 ::: F1 score: 0.2508
::: Epoch 3/30 ::: Validation ::: Loss: 0.3620 ::: F1 score: 0.3475
::: Epoch 4/30 ::: Train ::: Loss: 0.3219 ::: F1 score: 0.3655
::: Epoch 4/30 ::: Validation ::: Loss: 0.3303 ::: F1 score: 0.4276
::: Epoch 5/30 ::: Train ::: Loss: 0.2889 ::: F1 score: 0.4672
::: Epoch 5/30 ::: Validation ::: Loss: 0.3052 ::: F1 score: 0.4601
::: Epoch 6/30 ::: Train ::: Loss: 0.2648 ::: F1 score: 0.5249
::: Epoch 6/30 ::: Validation ::: Loss: 0.2750 ::: F1 score: 0.5445
::: Epoch 7/30 ::: Train ::: Loss: 0.2379 ::: F1 score: 0.6443
::: Epoch 7/30 ::: Validation ::: Loss: 0.2631 ::: F1 score: 0.6030
::: Epoch 8/30 ::: Train ::: Loss: 0.2101 ::: F1 score: 0.6975
::: Epoch 8/30 ::: V

In [37]:
#torch.save(model, 'fcnn_v1.h5')

In [26]:
def predict(model, testgen, device='cuda:0'):
    model.eval()
    pred = []
    true = []
    for x, y in testgen:
        x, y = x.to(device), y.to(device)
        y_ = model(x)
        true += list(torch.flatten(y.data).cpu().numpy())
        pred += list(torch.flatten(y_.data).cpu().numpy())
    return torch.tensor(pred), torch.tensor(true)

In [27]:
%%time
pred, true = predict(model, valgen)

CPU times: user 36.3 ms, sys: 1.02 ms, total: 37.3 ms
Wall time: 41.8 ms


In [28]:
f1_score(pred, true)

0.9608540925266905

In [29]:
sum(p.numel() for p in model.parameters())

9401

In [31]:
set_seed()
model_v2 = SimpleNN_v2().to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model_v2.parameters(), lr=0.001)
scheduler=torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.9)

In [32]:
%%time
history_v2, model_v2 = train(model_v2, criterion, optimizer, scheduler, f1_score, traingen, valgen, epochs=30)

::: Epoch 1/30 ::: Train ::: Loss: 0.4517 ::: F1 score: 0.0882
::: Epoch 1/30 ::: Validation ::: Loss: 0.3947 ::: F1 score: 0.1933
::: Epoch 2/30 ::: Train ::: Loss: 0.3730 ::: F1 score: 0.2273
::: Epoch 2/30 ::: Validation ::: Loss: 0.3644 ::: F1 score: 0.3322
::: Epoch 3/30 ::: Train ::: Loss: 0.3410 ::: F1 score: 0.3644
::: Epoch 3/30 ::: Validation ::: Loss: 0.3359 ::: F1 score: 0.4166
::: Epoch 4/30 ::: Train ::: Loss: 0.3067 ::: F1 score: 0.4669
::: Epoch 4/30 ::: Validation ::: Loss: 0.3059 ::: F1 score: 0.4641
::: Epoch 5/30 ::: Train ::: Loss: 0.2730 ::: F1 score: 0.5559
::: Epoch 5/30 ::: Validation ::: Loss: 0.2671 ::: F1 score: 0.5023
::: Epoch 6/30 ::: Train ::: Loss: 0.2408 ::: F1 score: 0.6406
::: Epoch 6/30 ::: Validation ::: Loss: 0.2338 ::: F1 score: 0.5815
::: Epoch 7/30 ::: Train ::: Loss: 0.2093 ::: F1 score: 0.6984
::: Epoch 7/30 ::: Validation ::: Loss: 0.2075 ::: F1 score: 0.6713
::: Epoch 8/30 ::: Train ::: Loss: 0.1821 ::: F1 score: 0.7425
::: Epoch 8/30 ::: V

In [33]:
%%time
pred, true = predict(model_v2, valgen)

CPU times: user 42.5 ms, sys: 917 µs, total: 43.4 ms
Wall time: 46 ms


In [34]:
f1_score(pred, true)

0.9820788530465949

In [35]:
#torch.save(model_v2, 'fcnn_v2.h5')

In [36]:
sum(p.numel() for p in model_v2.parameters())

12051

| model | number of parameters | epochs | lr | f1_score |
| -------|---------|-----------|-------|-------------|
| `SimpleNN_v1` | 9401 | 30 | 0.001 | 0.961 |
| `SimpleNN_v2` | 12051 | 30 | 0.001 | 0.982 |


Computer parameters

| PU | name | number of cores |
|----|------|--------|
| CPU | Intel(R) Xeon(R) CPU @ 2.30GHz | 2 |
| GPU | Tesla T4 | 2560 (15079MiB) |