In [1]:
%%file  nn_module.py
import pandas as pd 
# Neural Network
import keras 
from keras.models import Sequential 
from keras.layers import Dense

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import random

# Seed Python's built-in `random` module only; no other RNG is seeded here.
random.seed(12)

titanic_url = "https://raw.githubusercontent.com/mcelikkaya/medium_articles4/main/titanic.csv"

# Read the Titanic CSV from the URL above and discard the 'Age' and 'Cabin'
# columns in a single chained expression.
df_orig = pd.read_csv(titanic_url).drop(columns=['Age', 'Cabin'])

# Label-encode the low-cardinality columns.
# Drop the unnecessary, high-cardinality columns.
def get_df_labelencode(df_original):
  """Build an encoded/scaled copy of ``df_original`` and return the fitted transformers.

  Steps performed on a new frame (``drop`` returns a copy, so the argument
  itself is not modified):
    * drop the 'Name', 'Ticket', 'SibSp' and 'Parch' columns,
    * label-encode 'Sex' and 'Embarked' with one ``LabelEncoder`` each,
    * min-max scale 'Fare' with a ``MinMaxScaler``.

  Args:
    df_original: DataFrame containing at least the columns listed above.

  Returns:
    A tuple ``(df, processors)`` where ``df`` is the transformed frame and
    ``processors`` maps each transformed column name to its fitted
    ``LabelEncoder`` / ``MinMaxScaler``.
  """
  encode_labels = ['Sex', 'Embarked']
  drop_labels = ['Name', 'Ticket', 'SibSp', 'Parch']
  minmax_labels = ['Fare']

  # Use the frame that was passed in; the previous version ignored the
  # argument and always read the module-level `df_orig`.
  df = df_original.drop(labels=drop_labels, axis=1)

  label_processors = {}
  for encode_label in encode_labels:
    le = LabelEncoder()
    df[encode_label] = le.fit_transform(df[encode_label].values)
    label_processors[encode_label] = le
  for minmax_label in minmax_labels:
    mm = MinMaxScaler()
    # MinMaxScaler expects a 2-D input, hence the reshape to a single column.
    df[minmax_label] = mm.fit_transform(df[minmax_label].values.reshape(-1, 1))
    label_processors[minmax_label] = mm
  return df, label_processors

def load_df():
  """Return the encoded/scaled version of the module-level ``df_orig``.

  The fitted transformers returned by ``get_df_labelencode`` are discarded.
  """
  encoded_df = get_df_labelencode(df_orig)[0]
  return encoded_df

# Get the train/test split of the prepared dataset.
def get_train_test():
  """Split the prepared dataset into train and test arrays.

  The target is the 'Survived' column; the features are every remaining
  column except 'PassengerId'. 20% of the rows go to the test set, using a
  fixed ``random_state`` of 42.

  Returns:
    ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.
  """
  frame = load_df()
  target = frame['Survived'].values
  features = frame.drop(['Survived', 'PassengerId'], axis=1).values
  splits = train_test_split(features, target, test_size=0.2, random_state=42)
  return tuple(splits)

# Get a model with a configurable structure.
# The network is built dynamically to support different depths and widths.
def get_model(inputdim,levels=[80,50,10]):
  """Build and compile a fully connected binary classifier.

  Args:
    inputdim: Number of input features (passed as ``input_dim`` of the first
      ``Dense`` layer).
    levels: Non-empty sequence of hidden-layer widths; one ReLU ``Dense``
      layer is created per entry, in order. The default list is only read,
      never mutated.

  Returns:
    A compiled ``Sequential`` model ending in a single sigmoid unit, using
    the 'adam' optimizer, 'binary_crossentropy' loss and 'accuracy' metric.

  Raises:
    ValueError: If ``levels`` is ``None`` or empty.
  """
  # Fail early with a clear message instead of an IndexError on levels[0].
  if levels is None or len(levels) == 0:
    raise ValueError("levels must contain at least one hidden layer width")

  model = Sequential()

  # Hidden layers: the first one also declares the input dimension.
  model.add(Dense(levels[0], activation='relu', input_dim=inputdim))
  for level in levels[1:]:
    model.add(Dense(level, activation='relu'))

  # Output layer: single sigmoid unit for binary classification.
  model.add(Dense(1, activation='sigmoid'))
  # Print the layer summary (side effect on stdout).
  model.summary()
  # Compile the network.
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

Overwriting nn_module.py


In [2]:
from nn_module import *

In [3]:
# Preview the first five rows of the loaded frame.
df_orig.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,S


In [4]:
# Count missing values per column.
df_orig.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [5]:

X_train, X_test, y_train, y_test = get_train_test()
# Number of feature columns; this is the input dimension of the network.
n_features = X_train.shape[1]
print(n_features)
model = get_model(n_features)
# Train the network; the History object is the cell's displayed value.
model.fit(X_train, y_train, batch_size=64, epochs=100)

4
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                400       
                                                                 
 dense_1 (Dense)             (None, 50)                4050      
                                                                 
 dense_2 (Dense)             (None, 10)                510       
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 4,971
Trainable params: 4,971
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/10

<keras.callbacks.History at 0x7f105fd09f70>

In [6]:
%%file  nn_tests.py

from nn_module import *
import pytest
import random
import pandas as pd

@pytest.fixture
def dummy_dataset():
    """Fixture returning ``(X_train, X_test, y_train, y_test)``.

    For simplicity the already-prepared split from ``get_train_test`` is
    reused instead of building a synthetic dataset.
    """
    return tuple(get_train_test())

@pytest.fixture
def dummy_nn_model(dummy_dataset):
    """Fixture returning a default-architecture model fitted on the training split.

    Training uses a batch size of 32 for 100 epochs.
    """
    train_x, _, train_y, _ = dummy_dataset
    n_features = train_x.shape[1]
    fitted = get_model(n_features)
    fitted.fit(train_x, train_y, batch_size=32, epochs=100)
    return fitted

import numpy as np 

# If our model is going to work on the whole dataset,
# it must fit a very small dataset almost perfectly: accuracy on a tiny subset should be very high.
def test_dt_overfit(dummy_dataset):
    """Check that the default model can (over)fit a tiny training subset.

    A fresh model is trained for 20 epochs on the first 10 training rows and
    then evaluated on those same rows. The test passes when at most one
    fifth of the rows are misclassified.
    """
    X_train, _, y_train, _ = dummy_dataset

    # Deterministic tiny subset: the first 10 training rows.
    X_small, y_small = X_train[0:10], y_train[0:10]

    overfit_model = get_model(X_small.shape[1])
    overfit_model.fit(X_small, y_small, epochs=20)

    # Threshold the sigmoid outputs at 0.5 and flatten to 1-D integer labels.
    pred = np.round(overfit_model.predict(X_small)).flatten().astype(int)
    labels = y_small.flatten().astype(int)

    # Number of rows where prediction and label disagree.
    error = int(np.sum(np.logical_xor(labels, pred)))
    is_below = error <= (len(labels) / 5)
    assert is_below, "Model should fit data perfectly "
    

# We should use our domain knowledge and test very simple relations.
# For this dataset we know the passenger class field is important, so we change it manually
# and check that we see the effect we expect.
def test_common_sense(dummy_dataset,dummy_nn_model):
  X_train, X_test, y_train, y_test = dummy_dataset
  p1 = dummy_nn_model.predict(X_train[1].reshape(1,-1) ).flatten()[0] 
  #copy ans change class 2nd
  X_train_copy = X_train[1].copy()
  X_train_copy[1] = 2.0
  p2 = dummy_nn_model.predict(X_train_copy.reshape(1,-1) ).flatten()[0] 
  #copy ans change class 3rd
  X_train_copy[1] = 3.0
  p3 = dummy_nn_model.predict(X_train_copy.reshape(1,-1) ).flatten()[0] 
  print( f"1st class {p1} 2nd class {p2} 3rd {p3} ")
  assert p1 > p2 , "1st class survive probability must be higher than 2nd class"
  assert p2 > p3 , "2nd class survive probability must be higher than 3rd class"


from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# After fine-tuning we decided on a model architecture.
# We still want to check, starting from a very simple architecture,
# that the number of neurons we chose makes sense against such dummy architectures.
def test_width_acc(dummy_dataset):
    """Check that training accuracy does not decrease as layers get wider.

    Three 3-hidden-layer architectures of increasing width are each trained
    for 40 epochs (batch size 64) and scored on the training data; the
    accuracies must already be in ascending order.
    """
    train_x, _, train_y, _ = dummy_dataset
    n_features = train_x.shape[1]

    def train_accuracy(layer_widths):
        # Train a fresh model and return its accuracy on the training rows.
        net = get_model(n_features, layer_widths)
        net.fit(train_x, train_y, batch_size=64, epochs=40, verbose=0)
        return accuracy_score(train_y, np.round(net.predict(train_x)))

    architectures = ([4, 3, 2], [20, 10, 5], [80, 50, 10])
    acc_list = [train_accuracy(widths) for widths in architectures]

    assert sorted(acc_list) == acc_list, 'Accuracy should increase as comlexity increases.'
    

# After fine-tuning we decided on a model architecture.
# We still want to check, starting from a very simple architecture,
# that the depth we chose makes sense against such dummy architectures.
def test_depth_acc(dummy_dataset):
    """Check that training accuracy does not decrease as the network gets deeper.

    Architectures with 2, 3 and 4 hidden layers are each trained for
    40 epochs (batch size 64) and scored on the training data; the
    accuracies must already be in ascending order.
    """
    train_x, _, train_y, _ = dummy_dataset
    n_features = train_x.shape[1]

    def train_accuracy(layer_widths):
        # Train a fresh model and return its accuracy on the training rows.
        net = get_model(n_features, layer_widths)
        net.fit(train_x, train_y, batch_size=64, epochs=40, verbose=0)
        return accuracy_score(train_y, np.round(net.predict(train_x)))

    architectures = ([4, 3], [20, 10, 5], [80, 50, 10, 5])
    acc_list = [train_accuracy(widths) for widths in architectures]

    assert sorted(acc_list) == acc_list, 'Accuracy should increase as comlexity increases.'


  
# Test that the model still performs as well as before.
def test_dt_evaluation(dummy_dataset,dummy_nn_model):
    """Regression check: the fitted model must exceed 0.78 accuracy on the test split.

    Predictions are thresholded by rounding the model output before being
    compared with ``y_test``.
    """
    _, X_test, _, y_test = dummy_dataset

    pred_test_binary = np.round(dummy_nn_model.predict(X_test))
    acc_test = accuracy_score(y_test, pred_test_binary)
    # The ROC-AUC value computed previously was never asserted on, so the
    # dead computation has been removed; only accuracy is checked.
    assert acc_test > 0.78, 'Accuracy on test should be > 0.78'
    

Overwriting nn_tests.py


In [7]:
!python -m pytest --verbose nn_tests.py

platform linux -- Python 3.8.10, pytest-3.6.4, py-1.11.0, pluggy-0.7.1 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content, inifile:
plugins: typeguard-2.7.1
collected 5 items                                                              [0m

nn_tests.py::test_dt_overfit [32mPASSED[0m[36m                                      [ 20%][0m
nn_tests.py::test_common_sense [32mPASSED[0m[36m                                    [ 40%][0m
nn_tests.py::test_width_acc [32mPASSED[0m[36m                                       [ 60%][0m
nn_tests.py::test_depth_acc [32mPASSED[0m[36m                                       [ 80%][0m
nn_tests.py::test_dt_evaluation [32mPASSED[0m[36m                                   [100%][0m

