# Part2. Model design 3

Data Augmentation - SMOTE 
(Synthetic Minority Oversampling Technique) to overcome the data imbalance

In [1]:
from imblearn.over_sampling import SMOTE

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tempfile

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.metrics import SpecificityAtSensitivity
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn import metrics
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE

In [2]:
#load the original data
df1=pd.read_csv("../Data/kag_risk_factors_cervical_cancer.csv")

#load the processed datasets:
X_train=pd.read_csv("../Data/X_train_preprocessed.csv")
X_test=pd.read_csv("../Data/X_test_preprocessed.csv")
X_validate=pd.read_csv("../Data/X_validate_preprocessed.csv")
y_train=pd.read_csv("../Data/y_train_preprocessed.csv")
y_test=pd.read_csv("../Data/y_test_preprocessed.csv")
y_validate=pd.read_csv("../Data/y_validate_preprocessed.csv")

In [3]:
#convert df to np:
y_train_np=y_train.to_numpy()
X_train_np=X_train.to_numpy()

In [17]:
#using SMOTE create a balanced train dataset:
x_train_s, y_train_s = SMOTE(random_state=33).fit_resample(X_train_np, y_train_np.ravel())
print(sorted(Counter(y_train_s).items()))

# now I  have a training dataset perfectly balanced:

[(0, 615), (1, 615)]


In [18]:
print(x_train_s.shape, y_train_s.shape)


(1230, 31) (1230,)


Apply Decission Tree Classifier and Random Forest Classifier

In [5]:
l_final = [] #--> New list for storing metrics of base models

def models_dt(x,y,x_test,y_test):
    mod = {}
    model = DecisionTreeClassifier().fit(x,y)
    ypred = model.predict(x_test)
    mod['Model'] = 'Decision Tree After Sampling'
    mod['Train_Score'] = model.score(x_train_s,y_train_s)
    mod['Test_accuracy'] = metrics.accuracy_score(y_test,ypred)
    mod['f1score'] = metrics.f1_score(y_test,ypred)
    mod['recall'] = metrics.recall_score(y_test, ypred)
    mod['precision'] = metrics.precision_score(y_test, ypred)
    model.predict_proba(x_test)
    mod['roc_auc'] = metrics.roc_auc_score(y_test,ypred)
    return mod
l_final.append(models_dt(x_train_s,y_train_s,X_test,y_test))

def models_rf(x,y, x_test, y_test):
    mod = {}
    model = RandomForestClassifier().fit(x,y)
    ypred = model.predict(x_test)
    mod['Model'] = 'Random Forest After Sampling'
    mod['Train_Score'] = model.score(x_train_s,y_train_s)
    mod['Test_accuracy'] = metrics.accuracy_score(y_test,ypred)
    mod['f1score'] = metrics.f1_score(y_test,ypred)
    mod['recall'] = metrics.recall_score(y_test, ypred)
    mod['precision'] = metrics.precision_score(y_test, ypred)
    model.predict_proba(x_test)
    mod['roc_auc'] = metrics.roc_auc_score(y_test,ypred)
    return mod
l_final.append(models_rf(x_train_s,y_train_s, X_test, y_test))


In [6]:
final_model = pd.DataFrame(l_final)
final_model

Unnamed: 0,Model,Train_Score,Test_accuracy,f1score,recall,precision,roc_auc
0,Decision Tree After Sampling,1.0,0.948276,0.625,0.555556,0.714286,0.768432
1,Random Forest After Sampling,1.0,0.948276,0.666667,0.666667,0.666667,0.819315


Recall is 0.66 with Random Forest

### Use the neural network model from "cervix_project_2_Model_design_1"

In [19]:
model = Sequential() 

#Input layer
model.add(Dense(units=800, 
            input_dim=31, # i have 31 features
            kernel_initializer='uniform', # all features have the same weight
            activation='relu'
               ))
model.add(Dropout(0.5))  #randomly sets 0.5 units to 0. To prevent overfitting
#Hidden layer 1
model.add(Dense(units=800,  
                kernel_initializer='uniform', 
                activation='relu'))
model.add(Dropout(0.5))

#Output layer
model.add(Dense(units=1, #only one unit needed, it its either 1 or not 1
                kernel_initializer='uniform', 
                activation='sigmoid'))

#model(output_bias=initial_bias)

print(model.summary()) #for showing the structure and parameters

# Defining how to measure performance
model.compile(loss='binary_crossentropy',   
              optimizer='adam',
               metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),'accuracy'])


# Train the model
# Verbose=2, showing loss and accuracy change timely
#remove batch_size, the dataset is small. -> it increases recall.
train_history = model.fit(x=x_train_s, y=y_train_s,  
                          validation_split=0.2, epochs=30, verbose=2) 



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 800)               25600     
_________________________________________________________________
dropout_2 (Dropout)          (None, 800)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 800)               640800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 800)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 801       
Total params: 667,201
Trainable params: 667,201
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/30
31/31 - 1s - loss: 0.4135 - precision_1: 0.7958 - recall_1: 0.7182 - accuracy: 0.8252 - val_loss: 

In [20]:
scores = model.evaluate(X_test, y_test)
print('\n')






Recall is worse than with Random forest classifier (0.55 vs 0.66)

We could also use a fore strict feature selection to improve the recall, but porbably because of the small sample size and the imbalanced dataset these numbers cannot be improved much more