## Baseline Models (No Sampling) - Fitting the Train Data using Cross Validation

In [193]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, recall_score, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score


In [194]:
# Synthetic, heavily imbalanced binary problem: class weights 0.99 / 0.01,
# no label flipping (flip_y=0), and a fixed seed so the data is identical on every run.
X, y = make_classification(
    n_samples=12500,
    n_features=10,
    n_classes=2,
    weights=[0.99, 0.01],
    flip_y=0,
    random_state=2020,
)

## Splitting the dataset into the Training set and Test set


In [195]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; stratifying on y keeps the class
# ratio the same on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2020,
)

In [196]:
# Class counts on each side of the hold-out split
train_0, train_1 = (int((y_train == label).sum()) for label in (0, 1))
test_0, test_1 = (int((y_test == label).sum()) for label in (0, 1))
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


>Train: 0=9900, 1=100, Test: 0=2475, 1=25


## Creating the Cross-Validation Object

In [197]:
from sklearn.model_selection import StratifiedKFold

# Five shuffled, stratified folds; the fixed seed makes the fold assignment repeatable.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

## Training the Logistic Regression model on the Training set

In [198]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with library defaults, fitted on the unsampled training split.
lm = LogisticRegression(random_state=2020)
lm.fit(X=X_train, y=y_train)

LogisticRegression(random_state=2020)

## Confusion Matrix

In [199]:
# Evaluate the logistic regression on the held-out test split.
# Rows of the matrix are the true classes, columns are the predicted classes.
y_pred = lm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[2474    1]
 [  13   12]]


In [200]:
# Cross-validated recall of the logistic regression on the training split,
# using the stratified folds defined by `kf`.
score = cross_val_score(
    estimator=lm,
    X=X_train,
    y=y_train,
    scoring='recall',
    cv=kf,
)
print('>Mean Recall: %.1f, std: %.1f' % (score.mean()*100, score.std()*100))


>Mean Recall: 49.0, std: 8.0


## Was the K-fold split stratified correctly?

In [201]:
# Print the class counts of every fold so the class ratio can be compared across folds.
for train_ix, test_ix in kf.split(X_train, y_train):
    # rows that belong to this fold's training and validation parts
    train_X, train_y = X_train[train_ix], y_train[train_ix]
    test_X, test_y = X_train[test_ix], y_train[test_ix]
    # per-class sizes of both parts
    train_0, train_1 = (int((train_y == label).sum()) for label in (0, 1))
    test_0, test_1 = (int((test_y == label).sum()) for label in (0, 1))
    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


>Train: 0=7920, 1=80, Test: 0=1980, 1=20
>Train: 0=7920, 1=80, Test: 0=1980, 1=20
>Train: 0=7920, 1=80, Test: 0=1980, 1=20
>Train: 0=7920, 1=80, Test: 0=1980, 1=20
>Train: 0=7920, 1=80, Test: 0=1980, 1=20


## Training the SVM model on the Training set


In [202]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier, fitted on the unsampled training split.
svm = SVC(kernel='linear', random_state=0)
svm.fit(X=X_train, y=y_train)

SVC(kernel='linear', random_state=0)

## Confusion Matrix

In [203]:
# Evaluate the SVM on the held-out test split.
# Rows of the matrix are the true classes, columns are the predicted classes.
y_pred = svm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[2474    1]
 [  16    9]]


In [204]:
# Cross-validated recall of the SVM on the training split,
# using the stratified folds defined by `kf`.
score = cross_val_score(
    estimator=svm,
    X=X_train,
    y=y_train,
    scoring='recall',
    cv=kf,
)
print('>Mean Recall: %.1f, std: %.1f' % (score.mean()*100, score.std()*100))

>Mean Recall: 43.0, std: 6.8


## Training the Random Forest model on the Training set


In [205]:
from sklearn.ensemble import RandomForestClassifier

# Small forest of 10 trees split on entropy, seeded for repeatability.
rf = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=2020,
)
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=2020)

## Confusion Matrix

In [206]:
# Evaluate the random forest on the held-out test split.
# Rows of the matrix are the true classes, columns are the predicted classes.
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[2474    1]
 [  12   13]]


In [207]:
# Cross-validated recall of the random forest on the training split,
# using the stratified folds defined by `kf`.
score = cross_val_score(
    estimator=rf,
    X=X_train,
    y=y_train,
    scoring='recall',
    cv=kf,
)
print('>Mean Recall: %.1f, std: %.1f' % (score.mean()*100, score.std()*100))

>Mean Recall: 63.0, std: 8.7


## Training the ANN model on the Training set

In [208]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.initializers import RandomUniform
import tensorflow as tf

# Same seed as the scikit-learn baselines, so the network is no longer the only
# unseeded model in the comparison.
ANN_SEED = 2020
# NOTE(review): seeding NumPy is expected to pin the batch shuffling in standalone
# Keras; a tf.keras-backed install would also need TensorFlow's global seed set - confirm.
np.random.seed(ANN_SEED)

# Initialising the ANN
ann = Sequential()

# Adding the input layer and the first hidden layer (10 input features -> 6 ReLU units).
# RandomUniform() is the initializer behind the 'uniform' alias; giving it a seed
# pins the starting weights. Each layer gets a distinct seed so the layers do not
# start from the same random stream.
ann.add(Dense(units = 6, kernel_initializer = RandomUniform(seed = ANN_SEED), activation = 'relu', input_dim = 10))

# Adding the second hidden layer
ann.add(Dense(units = 6, kernel_initializer = RandomUniform(seed = ANN_SEED + 1), activation = 'relu'))

# Adding the output layer: one sigmoid unit giving the probability of class 1
ann.add(Dense(units = 1, kernel_initializer = RandomUniform(seed = ANN_SEED + 2), activation = 'sigmoid'))

# Compiling the ANN
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
ann.fit(X_train, y_train, batch_size = 10, epochs = 10)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7ff029b76510>

## Confusion Matrix

In [209]:
# Predicting the Test set results.
# The network ends in a single sigmoid unit, so predict() yields one probability
# per row. Threshold it at 0.5, then flatten and cast so y_pred is a 1-D integer
# label vector like the y_pred produced for the scikit-learn models.
y_prob = ann.predict(X_test)
y_pred = (y_prob > 0.5).astype(int).ravel()
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[2473    2]
 [  12   13]]


In [210]:
# Applying k-Fold Cross Validation to the ANN
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense
from keras.initializers import RandomUniform

def build_classifier():
    """Build and compile a fresh, seeded copy of the baseline network.

    The architecture is 10 inputs -> 6 ReLU -> 6 ReLU -> 1 sigmoid, compiled with
    Adam and binary cross-entropy. KerasClassifier calls this once per fit, so
    every cross-validation fold starts from the same initial weights.

    Returns:
        The compiled, untrained Sequential model.
    """
    seed = 2020  # same seed as the scikit-learn baselines
    # Seeding happens here, not at cell level, so that it also takes effect inside
    # the worker processes started by n_jobs=-1.
    # NOTE(review): seeding NumPy is expected to pin the batch shuffling in standalone
    # Keras; a tf.keras-backed install would also need TensorFlow's global seed set - confirm.
    np.random.seed(seed)
    classifier = Sequential()
    # RandomUniform() is the initializer behind the 'uniform' alias; a distinct seed
    # per layer pins the starting weights without reusing one random stream.
    classifier.add(Dense(units = 6, kernel_initializer = RandomUniform(seed = seed), activation = 'relu', input_dim = 10))
    classifier.add(Dense(units = 6, kernel_initializer = RandomUniform(seed = seed + 1), activation = 'relu'))
    classifier.add(Dense(units = 1, kernel_initializer = RandomUniform(seed = seed + 2), activation = 'sigmoid'))
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return classifier

# scikit-learn compatible wrapper, so the network can be scored with cross_val_score
# under the same folds and metric as the other models.
classifier = KerasClassifier(build_fn = build_classifier, batch_size = 10, epochs = 10)

score = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv = kf, n_jobs = -1,scoring='recall')
print('>Mean Recall: %.1f, std: %.1f' % (score.mean()*100, score.std()*100))

>Mean Recall: 45.0, std: 11.4
