In [68]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, recall_score, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline, make_pipeline


In [69]:
# Synthetic, heavily imbalanced binary problem: 99% class 0 vs. 1% class 1,
# 10 features, no randomly flipped labels (flip_y=0), fixed seed.
X, y = make_classification(
    n_samples=12500,
    n_features=10,
    n_classes=2,
    weights=[0.99, 0.01],
    flip_y=0,
    random_state=2020,
)

## Splitting the dataset into the Training set and Test set


In [70]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; stratifying on y keeps the class
# ratio the same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2020,
)

In [71]:
# Per-class row counts in each split, to confirm the split kept the imbalance
train_0 = int(np.count_nonzero(y_train == 0))
train_1 = int(np.count_nonzero(y_train == 1))
test_0 = int(np.count_nonzero(y_test == 0))
test_1 = int(np.count_nonzero(y_test == 1))
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


>Train: 0=9900, 1=100, Test: 0=2475, 1=25


## Creating the Cross-Validation Object

In [72]:
from sklearn.model_selection import StratifiedKFold 
# Shuffled, stratified 5-fold splitter shared by every cross-validation run
# below; the fixed seed keeps the fold assignment identical between runs.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=2020,
)

## SMOTE + Tomek Link Combined Sampling

In [74]:
from collections import Counter
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

# Class distribution before resampling
print(Counter(y_train))

# Combined strategy: SMOTE over-samples the minority class, then Tomek links
# are cleaned from the majority class only. random_state pins the synthetic
# samples SMOTE generates, so this cell and every model / pipeline that reuses
# `sample` give the same numbers on each Restart & Run All.
sample = SMOTETomek(
    tomek=TomekLinks(sampling_strategy='majority'),
    random_state=2020,
)

# Resample the training data only; the test set is never resampled
X_sample, y_sample = sample.fit_resample(X_train, y_train)

# Class distribution after resampling
print(Counter(y_sample))

Counter({0: 9900, 1: 100})
Counter({0: 9900, 1: 9900})


## Training the Logistic Regression model on the Training set

In [75]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the resampled (balanced) training data
lm = LogisticRegression(random_state=2020)
lm.fit(X=X_sample, y=y_sample)

LogisticRegression(random_state=2020)

## Confusion Matrix

In [76]:
# Score the fitted model on the untouched test set
# (confusion matrix rows = true class, columns = predicted class)
y_pred = lm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[2305  170]
 [   3   22]]


In [77]:
# Warning: never resample the validation data. Putting the sampler inside the
# pipeline means it is applied only when the pipeline is fitted on the training
# folds; each validation fold keeps its original class balance.
pipeline = make_pipeline(sample, lm)

# Stratified k-fold cross-validation on the original (un-resampled) training set
score = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=kf, scoring='recall')
print('>Mean recall: %.1f, std: %.1f' % (100 * score.mean(), 100 * score.std()))

>Mean recall: 87.0, std: 7.5


## What happens if we sample BEFORE Cross-validation?
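Biased, over-optimistic results: the validation folds are then drawn from the already resampled data (balanced and partly synthetic), so the score no longer reflects performance on the original imbalanced distribution.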

In [78]:
# This is a mistake, shown on purpose: X_sample / y_sample were resampled
# before the folds are drawn, so the validation folds are balanced and include
# SMOTE-generated points instead of looking like the real, imbalanced data.
score = cross_val_score(estimator=lm, X=X_sample, y=y_sample, scoring='recall', cv=kf)
print('>Mean recall - WARNING: %.1f, std - WARNING: %.1f' % (100 * score.mean(), 100 * score.std()))




## Training the SVM model on the Training set


In [79]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier fitted on the resampled training data
svm = SVC(kernel='linear', random_state=0)
svm.fit(X=X_sample, y=y_sample)

SVC(kernel='linear', random_state=0)

## Confusion Matrix

In [80]:
# Score the fitted SVM on the untouched test set
# (confusion matrix rows = true class, columns = predicted class)
y_pred = svm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[2306  169]
 [   3   22]]


In [81]:
# Warning: never resample the validation data. Putting the sampler inside the
# pipeline means it is applied only when the pipeline is fitted on the training
# folds; each validation fold keeps its original class balance.
pipeline = make_pipeline(sample, svm)

# Stratified k-fold cross-validation on the original (un-resampled) training set
score = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=kf, scoring='recall')
print('>Mean recall: %.1f, std: %.1f' % (100 * score.mean(), 100 * score.std()))

>Mean recall: 89.0, std: 8.0


## Training the Random Forest model on the Training set


In [82]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (10 trees, entropy split criterion) fitted on the
# resampled training data; the seed fixes the forest's randomness.
rf = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=2020,
)
rf.fit(X=X_sample, y=y_sample)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=2020)

## Confusion Matrix

In [83]:
# Score the fitted random forest on the untouched test set
# (confusion matrix rows = true class, columns = predicted class)
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[2454   21]
 [   8   17]]


In [84]:
# Warning: never resample the validation data. Putting the sampler inside the
# pipeline means it is applied only when the pipeline is fitted on the training
# folds; each validation fold keeps its original class balance.
pipeline = make_pipeline(sample, rf)

# Stratified k-fold cross-validation on the original (un-resampled) training set
score = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=kf, scoring='recall')
print('>Mean recall: %.1f, std: %.1f' % (100 * score.mean(), 100 * score.std()))

>Mean recall: 82.0, std: 7.5


## Training the ANN model on the Training set

In [85]:
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

# Seed NumPy and TensorFlow before building the network so that weight
# initialisation and batch shuffling are as repeatable as the backend allows,
# matching the fixed seeds used for the other models in this notebook.
# (tf.random.set_seed is the TensorFlow 2.x API.)
np.random.seed(2020)
tf.random.set_seed(2020)

# Take the input width from the data rather than hard-coding the feature count
n_features = X_sample.shape[1]

# Initialising the ANN
ann = Sequential()

# Adding the input layer and the first hidden layer
ann.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = n_features))

# Adding the second hidden layer
ann.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer: one sigmoid unit for the binary target
ann.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the resampled training set
ann.fit(X_sample, y_sample, batch_size = 10, epochs = 10)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7ff5148e3a10>

## Confusion Matrix

In [86]:
# Part 3 - Making the predictions and evaluating the model
# The network's sigmoid outputs for the test set are thresholded at 0.5 to
# obtain boolean class predictions.
y_pred = ann.predict(X_test) > 0.5
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)


[[2382   93]
 [   8   17]]


In [87]:
#Applying Kfold CrossValidation
# Evaluating the ANN
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense

def build_classifier():
    """Build and compile a fresh, untrained binary-classification network.

    The architecture is two 6-unit ReLU hidden layers (the first expects 10
    input features) followed by a single sigmoid output unit, all with the
    'uniform' kernel initializer. The model is compiled with the Adam
    optimizer, binary cross-entropy loss and the accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=10),
        Dense(units=6, kernel_initializer='uniform', activation='relu'),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# scikit-learn compatible wrapper around the Keras model; build_classifier is
# used to create the network when the wrapper is fitted.
classifier = KerasClassifier(build_fn=build_classifier, epochs=10, batch_size=10)

# Warning: never resample the validation data. Keeping the sampler inside the
# pipeline means only the training folds are resampled during cross-validation.
pipeline = make_pipeline(sample, classifier)

score = cross_val_score(pipeline, X=X_train, y=y_train, scoring='recall', cv=kf, n_jobs=-1)
print('>Mean recall: %.1f, std: %.1f' % (100 * score.mean(), 100 * score.std()))

>Mean recall: 78.0, std: 5.1
