## Random Forest
Sample Strategy: SMOTE + Random undersampling

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, recall_score, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline, make_pipeline


In [2]:
X, y = make_classification(n_samples=12500, n_classes=2,n_features=10, weights=[0.99, 0.01], flip_y=0, random_state=2020)

## Splitting the dataset into the Training set and Test set


In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2020,stratify=y)

In [4]:
# Results from split
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


>Train: 0=9900, 1=100, Test: 0=2475, 1=25


## Creating CrossValidation Object

In [5]:
from sklearn.model_selection import StratifiedKFold 
kf = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)


## SMOTE Oversampling for the Minority Class + Random Undersampling for Majority C.

In [6]:
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# summarize class distribution
print(Counter(y_train))
# define sample strategy
oversample = SMOTE(sampling_strategy=0.1)
undersample = RandomUnderSampler(sampling_strategy=1)

# fit and apply the transform
X_sample, y_sample = oversample.fit_resample(X_train, y_train)
print(Counter(y_sample))

# fit and apply the transform
X_sample, y_sample = undersample.fit_resample(X_sample, y_sample)
print(Counter(y_sample))


Counter({0: 9900, 1: 100})
Counter({0: 9900, 1: 990})
Counter({0: 990, 1: 990})


## Pipeline: Oversample, then undersample, then model

In [10]:
from sklearn.ensemble import RandomForestClassifier
pipeline = make_pipeline(oversample, undersample, 
                              RandomForestClassifier(random_state=2020))

pipeline

Pipeline(steps=[('smote', SMOTE(sampling_strategy=0.1)),
                ('randomundersampler', RandomUnderSampler(sampling_strategy=1)),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=2020))])

In [11]:
cross_val_score(pipeline, X_train, y_train, scoring='recall', cv=kf)

array([0.8 , 0.95, 0.95, 0.95, 0.7 ])

## Setting up Grid Search

In [12]:
params = {
    'n_estimators': [10, 20, 50],
    'max_depth': [4, 6, 10, 12],
    'random_state': [2020]
}
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
grid = GridSearchCV(pipeline, param_grid=new_params, cv=kf, scoring='recall',
                        return_train_score=True)
grid.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2020, shuffle=True),
             estimator=Pipeline(steps=[('smote', SMOTE(sampling_strategy=0.1)),
                                       ('randomundersampler',
                                        RandomUnderSampler(sampling_strategy=1)),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=2020))]),
             param_grid={'randomforestclassifier__max_depth': [4, 6, 10, 12],
                         'randomforestclassifier__n_estimators': [10, 20, 50],
                         'randomforestclassifier__random_state': [2020]},
             return_train_score=True, scoring='recall')

In [13]:
grid.best_score_

0.9099999999999999

In [15]:
grid.best_params_

{'randomforestclassifier__max_depth': 6,
 'randomforestclassifier__n_estimators': 50,
 'randomforestclassifier__random_state': 2020}

## Recall - Test Set

In [18]:
y_pred = grid.best_estimator_.predict(X_test)
recall_score(y_test, y_pred)

0.92

## SVM
Sample Strategy: SMOTE + Tomek Links

## SMOTE + Tomek Link Combined Sampling

In [32]:
from collections import Counter
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
# summarize class distribution
print(Counter(y_train))
# define oversample strategy
sample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
# fit and apply the transform
X_sample, y_sample = sample.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(y_sample))

Counter({0: 9900, 1: 100})
Counter({0: 9900, 1: 9900})


## Pipeline: Oversample, then undersample, then model

In [34]:
from sklearn.svm import SVC
pipeline = make_pipeline(sample, 
                              SVC(random_state=0))

pipeline

Pipeline(steps=[('smotetomek',
                 SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))),
                ('svc', SVC(random_state=0))])

In [35]:
params = {
    'kernel': ['linear', 'poly', 'rbf','sigmoid'],
    'C': [50, 10, 1.0, 0.1],
    'random_state': [2020]
}
new_params = {'svc__' + key: params[key] for key in params}
grid = GridSearchCV(pipeline, param_grid=new_params, cv=kf, scoring='recall',
                        return_train_score=True)
grid.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2020, shuffle=True),
             estimator=Pipeline(steps=[('smotetomek',
                                        SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))),
                                       ('svc', SVC(random_state=0))]),
             param_grid={'svc__C': [50, 10, 1.0, 0.1],
                         'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                         'svc__random_state': [2020]},
             return_train_score=True, scoring='recall')

In [36]:
grid.best_score_

0.89

In [37]:
grid.best_params_

{'svc__C': 50, 'svc__kernel': 'linear', 'svc__random_state': 2020}

# Recall - Test Set

In [38]:
y_pred = grid.best_estimator_.predict(X_test)
recall_score(y_test, y_pred)

0.88