In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_train_oversampled = pd.read_csv('X_train_oversampled.csv')
y_train_oversampled = pd.read_csv('y_train_oversampled.csv')
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

# Random Forest

## 1) Standarization

In [2]:
sc = StandardScaler()
train_features = sc.fit_transform(X_train_oversampled)
train_labels = y_train_oversampled
test_features = sc.transform(X_test)
test_labels = y_test

## 2) HiperParameters Otimization

In [3]:
rf = RandomForestClassifier(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [4]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [5]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf,scoring='recall', param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

print("*******************************************")
print("RESULTS")
print("*******************************************")
print("Best Parameters:")
print(rf_random.best_params_)
print("\n*******************************************")
print("Best Estimator:")
print(rf_random.best_estimator_)
print("\n*******************************************")
print("Best Score:")
print(rf_random.best_score_)
print("\n*******************************************")


Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)


*******************************************
RESULTS
*******************************************
Best Parameters:
{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}

*******************************************
Best Estimator:
RandomForestClassifier(max_depth=30, max_features='sqrt', min_samples_split=5,
                       n_estimators=400)

*******************************************
Best Score:
1.0

*******************************************


## 3) Results

In [6]:
best_random = rf_random.best_estimator_
y_pred = best_random.predict(test_features)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

[[1679   16]
 [  90    4]]
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1695
           1       0.20      0.04      0.07        94

    accuracy                           0.94      1789
   macro avg       0.57      0.52      0.52      1789
weighted avg       0.91      0.94      0.92      1789

0.9407490217998882
0.07017543859649122
0.5165568317328815


# Balanced Random Forest

## 1) Standarization

In [7]:
sc = StandardScaler()
train_features = sc.fit_transform(X_train)
train_labels = y_train
test_features = sc.transform(X_test)
test_labels = y_test

## 2) HiperParameters Otimization

In [8]:
rf = BalancedRandomForestClassifier(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'replacement': False,
 'sampling_strategy': 'auto',
 'verbose': 0,
 'warm_start': False}


In [9]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 3000, num = 1000)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200,
                  202,
                  205,
                  208,
                  211,
                  214,
                  216,
                  219,
                  222,
                  225,
                  228,
                  230,
                  233,
                  236,
                  239,
                  242,
                  244,
                  247,
                  250,
                  253,
                  256,
                  258,
                  261,
                  264,
                  267,
                  270,
                  272,
                  275,
                  278,
                  281,
                  284,
                  286,
                  289,
                  292,
                  29

In [10]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = BalancedRandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf,scoring='recall', param_distributions = random_grid, n_iter = 300, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

print("*******************************************")
print("RESULTS")
print("*******************************************")
print("Best Parameters:")
print(rf_random.best_params_)
print("\n*******************************************")
print("Best Estimator:")
print(rf_random.best_estimator_)
print("\n*******************************************")
print("Best Score:")
print(rf_random.best_score_)
print("\n*******************************************")


Fitting 3 folds for each of 300 candidates, totalling 900 fits


  self.best_estimator_.fit(X, y, **fit_params)


*******************************************
RESULTS
*******************************************
Best Parameters:
{'n_estimators': 2865, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': False}

*******************************************
Best Estimator:
BalancedRandomForestClassifier(bootstrap=False, max_depth=30,
                               min_samples_leaf=2, min_samples_split=5,
                               n_estimators=2865)

*******************************************
Best Score:
0.8969331322272498

*******************************************


## 3) Results

In [11]:
best_random = rf_random.best_estimator_
y_pred = best_random.predict(test_features)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

[[1127  568]
 [  12   82]]
              precision    recall  f1-score   support

           0       0.99      0.66      0.80      1695
           1       0.13      0.87      0.22        94

    accuracy                           0.68      1789
   macro avg       0.56      0.77      0.51      1789
weighted avg       0.94      0.68      0.77      1789

0.6757965343767468
0.2204301075268817
0.7686185903470784


# KNN

## 1) Standarization

In [12]:
sc = StandardScaler()
train_features = sc.fit_transform(X_train_oversampled)
train_labels = y_train_oversampled
test_features = sc.transform(X_test)
test_labels = y_test

## 2) HiperParameters Otimization

In [13]:
knn = KNeighborsClassifier()
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(knn.get_params())

Parameters currently in use:

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}


In [14]:
# Number Neighbors
n_neighbors = [int(x) for x in np.linspace(start = 2, stop = 250, num = 249)]

random_grid = {'n_neighbors': n_neighbors}
pprint(random_grid)

{'n_neighbors': [2,
                 3,
                 4,
                 5,
                 6,
                 7,
                 8,
                 9,
                 10,
                 11,
                 12,
                 13,
                 14,
                 15,
                 16,
                 17,
                 18,
                 19,
                 20,
                 21,
                 22,
                 23,
                 24,
                 25,
                 26,
                 27,
                 28,
                 29,
                 30,
                 31,
                 32,
                 33,
                 34,
                 35,
                 36,
                 37,
                 38,
                 39,
                 40,
                 41,
                 42,
                 43,
                 44,
                 45,
                 46,
                 47,
                 48,
                 49,


In [15]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
knn = KNeighborsClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
knn_random = RandomizedSearchCV(estimator = knn,scoring='accuracy', param_distributions = random_grid, n_iter = 249, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
knn_random.fit(train_features, train_labels)

print("*******************************************")
print("RESULTS")
print("*******************************************")
print("Best Parameters:")
print(knn_random.best_params_)
print("\n*******************************************")
print("Best Estimator:")
print(knn_random.best_estimator_)
print("\n*******************************************")
print("Best Score:")
print(knn_random.best_score_)
print("\n*******************************************")


Fitting 3 folds for each of 249 candidates, totalling 747 fits
*******************************************
RESULTS
*******************************************
Best Parameters:
{'n_neighbors': 2}

*******************************************
Best Estimator:
KNeighborsClassifier(n_neighbors=2)

*******************************************
Best Score:
0.9696776757270088

*******************************************


  return self._fit(X, y)


## 3) Results

In [16]:
best_random = knn_random.best_estimator_
y_pred = best_random.predict(test_features)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

[[1623   72]
 [  84   10]]
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1695
           1       0.12      0.11      0.11        94

    accuracy                           0.91      1789
   macro avg       0.54      0.53      0.53      1789
weighted avg       0.91      0.91      0.91      1789

0.912800447177194
0.11363636363636365
0.5319525513086047
