In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from os import listdir
plt.rcParams['figure.figsize'] = (10, 3)  # default size applied to every plot drawn in this notebook

In [2]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
# Read X and y
# X = ...  # feature table; later cells use X.columns, X.shape and X.iloc, so it has to be a pandas DataFrame
# y = ...  # target labels; later cells use y.iloc, so it has to be a pandas object with one row per row of X
# NOTE: every cell below raises NameError until these two assignments are filled in.

In [None]:
# Scale and SMOTE
# Standardises every feature and then oversamples the minority class; the
# hyper-parameter searches in the following cells run on the resulting X / y.
#
# NOTE(review): this cell overwrites X and y, so it is not idempotent (running it
# twice rescales and resamples already-processed data). Scaling and oversampling
# before cross-validation also puts synthetic neighbours of validation rows into
# the training folds, which makes the grid-search scores optimistic. Wrapping
# scaler + SMOTE + model in an imblearn Pipeline would remove that leakage.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Sampling
# `sampling_strategy` replaces the `ratio` keyword, which current imbalanced-learn
# releases no longer accept; this also matches the evaluation cell further down.
sampler = SMOTE(sampling_strategy='minority', random_state=42)
X_sampled, y_sampled = sampler.fit_resample(X, y)

# Depending on the imbalanced-learn version the sampler may hand back NumPy arrays.
# Re-wrap so column names survive (XGBoost complains without them) and so that
# `y.iloc[...]` keeps working in the cross-validation cell.
X = pd.DataFrame(X_sampled, columns=X.columns)
y = y_sampled if hasattr(y_sampled, 'iloc') else pd.Series(y_sampled)

In [None]:
%%time
# Random Forest: exhaustive grid search, 5-fold CV, scored by F1.
n_estimators = [100, 250]
# For a classifier 'auto' means sqrt(n_features), so listing both 'auto' and 'sqrt'
# fitted every configuration twice; 'auto' is also rejected by recent scikit-learn
# releases. A single 'sqrt' entry keeps the search space identical at half the cost.
max_features = ['sqrt']
max_depth = [3, 5, 9, None]  # None lets the trees grow until the other limits stop them
min_samples_split = [2, 3, 5]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
criterion = ['gini', 'entropy']
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap,
              'criterion': criterion}

rf = RandomForestClassifier()
rf_grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=3, verbose=1, scoring="f1")
rf_grid_search.fit(X, y)
print(rf_grid_search.best_params_)
print("Score: ", rf_grid_search.best_score_)

In [None]:
# XGBoost: exhaustive grid search, 5-fold CV, scored by F1, 3 parallel jobs.
grid_xgb = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2],
    subsample=[0.7, 1.0],
    colsample_bytree=[0.7, 1.0],
    max_depth=[3, 5, 9],
)

xgb = XGBClassifier()
xgb_grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=grid_xgb,
    scoring="f1",
    cv=5,
    n_jobs=3,
    verbose=1,
)
xgb_grid_search.fit(X, y)

# Best mean cross-validated F1 first, then the parameter combination that produced it.
print("Score: ", xgb_grid_search.best_score_)
print(xgb_grid_search.best_params_)

In [None]:
# LightGBM: exhaustive grid search, 5-fold CV, scored by F1; the best model is refit on all of X.
lgbm = lgb.LGBMClassifier(silent=False)
param_grid = dict(
    max_depth=[3, 5, 9],
    learning_rate=[0.008, 0.01, 0.012],
    num_leaves=[80, 120],
    n_estimators=[200, 250],
)
lgbm_grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    refit=True,
    n_jobs=3,
    verbose=1,
)
lgbm_grid_search.fit(X, y)

print(lgbm_grid_search.best_params_)
print("Score: ", lgbm_grid_search.best_score_)

In [None]:
# MLP: exhaustive grid search, 5-fold CV, scored by F1; the best model is refit on all of X.
# Hidden-layer widths are derived from the number of input features.
n_features = X.shape[1]
wide_layer = int(n_features * 1.5)

mlp = MLPClassifier()
param_grid = {
    # NOTE(review): scikit-learn documents `learning_rate` as only used with
    # solver='sgd'; confirm this axis has any effect under the default solver.
    'learning_rate': ["constant", "invscaling", "adaptive"],
    'hidden_layer_sizes': [
        (n_features, n_features),
        (n_features * 2, n_features, n_features // 2),
        (wide_layer, n_features * 2, wide_layer),
    ],
    'alpha': 10.0 ** -np.arange(1, 7),  # 1e-1 down to 1e-6
    'activation': ["relu"],
}
mlp_grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    refit=True,
    n_jobs=3,
    verbose=1,
)
mlp_grid_search.fit(X, y)

print(mlp_grid_search.best_params_)
print("Score: ", mlp_grid_search.best_score_)

In [3]:
# Base estimators for the ensemble. Hyper-parameters are not hard-coded here; they
# are taken from the grid searches below, so those search cells must have been run.
clf_rf = RandomForestClassifier(n_jobs=-1)
# `probability=True` is an SVC option, not an XGBoost parameter; predict_proba is
# available without it, so the stray keyword has been dropped.
clf_xgb = XGBClassifier(n_jobs=-1)
clf_lgbm = lgb.LGBMClassifier(n_jobs=-1, silent=True)
clf_mlp = MLPClassifier()

# Apply the best parameter combination found by each grid search.
clf_rf.set_params(**rf_grid_search.best_params_)
clf_xgb.set_params(**xgb_grid_search.best_params_)
clf_lgbm.set_params(**lgbm_grid_search.best_params_)
clf_mlp.set_params(**mlp_grid_search.best_params_)

# `classifiers` holds the estimator objects, `clf_list` their class names in the same order.
classifiers = [clf_rf, clf_xgb, clf_lgbm, clf_mlp]
clf_list = [clf.__class__.__name__ for clf in classifiers]

In [40]:
%%time
# Cross-validated evaluation of the four base models and of their soft-voting
# ensemble (mean of the positive-class probabilities, thresholded at 0.5).
#
# Results left in the namespace:
#   metric_results   - fold-averaged metrics, one row per model plus 'EnsembledClassifiers'
#   final_probs      - out-of-fold positive-class probabilities per model plus ground truth 'GT'
#   stacked_results  - out-of-fold hard predictions per model
#   conf_rf / conf_xgb / conf_lgbm / conf_mlp / conf_eclf - confusion matrices summed over folds
#
# NOTE(review): if the earlier "Scale and SMOTE" cell was executed, X / y already
# contain synthetic rows and the test folds here are not purely real data.
setparams = True  # True: overwrite the defaults below with the grid-search winners
N_SPLITS = 5
RANDOM_STATE = 42  # shared seed so the models and SMOTE give repeatable results

n_features = X.shape[1]
wide_layer = int(n_features * 1.5)

clf_rf = RandomForestClassifier(n_estimators=250, n_jobs=-1, random_state=RANDOM_STATE)
clf_xgb = XGBClassifier(n_jobs=-1, random_state=RANDOM_STATE)
# `probability=True` is an SVC option and means nothing to LightGBM, so it is dropped.
clf_lgbm = lgb.LGBMClassifier(n_estimators=250, silent=True, random_state=RANDOM_STATE)
clf_mlp = MLPClassifier(hidden_layer_sizes=(wide_layer, n_features * 2, wide_layer),
                        activation='relu', random_state=RANDOM_STATE)

# Set Hyperparameters
if setparams:
    clf_rf.set_params(**rf_grid_search.best_params_)
    clf_xgb.set_params(**xgb_grid_search.best_params_)
    clf_lgbm.set_params(**lgbm_grid_search.best_params_)
    clf_mlp.set_params(**mlp_grid_search.best_params_)

# Rebuild the list from the estimators created above. Without this the loop would
# train whatever objects an earlier cell left in `classifiers`, and the per-model
# confusion matrices would never be updated.
classifiers = [clf_rf, clf_xgb, clf_lgbm, clf_mlp]
clf_list = [clf.__class__.__name__ for clf in classifiers]

# Metric bookkeeping: each *_dict maps classifier name -> metric value summed over folds.
my_metrics = ['Accuracy_dict', 'BalancedAccuracy_dict', 'Recall_dict', 'Precision_dict', 'Fscore_dict']
metric_functions = {
    'Accuracy_dict': metrics.accuracy_score,
    'BalancedAccuracy_dict': metrics.balanced_accuracy_score,
    'Recall_dict': metrics.recall_score,
    'Precision_dict': metrics.precision_score,
    'Fscore_dict': metrics.f1_score,
}
Accuracy_dict = {}
BalancedAccuracy_dict = {}
Recall_dict = {}
Precision_dict = {}
Fscore_dict = {}
metric_totals = {
    'Accuracy_dict': Accuracy_dict,
    'BalancedAccuracy_dict': BalancedAccuracy_dict,
    'Recall_dict': Recall_dict,
    'Precision_dict': Precision_dict,
    'Fscore_dict': Fscore_dict,
}

# Confusion matrices accumulated over all folds (rows: true class, columns: predicted).
conf_rf = np.zeros([2, 2])
conf_xgb = np.zeros([2, 2])
conf_lgbm = np.zeros([2, 2])
conf_mlp = np.zeros([2, 2])
conf_eclf = np.zeros([2, 2])
conf_matrices = [conf_rf, conf_xgb, conf_lgbm, conf_mlp]  # same order as `classifiers`

# Not used in this cell; retained in case later cells reference them.
# pred_dict was presumably meant to have one column per classifier - TODO confirm.
pred_dict = pd.DataFrame(columns=clf_list)
stacked_probabilities = pd.DataFrame()

scaler = StandardScaler()
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=False)

fold_predictions = []    # one DataFrame of hard predictions per fold
fold_probabilities = []  # one DataFrame of positive-class probabilities per fold
fold_targets = []        # true labels of each test fold, in the same row order
final_ensembled_metrics = pd.Series(np.zeros(len(my_metrics)))

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scaling: statistics come from the training fold only.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Sampling: oversample the minority class of the training fold only.
    sampler = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    predictions_by_model = {}
    probabilities_by_model = {}
    for clf, conf in zip(classifiers, conf_matrices):
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        test_predictions = pd.Series(clf.predict(X_test))
        test_probabilities = pd.Series(clf.predict_proba(X_test)[:, 1])

        # labels=[0, 1] keeps the matrix 2x2 even if a fold happens to miss a class.
        conf += metrics.confusion_matrix(y_test, test_predictions, labels=[0, 1])

        for key in my_metrics:
            score = metric_functions[key](y_test, test_predictions)
            metric_totals[key][name] = metric_totals[key].get(name, 0.0) + score

        predictions_by_model[name] = test_predictions
        probabilities_by_model[name] = test_probabilities

    a = pd.DataFrame(predictions_by_model)
    b = pd.DataFrame(probabilities_by_model)

    # Soft voting: average the positive-class probabilities and cut at 0.5.
    ensembled_preds = (b.mean(axis=1) >= 0.5).astype(int)
    conf_eclf += metrics.confusion_matrix(y_test, ensembled_preds, labels=[0, 1])
    ensembled_metrics = [metric_functions[key](y_test, ensembled_preds) for key in my_metrics]
    final_ensembled_metrics += pd.Series(ensembled_metrics)

    fold_predictions.append(a)
    fold_probabilities.append(b)
    fold_targets.append(y_test.values)

# Concatenate once instead of growing the frames inside the loop.
stacked_results = pd.concat(fold_predictions, axis=0)
final_probs = pd.concat(fold_probabilities, axis=0)
aggregated_y_test = np.concatenate(fold_targets)

final_probs['GT'] = aggregated_y_test
clf_list.append('GT')  # clf_list now mirrors final_probs.columns, as in the original cell
final_probs = final_probs.round(2)

# One row per metric and one column per model, then transpose and average over folds.
metric_results = pd.DataFrame(
    [metric_totals[key] for key in my_metrics],
    index=["Accuracy", "Bal_Accuracy", "Recall", "Precision", "Fscore"],
)
metric_results['EnsembledClassifiers'] = list(final_ensembled_metrics)
metric_results = metric_results.T / N_SPLITS

Wall time: 3min 20s


In [43]:
# Show the fold-averaged metrics for each classifier and for the probability-averaged ensemble.
metric_results

Unnamed: 0,Accuracy,Bal_Accuracy,Recall,Precision,Fscore
RandomForestClassifier,0.947719,0.526202,0.075486,0.100455,0.085632
XGBClassifier,0.886942,0.583175,0.258366,0.099792,0.140122
LGBMClassifier,0.954197,0.55926,0.136965,0.200442,0.161414
MLPClassifier,0.790421,0.564514,0.322957,0.071607,0.107795
EnsembledClassifiers,0.94361,0.5662,0.162646,0.162733,0.15968
