In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [2]:
# Original dataset: the 'label' column is the target, every other column is a feature.
df_final = pd.read_csv('/item+cash_1931_label.csv')
df_f_y = df_final['label']
df_f_x = df_final.drop(columns='label')

# Stratified 70/30 hold-out split with a fixed seed.
x_f_train, x_f_test, y_f_train, y_f_test = train_test_split(
    df_f_x,
    df_f_y,
    test_size=0.3,
    stratify=df_f_y,
    random_state=1,
)

In [3]:
# Resampled dataset: the 'y' column is the target, every other column is a feature.
# NOTE(review): the file name suggests class resampling happened before this split;
# if so, duplicated/synthetic rows may land in both train and test -- confirm.
df_res = pd.read_csv('/item+cash_1931_resampled.csv')
df_res_y = df_res['y']
df_res_x = df_res.drop(columns='y')

# Stratified 70/30 hold-out split with a fixed seed.
x_df_train, x_df_test, y_df_train, y_df_test = train_test_split(
    df_res_x,
    df_res_y,
    test_size=0.3,
    stratify=df_res_y,
    random_state=1,
)

In [4]:
def ET_final_pred(x_train, y_train, x_test, params, threshold):
    """Fit an ExtraTreesClassifier and return thresholded binary predictions.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to ``fit``.
    x_test : features to score with ``predict_proba``.
    params : dict of ExtraTreesClassifier keyword arguments. The dict is
        copied, never mutated. Whole-number floats (> 1) for the integer
        hyperparameters are cast to ``int`` so a hyperopt result such as
        ``{'n_estimators': 1100.0}`` can be passed straight in. If
        ``random_state`` / ``n_jobs`` are absent they default to ``100`` / ``-1``,
        the same values the CV objective ``ET_f`` uses, so repeated runs give
        the same model.
    threshold : probability cut-off; a row is predicted 1 when the
        probability in column 1 of ``predict_proba`` is >= threshold.

    Returns
    -------
    Integer array of 0/1 predictions, one per row of ``x_test``.
    """
    # Hyperparameters that ET_f casts with int() before building the estimator.
    int_params = ('max_depth', 'max_leaf_nodes', 'n_estimators',
                  'min_samples_leaf', 'min_samples_split')

    # Explicit entries in `params` override the reproducibility defaults.
    model_params = {'random_state': 100, 'n_jobs': -1, **params}
    for name in int_params:
        value = model_params.get(name)
        # Values <= 1 are left alone: for min_samples_* a float in (0, 1] is a
        # fraction of the samples, not a count.
        if isinstance(value, float) and value.is_integer() and value > 1:
            model_params[name] = int(value)

    ET_final = ExtraTreesClassifier(**model_params)
    ET_final.fit(x_train, y_train)
    y_pred_test_prob = ET_final.predict_proba(x_test)[:, 1]

    return (y_pred_test_prob >= threshold).astype(int)

def evaluation(y_test, y_pred, model_name='ET_1 using TPE'):
    """Print F1, accuracy, recall, precision and the confusion matrix.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, same length as ``y_test``.
    model_name : text inserted into every printed line to identify the model.
        The default reproduces the labels this notebook has always used.

    Returns
    -------
    None. The metrics are only printed, each rounded to 4 decimals.
    """
    f1 = f1_score(y_test, y_pred)
    # All five lines share one label so the report reads consistently.
    print(f'F1-score of {model_name}: ', round(f1, 4))
    print(f'Accuracy of {model_name}: ', round(accuracy_score(y_test, y_pred), 4))
    print(f'Recall of {model_name}: ', round(recall_score(y_test, y_pred), 4))
    print(f'Precision of {model_name}: ', round(precision_score(y_test, y_pred), 4))
    print(f'Confusion matrix of {model_name}: \n', confusion_matrix(y_test, y_pred))

In [7]:
# Search space for the TPE optimiser.
# hp.quniform(label, low, high, q) samples round(uniform(low, high) / q) * q, so
# a `low` smaller than q/2 above zero can round down to 0. max_depth=0 and
# max_leaf_nodes=0 are rejected by ExtraTreesClassifier, which would abort the
# search, so both lower bounds start on the first grid point (q) instead.
param_space = {
    'max_depth': hp.quniform('max_depth', 50, 1000, 50),
    'max_leaf_nodes': hp.quniform('max_leaf_nodes', 5, 100, 5),
    'n_estimators': hp.quniform('n_estimators', 1000, 2000, 50),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
}


# k-fold cv: 5 stratified, shuffled folds with a fixed seed so every trial
# is scored on the same partition.
k_fold = 5
kf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=100)

def ET_f(params):
    """Hyperopt objective: negative mean cross-validated F1 of an ExtraTrees model.

    ``params`` is one sample from ``param_space``; its values are truncated to
    ``int`` before being handed to the estimator. For each fold of ``kf`` over
    ``x_df_train`` / ``y_df_train`` a fresh model (seed 100, all cores) is fit
    on the training part and scored with F1 on the validation part.

    Returns a dict with ``loss`` (minus the mean fold F1, because fmin
    minimises) and ``status`` set to ``STATUS_OK``.
    """
    int_keys = ('max_depth', 'max_leaf_nodes', 'n_estimators',
                'min_samples_leaf', 'min_samples_split')

    fold_scores = []
    for fit_rows, holdout_rows in kf.split(x_df_train, y_df_train):
        # Slice this fold's train / validation partitions.
        x_fit, y_fit = x_df_train.iloc[fit_rows], y_df_train.iloc[fit_rows]
        x_holdout, y_holdout = x_df_train.iloc[holdout_rows], y_df_train.iloc[holdout_rows]

        model_kwargs = {key: int(params[key]) for key in int_keys}
        model = ExtraTreesClassifier(random_state=100, n_jobs=-1, **model_kwargs)

        # Training
        model.fit(x_fit, y_fit)
        fold_scores.append(f1_score(y_holdout, model.predict(x_holdout)))

    return {'loss': -np.mean(fold_scores), 'status': STATUS_OK}


# History object handed to fmin via `trials=` when the search is launched.
trials = Trials()

In [8]:
# Minimise ET_f (negative mean CV F1) over param_space with TPE for 10 evaluations.
param_1_tpe = fmin(
    fn=ET_f,
    space=param_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

100%|████████████████████████████████████████████| 10/10 [2:10:47<00:00, 784.77s/trial, best loss: -0.7586652630703319]


In [10]:
# Refit on the resampled training split and report metrics on its hold-out split.
# NOTE(review): these values are typed in by hand rather than read from
# param_1_tpe, and max_leaf_nodes (part of the search space) is omitted -- confirm.
param_1 = dict(max_depth=700, min_samples_leaf=2, min_samples_split=4, n_estimators=1100)
y_pred_val_1 = ET_final_pred(x_df_train, y_df_train, x_df_test, params=param_1, threshold=0.5)
evaluation(y_df_test, y_pred_val_1)

F1-score of ET_1 using TPE:  0.7573
Accuracy of ET_1 using TPE:  0.7461
Recall of ET_1 using TPE:  0.793
Precision of ET_1 using TPE:  0.7246
Confusion matrix of ET_1 using TPE: 
 [[26169 11255]
 [ 7729 29616]]


In [17]:
# Same hand-set hyperparameters, refit on the original training split and
# scored on the original hold-out split.
param_1 = dict(max_depth=700, min_samples_leaf=2, min_samples_split=4, n_estimators=1100)
y_pred_f_1 = ET_final_pred(x_f_train, y_f_train, x_f_test, params=param_1, threshold=0.5)
evaluation(y_f_test, y_pred_f_1)

F1-score of ET_1 using TPE:  0.5549
Accuracy of ET_1 using TPE:  0.9029
Recall of ET_1 using TPE:  0.437
Precision of ET_1 using TPE:  0.76
Confusion matrix of ET_1 using TPE: 
 [[52368  1189]
 [ 4850  3765]]
