In [1]:
import os
import pandas as pd
import numpy as np
import yaml
import pickle

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold


from utils.training_utils import find_specific_variables
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(os.path.join('..', 'data', 'train_test', 'train_encoded.csv'))

print(df.shape)
df.head()

(32940, 19)


Unnamed: 0,contact,default,employment_status,job,month,poutcome,quarter,age,campaign,cons.conf.idx,cons.price.idx,contacts_tendency,emp.var.rate,euribor3m,nr.employed,pdays,previous,was_contacted_before,y
0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,31.0,3.0,-29.8,92.379,0.0,-3.4,0.803,5017.5,999.0,0.0,0.0,0
1,1.0,0.0,0.0,3.0,6.0,1.0,1.0,39.0,2.0,-36.4,93.994,0.0,1.1,4.857,5191.0,999.0,0.0,0.0,0
2,0.0,0.0,0.0,2.0,3.0,1.0,2.0,34.0,4.0,-42.7,93.918,0.0,1.4,4.958,5228.1,999.0,0.0,0.0,0
3,1.0,1.0,0.0,9.0,6.0,1.0,1.0,36.0,9.0,-36.4,93.994,0.0,1.1,4.856,5191.0,999.0,0.0,0.0,0
4,0.0,1.0,1.0,8.0,1.0,1.0,2.0,25.0,1.0,-31.4,92.201,0.0,-2.9,0.825,5076.2,999.0,0.0,0.0,0


In [3]:
features = yaml.safe_load(open(os.path.join('..', 'src', 'config', 'feature_config.yaml'), 'r'))
feature_target = find_specific_variables(features, 'target', specific_value=True)

In [4]:
seletor = pickle.load(
    open(os.path.join('..', 'models', 'encoders', 'seletor_2.pkl'), 'rb')
)

seletor.features

['age',
 'campaign',
 'cons.conf.idx',
 'cons.price.idx',
 'contact',
 'contacts_tendency',
 'default',
 'emp.var.rate',
 'employment_status',
 'euribor3m',
 'job',
 'month',
 'nr.employed',
 'pdays',
 'poutcome',
 'previous',
 'quarter',
 'was_contacted_before']

In [5]:
scale_pos_weight = df[df[feature_target]==0].shape[0] / df[df[feature_target]==1].shape[0]

models = {
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
    'GBT': GradientBoostingClassifier(),
    'ADA': AdaBoostClassifier(),
    'XGB': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LGBM': LGBMClassifier(),

    'DT_bal': DecisionTreeClassifier(class_weight='balanced'),
    'RF_bal': RandomForestClassifier(class_weight='balanced'),
    'GBT_bal': GradientBoostingClassifier(),
    'ADA_bal': AdaBoostClassifier(),
    'XGB_bal': XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss'),
    'LGBM_bal': LGBMClassifier(class_weight='balanced'),
}

In [6]:
sampling_strategies = {
    'Original': None,
    'Undersampling': RandomUnderSampler(random_state=96),
    'Oversampling': RandomOverSampler(random_state=96),
    'SMOTE': SMOTE(random_state=96)
}

results_sampling = {}

for sampling_name, sampler in sampling_strategies.items():
    print(f'\nSampling Strategy: {sampling_name}')
    results_sampling[sampling_name] = {}
    
    for model_name, model in models.items():
        skf = StratifiedKFold(n_splits=5, random_state=96, shuffle=True)
        
        if sampler is not None:
            pipeline = Pipeline([
                ('sampler', sampler),
                ('classifier', model)
            ])
        else:
            pipeline = model

        scores = cross_val_score(
            pipeline,
            df[seletor.features],
            df[feature_target],
            cv=skf,
            scoring='roc_auc'
        )

        results_sampling[sampling_name][model_name] = scores

        print(f'{model_name}: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')



Sampling Strategy: Original
DT: 0.6281 +/- 0.0108
RF: 0.7591 +/- 0.0090
GBT: 0.7990 +/- 0.0100
ADA: 0.7947 +/- 0.0059
XGB: 0.7910 +/- 0.0051
LGBM: 0.8031 +/- 0.0066
DT_bal: 0.6237 +/- 0.0106
RF_bal: 0.7501 +/- 0.0070
GBT_bal: 0.7991 +/- 0.0100
ADA_bal: 0.7947 +/- 0.0059
XGB_bal: 0.7910 +/- 0.0051
LGBM_bal: 0.8003 +/- 0.0064

Sampling Strategy: Undersampling
DT: 0.6434 +/- 0.0068
RF: 0.7696 +/- 0.0095
GBT: 0.8003 +/- 0.0099
ADA: 0.7911 +/- 0.0078
XGB: 0.7848 +/- 0.0119
LGBM: 0.7974 +/- 0.0090
DT_bal: 0.6457 +/- 0.0056
RF_bal: 0.7691 +/- 0.0093
GBT_bal: 0.8002 +/- 0.0098
ADA_bal: 0.7911 +/- 0.0078
XGB_bal: 0.7848 +/- 0.0119
LGBM_bal: 0.7974 +/- 0.0090

Sampling Strategy: Oversampling
DT: 0.6180 +/- 0.0080
RF: 0.7480 +/- 0.0071
GBT: 0.8034 +/- 0.0084
ADA: 0.7962 +/- 0.0063
XGB: 0.7711 +/- 0.0062
LGBM: 0.8013 +/- 0.0053
DT_bal: 0.6166 +/- 0.0085
RF_bal: 0.7479 +/- 0.0097
GBT_bal: 0.8036 +/- 0.0080
ADA_bal: 0.7962 +/- 0.0063
XGB_bal: 0.7711 +/- 0.0062
LGBM_bal: 0.8013 +/- 0.0053

Sampling 