In [1]:
import os
import pandas as pd
import numpy as np
import yaml
import pickle

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold


from utils.training_utils import find_specific_variables
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(os.path.join('..', 'data', 'train_test', 'train_encoded.csv'))

print(df.shape)
df.head()

(32940, 19)


Unnamed: 0,contact,default,education,job,month,poutcome,quarter,age,campaign,cons.conf.idx,cons.price.idx,contacts_tendency,emp.var.rate,euribor3m,nr.employed,pdays,previous,was_contacted_before,y
0,0.0,0.0,6.0,0.0,9.0,1.0,2.0,31.0,3.0,-29.8,92.379,0.0,-3.4,0.803,5017.5,999.0,0.0,0.0,0
1,1.0,0.0,3.0,3.0,6.0,1.0,1.0,39.0,2.0,-36.4,93.994,0.0,1.1,4.857,5191.0,999.0,0.0,0.0,0
2,0.0,0.0,5.0,2.0,3.0,1.0,2.0,34.0,4.0,-42.7,93.918,0.0,1.4,4.958,5228.1,999.0,0.0,0.0,0
3,1.0,1.0,2.0,9.0,6.0,1.0,1.0,36.0,9.0,-36.4,93.994,0.0,1.1,4.856,5191.0,999.0,0.0,0.0,0
4,0.0,1.0,7.0,8.0,1.0,1.0,2.0,25.0,1.0,-31.4,92.201,0.0,-2.9,0.825,5076.2,999.0,0.0,0.0,0


In [3]:
features = yaml.safe_load(open(os.path.join('..', 'src', 'config', 'feature_config.yaml'), 'r'))
feature_target = find_specific_variables(features, 'target', specific_value=True)

In [4]:
seletor = pickle.load(
    open(os.path.join('..', 'models', 'encoders', 'seletor_2.pkl'), 'rb')
)

seletor.features

['age',
 'campaign',
 'cons.conf.idx',
 'cons.price.idx',
 'contact',
 'contacts_tendency',
 'default',
 'education',
 'emp.var.rate',
 'euribor3m',
 'job',
 'month',
 'nr.employed',
 'pdays',
 'poutcome',
 'previous',
 'quarter',
 'was_contacted_before']

In [5]:
scale_pos_weight = df[df[feature_target]==0].shape[0] / df[df[feature_target]==1].shape[0]

models = {
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
    'GBT': GradientBoostingClassifier(),
    'ADA': AdaBoostClassifier(),
    'XGB': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LGBM': LGBMClassifier(),

    'DT_bal': DecisionTreeClassifier(class_weight='balanced'),
    'RF_bal': RandomForestClassifier(class_weight='balanced'),
    'GBT_bal': GradientBoostingClassifier(),
    'ADA_bal': AdaBoostClassifier(),
    'XGB_bal': XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss'),
    'LGBM_bal': LGBMClassifier(class_weight='balanced'),
}

In [6]:
sampling_strategies = {
    'Original': None,
    'Undersampling': RandomUnderSampler(random_state=96),
    'Oversampling': RandomOverSampler(random_state=96),
    'SMOTE': SMOTE(random_state=96)
}

results_sampling = {}

for sampling_name, sampler in sampling_strategies.items():
    print(f'\nSampling Strategy: {sampling_name}')
    results_sampling[sampling_name] = {}
    
    for model_name, model in models.items():
        skf = StratifiedKFold(n_splits=5, random_state=96, shuffle=True)
        
        if sampler is not None:
            pipeline = Pipeline([
                ('sampler', sampler),
                ('classifier', model)
            ])
        else:
            pipeline = model

        scores = cross_val_score(
            pipeline,
            df[seletor.features],
            df[feature_target],
            cv=skf,
            scoring='roc_auc'
        )

        results_sampling[sampling_name][model_name] = scores

        print(f'{model_name}: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')



Sampling Strategy: Original
DT: 0.6247 +/- 0.0088
RF: 0.7638 +/- 0.0122
GBT: 0.7991 +/- 0.0101
ADA: 0.7938 +/- 0.0066
XGB: 0.7888 +/- 0.0063
LGBM: 0.8009 +/- 0.0080
DT_bal: 0.6224 +/- 0.0059
RF_bal: 0.7563 +/- 0.0114
GBT_bal: 0.7991 +/- 0.0101
ADA_bal: 0.7938 +/- 0.0066
XGB_bal: 0.7888 +/- 0.0063
LGBM_bal: 0.7990 +/- 0.0059

Sampling Strategy: Undersampling
DT: 0.6478 +/- 0.0095
RF: 0.7715 +/- 0.0110
GBT: 0.7998 +/- 0.0102
ADA: 0.7900 +/- 0.0080
XGB: 0.7804 +/- 0.0101
LGBM: 0.7949 +/- 0.0095
DT_bal: 0.6496 +/- 0.0070
RF_bal: 0.7720 +/- 0.0113
GBT_bal: 0.7999 +/- 0.0100
ADA_bal: 0.7900 +/- 0.0080
XGB_bal: 0.7804 +/- 0.0101
LGBM_bal: 0.7949 +/- 0.0095

Sampling Strategy: Oversampling
DT: 0.6230 +/- 0.0077
RF: 0.7549 +/- 0.0109
GBT: 0.8038 +/- 0.0089
ADA: 0.7958 +/- 0.0060
XGB: 0.7707 +/- 0.0068
LGBM: 0.7978 +/- 0.0069
DT_bal: 0.6214 +/- 0.0083
RF_bal: 0.7550 +/- 0.0089
GBT_bal: 0.8039 +/- 0.0091
ADA_bal: 0.7958 +/- 0.0060
XGB_bal: 0.7707 +/- 0.0068
LGBM_bal: 0.7978 +/- 0.0069

Sampling 