In [17]:
import os
import pandas as pd
import numpy as np
import yaml
import logging
import click
import pickle
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split

from utils.training_utils import find_specific_variables

import warnings
warnings.filterwarnings('ignore')

In [23]:
# Read the training frame from the parquet file under ../data/train_test,
# then show its dimensions and first rows as a quick sanity check.
train_path = os.path.join('..', 'data', 'train_test', 'train_encoded_clf.parquet')
df = pd.read_parquet(train_path)

print(df.shape)
df.head()

(212047, 19)


Unnamed: 0,attributes_color_value,attributes_main_color_value,buying_mode,condition,currency_id,listing_type_id,sale_price_payment_method_type,shipping_logistic_type,site_id,accepts_mercadopago,available_quantity,catalog_listing,installments_price,installments_quantity,price_tratado,sale_price_conditions_eligible,shipping_free_shipping,use_thumbnail_id,promotion_flag
0,1.0,1.0,0.0,1.0,14.0,5.0,1.0,6.0,9.0,1.0,50.0,0.0,-999.0,-999.0,1350.0,1.0,0.0,1.0,0
1,1.0,1.0,1.0,3.0,13.0,3.0,1.0,0.0,10.0,0.0,1.0,0.0,-999.0,-999.0,1200000.0,1.0,0.0,1.0,0
2,1.0,1.0,0.0,1.0,11.0,4.0,1.0,4.0,13.0,1.0,250.0,0.0,20.0,12.0,240.0,1.0,1.0,0.0,1
3,1.0,1.0,0.0,1.0,14.0,5.0,1.0,1.0,9.0,1.0,500.0,0.0,-999.0,-999.0,519.0,1.0,0.0,0.0,0
4,1.0,1.0,0.0,1.0,13.0,0.0,1.0,6.0,10.0,0.0,1.0,0.0,-999.0,-999.0,22.99,1.0,1.0,1.0,0


In [24]:
# Parse the feature configuration. The context manager guarantees the file
# handle is closed once parsing finishes, instead of leaving it open for the
# lifetime of the kernel.
config_path = os.path.join('..', 'src', 'config', 'feature_config.yaml')
with open(config_path, 'r') as config_file:
    features = yaml.safe_load(config_file)

# Look up the 'target_clf' entry in the configuration.
# NOTE(review): the rest of the notebook uses this value to index `df`, so it
# is presumably a single column label - confirm against find_specific_variables.
feature_target = find_specific_variables(features, 'target_clf', specific_value=True)

In [25]:
# Restore the pickled selector object; its `features` attribute holds the
# column names used as model inputs.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load artifacts produced by a trusted pipeline.
selector_path = os.path.join('..', 'models', 'encoders', 'seletor_2.pkl')
with open(selector_path, 'rb') as selector_file:  # closes the handle after loading
    seletor = pickle.load(selector_file)

seletor.features

['accepts_mercadopago',
 'attributes_color_value',
 'attributes_main_color_value',
 'available_quantity',
 'buying_mode',
 'catalog_listing',
 'condition',
 'currency_id',
 'installments_price',
 'installments_quantity',
 'listing_type_id',
 'price_tratado',
 'sale_price_conditions_eligible',
 'sale_price_payment_method_type',
 'shipping_free_shipping',
 'shipping_logistic_type',
 'site_id',
 'use_thumbnail_id']

In [44]:
# One seed for the splitter and every estimator so that re-running the cell
# reproduces the same scores. 98 is the value the splitter already used.
SEED = 98

# Model inputs and target, selected once and reused for every estimator.
# Assumes `feature_target` is a single column label - TODO confirm.
X = df[seletor.features]
target = df[feature_target]

# Ratio of negative (0) to positive (1) rows, passed to XGBoost to offset the
# class imbalance. Counting on the boolean mask avoids copying the filtered frame.
n_negative = int((target == 0).sum())
n_positive = int((target == 1).sum())
scale_pos_weight = n_negative / n_positive

# Candidate classifiers, each configured to compensate for class imbalance.
models = {
    'RF': RandomForestClassifier(class_weight='balanced', random_state=SEED),
    'XGBoost': XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=SEED),
    'LGBM': LGBMClassifier(class_weight='balanced', random_state=SEED),
}

# The splitter is seeded, so a single instance yields identical folds for every
# model; there is no need to rebuild it inside the loop.
skf = StratifiedKFold(n_splits=3, random_state=SEED, shuffle=True)

# Per-model array of fold-level ROC AUC scores.
results = {}

for name, model in models.items():
    scores = cross_val_score(model, X, target, cv=skf, scoring='roc_auc')
    results[name] = scores
    print(f'{name}: {round(np.mean(scores), 4)} +/- {round(np.std(scores), 4)}')

RF: 0.8436 +/- 0.0005
XGBoost: 0.8756 +/- 0.0013
LGBM: 0.87 +/- 0.0011
