# Models

In [1]:
import os
root_dir = '../../' if os.getcwd().split('/')[-1] != 'customer' else './'
os.chdir(root_dir)

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from optuna import Trial, visualization

from tqdm import tqdm
from vecstack import StackingTransformer
from itertools import combinations
import joblib

import customer_data

def NMAE(true, pred):
    mae = mean_absolute_error(true,pred)
    score = mae / np.mean(np.abs(true))
    return score

In [2]:
train_data, test_data, train_label, validation_set = customer_data.load_data()
x_train, x_test, y_train, y_test = validation_set
print(train_data.shape, test_data.shape, train_label.shape)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1102, 47) (1108, 47) (1102, 1)
(771, 47) (331, 47) (771, 1) (331, 1)


## ML Models

In [3]:
models = [
    ('LGBMRegressor',LGBMRegressor(n_estimators=134,max_depth=16,random_state=0)),
    ('XGBRegressor',XGBRegressor(n_estimators=194,max_depth=7,random_state=0,verbosity=0)),
    ('CatBoostRegressor',CatBoostRegressor(n_estimators=1200,max_depth=8,random_state=0,verbose=0)),
    ('GradientBoostingRegressor',GradientBoostingRegressor(n_estimators=1301,max_depth=9,learning_rate=0.01,subsample=0.5,random_state=0)),
    ('ExtraTreesRegressor',ExtraTreesRegressor(n_estimators=344,max_depth=16,random_state=0,criterion='absolute_error')),
    ('RandomForestRegressor',RandomForestRegressor(n_estimators=177,max_depth=16,random_state=0,criterion='absolute_error'))]

In [4]:
for name, model in models:
    model.fit(x_train, y_train)
    print(f'{name}: ', NMAE(y_test,model.predict(x_test)))

LGBMRegressor:  0.1844852959826549
XGBRegressor:  0.186055533424502
CatBoostRegressor:  0.17963770749402333
GradientBoostingRegressor:  0.17613013821918308
ExtraTreesRegressor:  0.19205121591509647
RandomForestRegressor:  0.19746425855670233


## Soft Voting

In [5]:
pred_list = list()
for name, model in models:
    pred_list.append([model.predict(x_test),name])

pred_comb = list()
for i in range(2,len(pred_list)+1):
    pred_comb += list(combinations(pred_list, i))

In [6]:
nmae_list = list()

for pred_info in pred_comb:
    preds, names = list(), list()

    for pred, name in pred_info:
        preds.append(pred)
        names.append(name)
    nmae_list.append([NMAE(y_test,sum(preds)/len(preds)),names])

In [None]:
sorted(nmae_list)[0]

[0.17293862256220768,
 ['LGBMRegressor',
  'XGBRegressor',
  'CatBoostRegressor',
  'GradientBoostingRegressor']]

## Hard Voting

In [14]:
estimator = [
    ('LGBMRegressor',LGBMRegressor(n_estimators=134,max_depth=16,random_state=0)),
    ('XGBRegressor',XGBRegressor(n_estimators=194,max_depth=7,random_state=0,verbosity=0)),
    ('CatBoostRegressor',CatBoostRegressor(n_estimators=1800,max_depth=8,random_state=0,verbose=0)),
    ('GradientBoostingRegressor',GradientBoostingRegressor(n_estimators=1301,max_depth=9,learning_rate=0.01,subsample=0.5,random_state=0)),
    ('ExtraTreesRegressor',ExtraTreesRegressor(n_estimators=1746,max_depth=15,random_state=0,criterion='absolute_error')),
    ('RandomForestRegressor',RandomForestRegressor(n_estimators=177,max_depth=16,random_state=0,criterion='absolute_error'))]

In [15]:
estimators = list()
for i in range(3,len(estimator)+1):
    estimators += list(combinations(estimator, i))

In [16]:
stacks = [StackingTransformer(estimator,
                            regression = True,
                            metric = NMAE,
                            n_folds = 10, stratified = True, shuffle = True,
                            random_state = 0, verbose = 0)
                            for estimator in estimators]

In [17]:
S_models = [
    ('LGBMRegressor',LGBMRegressor(n_estimators=100,max_depth=2,random_state=0)),
    ('XGBRegressor',XGBRegressor(n_estimators=100,max_depth=2,random_state=0,verbosity=0)),
    ('CatBoostRegressor',CatBoostRegressor(n_estimators=100,max_depth=2,random_state=0,verbose=0)),
    ('GradientBoostingRegressor',GradientBoostingRegressor(n_estimators=100,max_depth=2,learning_rate=0.01,subsample=0.5,random_state=0)),
    ('ExtraTreesRegressor',ExtraTreesRegressor(n_estimators=100,max_depth=2,random_state=0,criterion='absolute_error')),
    ('RandomForestRegressor',RandomForestRegressor(n_estimators=100,max_depth=2,random_state=0,criterion='absolute_error'))]

In [None]:
name_scores = list()

for i, stack in enumerate(stacks):
    S_train = stack.fit_transform(x_train, y_train)
    S_test = stack.transform(x_test)

    for name, model in S_models:
        model.fit(S_train, y_train)
        y_pred = model.predict(S_test)
        print(i, NMAE(y_test, y_pred), name)
        name_scores.append((NMAE(y_test, y_pred), (stack, model)))

In [None]:
sorted(name_scores)

## Submission (Soft Voting)

In [9]:
sample = pd.read_csv('original_data/sample_submission.csv')

In [10]:
models = [
    ('LGBMRegressor',LGBMRegressor(n_estimators=134,max_depth=16,random_state=0)),
    ('XGBRegressor',XGBRegressor(n_estimators=194,max_depth=7,random_state=0,verbosity=0)),
    ('CatBoostRegressor',CatBoostRegressor(n_estimators=1800,max_depth=8,random_state=0,verbose=0)),
    ('GradientBoostingRegressor',GradientBoostingRegressor(n_estimators=1301,max_depth=9,learning_rate=0.01,subsample=0.5,random_state=0))]

In [12]:
pred = None
for name, model in models:
    model.fit(train_data, train_label)
    if pred is None:
        pred = model.predict(test_data)
    else:
        pred += model.predict(test_data)
sample['target'] = pred/len(models)
sample.head()

Unnamed: 0,id,target
0,0,560.310711
1,1,834.140626
2,2,801.504095
3,3,1283.291736
4,4,1351.00951


In [13]:
sample.to_csv('submission/sample_soft.csv', index=False)

## Submission (Hard Voting)

In [17]:
estimator = [
    ('XGBRegressor',XGBRegressor()),
    ('GradientBoostingRegressor',GradientBoostingRegressor()),
    ('ExtraTreesRegressor',ExtraTreesRegressor())]

In [18]:
stack = StackingTransformer(estimator,
                            regression = True,
                            metric = NMAE,
                            n_folds = 10, stratified = True, shuffle = True,
                            random_state = 0, verbose = 0)

In [10]:
def objective_rf(trial: Trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators',100,1000,100),
        'max_depth': trial.suggest_int('max_depth',2,16,2),
        'max_features': trial.suggest_categorical('max_features',['auto','sqrt','log2'])
    }

    model = RandomForestRegressor(**params,random_state=0,criterion='absolute_error')
    bst = model.fit(S_train, y_train)
    y_pred = bst.predict(S_test)
    nmae = NMAE(y_test,y_pred)
    return nmae

In [None]:
study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(objective_rf,n_trials=100,show_progress_bar=True)

In [12]:
trial_rf = study_rf.best_trial
print('NMAE:', trial_rf.value)
print('Best Hyperparameters:', trial_rf.params)

NMAE: 0.18562215443542815
Best Hyperparameters: {'n_estimators': 500, 'max_depth': 4, 'max_features': 'auto'}


In [19]:
model = RandomForestRegressor(**{'n_estimators': 500, 'max_depth': 4, 'max_features': 'auto'})

In [21]:
S_train = stack.fit_transform(train_data, train_label)
S_test = stack.transform(test_data)

model.fit(S_train, train_label)
sample['target'] = model.predict(S_test)
sample.head()

Unnamed: 0,id,target
0,0,584.341598
1,1,919.201658
2,2,721.753527
3,3,1388.846843
4,4,1441.300221


In [22]:
sample.to_csv('submission/sample_stacking.csv',index=False)

In [None]:
joblib.dump(stack, 'models/stack.pkl', compress=3)
joblib.dump(model, 'models/model_stack.pkl', compress=3)