# Models

In [1]:
import os
root_dir = '../../' if os.getcwd().split('/')[-1] != 'customer' else './'
os.chdir(root_dir)

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet, OrthogonalMatchingPursuit
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from tqdm import tqdm
from vecstack import StackingTransformer
from itertools import combinations
import joblib

import customer_data

def NMAE(true, pred):
    mae = mean_absolute_error(true,pred)
    score = mae / np.mean(np.abs(true))
    return score

In [2]:
train_data, test_data, train_label, validation_set = customer_data.load_data()
x_train, x_test, y_train, y_test = validation_set
print(train_data.shape, test_data.shape, train_label.shape)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1102, 86) (1108, 86) (1102, 1)
(771, 86) (331, 86) (771, 1) (331, 1)


In [3]:
params = {
    'LGBMRegressor':{
        'n_estimators':100,'max_depth':8,'random_state':0
    },
    'XGBRegressor':{
        'n_estimators':100,'max_depth':4,'random_state':0,'verbosity':0
    },
    'CatBoostRegressor':{
        'n_estimators':1000,'max_depth':8,'learning_rate':0.01,'subsample':0.5,
        'random_state':0,'verbose':0
    },
    'GradientBoostingRegressor':{
        'n_estimators':1000,'max_depth':6,'learning_rate':0.01,'subsample':0.5
    },
    'ExtraTreesRegressor':{
        'n_estimators':300,'max_depth':16,'random_state':0,'criterion':'absolute_error'
    },
    'RandomForestRegressor':{
        'n_estimators':300,'max_depth':14,'random_state':0,'criterion':'absolute_error'
    }
}

## ML Models

In [4]:
models = [
    ('LGBMRegressor',LGBMRegressor()),
    ('XGBRegressor',XGBRegressor()),
    ('CatBoostRegressor',CatBoostRegressor()),
    ('GradientBoostingRegressor',GradientBoostingRegressor()),
    ('ExtraTreesRegressor',ExtraTreesRegressor()),
    ('RandomForestRegressor',RandomForestRegressor()),
    ('OrthogonalMatchingPursuit',OrthogonalMatchingPursuit()),
    ('ElasticNet',ElasticNet()),
    ('BayesianRidge',BayesianRidge()),
    ('LinearRegression',LinearRegression())]

In [5]:
models = [(label,model.set_params(**params[label])) if label in params else (label,model) for label,model in models]

In [6]:
for name, model in models:
    model.fit(x_train, y_train)
    print(f'{name}: ', NMAE(y_test,model.predict(x_test)))

LGBMRegressor:  0.20878214283479787
XGBRegressor:  0.20934568767583003
CatBoostRegressor:  0.206561301309736
GradientBoostingRegressor:  0.19070416108432542
ExtraTreesRegressor:  0.1912354390017572
RandomForestRegressor:  0.20928070313991062
OrthogonalMatchingPursuit:  0.30625332980423375
ElasticNet:  0.3055855696320235
BayesianRidge:  0.2742459919886361
LinearRegression:  0.28472921479199986


## Stacking

In [10]:
estimator = [
    ('LGBMRegressor',LGBMRegressor()),
    ('XGBRegressor',XGBRegressor()),
    ('CatBoostRegressor',CatBoostRegressor()),
    ('GradientBoostingRegressor',GradientBoostingRegressor()),
    ('ExtraTreesRegressor',ExtraTreesRegressor()),
    ('RandomForestRegressor',RandomForestRegressor())]

In [11]:
estimator = [(label,model.set_params(**params[label])) for label,model in estimator if label in params]
estimators = list()
for i in range(2,len(estimator)+1):
    estimators += list(combinations(estimator, i))

In [12]:
stacks = [StackingTransformer(estimator, 
                            regression = True, 
                            metric = NMAE, 
                            n_folds = 10, stratified = True, shuffle = True, 
                            random_state = 0, verbose = 0)
                            for estimator in estimators]

In [13]:
S_models = [
    ('LGBMRegressor',LGBMRegressor()),
    ('XGBRegressor',XGBRegressor()),
    ('CatBoostRegressor',CatBoostRegressor()),
    ('GradientBoostingRegressor',GradientBoostingRegressor()),
    ('ExtraTreesRegressor',ExtraTreesRegressor()),
    ('RandomForestRegressor',RandomForestRegressor())]

In [14]:
S_models = [(label,model.set_params(**params[label])) for label,model in S_models if label in params]

In [15]:
name_scores = list()

for stack in tqdm(stacks, desc='Stack'):
    stack = stack.fit(x_train, y_train)

    S_train = stack.transform(x_train)
    S_test = stack.transform(x_test)

    for name, model in S_models:
        model.fit(S_train, y_train)
        y_pred = model.predict(S_test)
        name_scores.append((NMAE(y_test, y_pred), (stack, model)))

Stack:   2%|▏         | 1/57 [00:09<08:54,  9.55s/it]

In [13]:
sorted(name_scores, key=lambda x: x[0])

[(0.18472133243212052,
  (StackingTransformer(estimators=(('XGBRegressor',
                                    XGBRegressor(base_score=None, booster=None,
                                                 colsample_bylevel=None,
                                                 colsample_bynode=None,
                                                 colsample_bytree=None, gamma=None,
                                                 gpu_id=None,
                                                 importance_type='gain',
                                                 interaction_constraints=None,
                                                 learning_rate=None,
                                                 max_delta_step=None, max_depth=4,
                                                 min_child_weight=None,
                                                 missing=nan,
                                                 monotone_constraints=None,
                                        

In [None]:
train.shape,

(1096, 55)

In [None]:
model = model[0]
model.fit(x_train,y_train)

LGBMRegressor(max_depth=8, random_state=0)

In [None]:
sample = pd.read_csv('data/sample_submission.csv')
sample.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [None]:
pred = model.predict(test)

In [None]:
sample['target'] = pred
sample.head()

Unnamed: 0,id,target
0,0,399.274426
1,1,786.866137
2,2,747.371339
3,3,1090.668171
4,4,1356.801403


In [None]:
sample.to_csv('sample.csv',index=False)

In [None]:
train['target'].describe()

count    1096.000000
mean      621.876825
std       604.363476
min         8.000000
25%        71.000000
50%       418.500000
75%      1074.250000
max      2525.000000
Name: target, dtype: float64

In [None]:
import customer_data

In [None]:
train_data, test_data, train_label, validation_set = customer_data.load_data()

In [None]:
train_data.shape, test_data.shape, train_label.shape

((1096, 55), (1108, 55), (1096, 1))

In [None]:
model = models[0][1]
model.fit(train_data,train_label)
pred = model.predict(test_data)

In [None]:
sample = pd.read_csv('original_data/sample_submission.csv')
sample['target'] = pred
sample.head()

Unnamed: 0,id,target
0,0,321.246795
1,1,673.107532
2,2,401.093269
3,3,777.059295
4,4,823.490545


In [None]:
sample.to_csv('sample_et.csv',index=False)