<a href="https://colab.research.google.com/github/mavillan/yolanda/blob/main/notebooks/tune_loss_lgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***
## notebook config

In [1]:
# Mount Google Drive under /content/drive so files kept there can be read from this runtime.
# force_remount=True asks for the mount to be redone even if one already exists.
from google.colab import drive
from google.colab import files
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Install/upgrade the Kaggle command line client; stdout and stderr are discarded.
!pip install --upgrade kaggle > /dev/null 2>&1
# Copy the Kaggle API credentials from the mounted Drive into ~/.kaggle/ and restrict them to owner read/write.
!mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Query the NVIDIA driver to check whether a GPU is attached to this runtime.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [4]:
# Show total/used/free memory of the runtime in human-readable units.
!free -h

              total        used        free      shared  buff/cache   available
Mem:            35G        1.1G         27G        1.1M        6.7G         33G
Swap:            0B          0B          0B


In [5]:
# Install/upgrade the encoders and forecasting packages imported later in the notebook; pip output is discarded.
!pip install --upgrade category_encoders > /dev/null 2>&1
!pip install --upgrade tsforest > /dev/null 2>&1
# Installing lightgbm from pip is left disabled.
#!pip install --upgrade lightgbm > /dev/null 2>&1

In [None]:
%%bash
# Build LightGBM from source with GPU support and install its Python package.
git clone --recursive https://github.com/microsoft/LightGBM ; cd LightGBM
# Out-of-source build: configure from inside build/ and point cmake at the repository root
# (running cmake with no source path relies on a deprecated implicit in-source configure).
mkdir -p build ; cd build
#export CMAKE_CXX_FLAGS='-O3 -mtune=native'
cmake -DUSE_GPU=1 ..
make -j$(nproc)
# --precompile installs the library compiled above instead of compiling it again.
cd ../python-package/
python setup.py install --precompile

# THIS IS FOR GPU INSTALLATION

In [None]:
%%bash
# Build LightGBM from source for CPU with extra compiler flags and install its Python package.
git clone --recursive https://github.com/microsoft/LightGBM ; cd LightGBM
mkdir -p build ; cd build
# CMake seeds CMAKE_CXX_FLAGS from the CXXFLAGS environment variable on the first configure;
# an environment variable named CMAKE_CXX_FLAGS is not read, so the flags would be silently dropped.
export CXXFLAGS='-O3 -mtune=native'
cmake ..
make -j$(nproc)
# --precompile installs the library compiled above instead of compiling it again.
cd ../python-package/
python setup.py install --precompile

# THIS IS FOR CPU (OPTIMIZED) INSTALLATION

***

In [6]:
from copy import deepcopy
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

import category_encoders as ce
import lightgbm as lgb
from tsforest.forecast import LightGBMForecaster

pd.set_option('display.max_columns', None)

  import pandas.util.testing as tm


In [7]:
def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes

    Columns of dtype int16/int32/int64 are cast to the narrowest signed
    integer type whose range contains the observed minimum and maximum
    (the type limits themselves included). Columns of dtype
    float16/float32/float64 are cast to float32 when the observed range fits
    in it and to float64 otherwise. Columns of any other dtype are left as
    they are. The dataframe is modified in place and also returned.

    Parameters
    ----------
    df: pandas.Dataframe
        Dataframe whose numeric columns are downcast.
    verbose: Boolean
        If True, prints the final memory usage and the relative reduction.

    Returns
    -------
    pandas.Dataframe
        The same object received in `df`.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # candidate integer types, tried from narrowest to widest
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # inclusive bounds: a value equal to the type limit still fits in the type
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # comparisons against NaN/inf bounds are False, so such columns end up as float64
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # guard the percentage against a dataframe that reports zero bytes
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [8]:
class RMSSE:
    """
    Custom evaluation metric: per-sku root mean squared error divided by the
    sku scale, averaged over the skus present in both dataframes.
    """

    def __init__(self, valid_dataframe, scales_dataframe):
        """
        Parameters
        ----------
        valid_dataframe: pandas.Dataframe
            Validation rows; the columns "sku" and "y" are read.
        scales_dataframe: pandas.Dataframe
            Per-sku scales; the columns "sku" and "scale" are read.
        """
        self.valid_dataframe = valid_dataframe
        self.scales_dataframe = scales_dataframe

    def _evaluate(self, predictions):
        """Returns the mean over skus of sqrt(mse)/scale for the given predictions."""
        # work on a copy so the stored validation dataframe is never modified
        frame = self.valid_dataframe.copy()
        frame["ypred"] = predictions
        frame["sq_error"] = (frame["y"] - frame["ypred"]) ** 2
        mse_by_sku = (
            frame.groupby("sku")["sq_error"]
            .mean()
            .reset_index(name="mse")
        )
        # inner join: only skus that have a scale contribute to the score
        merged = pd.merge(mse_by_sku, self.scales_dataframe, how="inner", on="sku")
        scaled_rmse = np.sqrt(merged["mse"]) / merged["scale"]
        return scaled_rmse.mean()

    def evaluate(self, ypred, dtrain):
        """
        Adapter passed as `feval` when fitting: returns the tuple
        (metric name, metric value, False). `dtrain` is not used.
        """
        return "rmsse", self._evaluate(ypred), False

***
## data preparation

In [9]:
# Download the mavillan/meli-2021 Kaggle dataset and unzip it into the working directory.
!kaggle datasets download -d mavillan/meli-2021 --force --unzip
# List the working directory to check which files are now available.
!ls -halt 

Downloading meli-2021.zip to /content
 96% 218M/227M [00:02<00:00, 125MB/s]
100% 227M/227M [00:02<00:00, 98.5MB/s]
total 316M
drwxr-xr-x  1 root root 4.0K Aug 10 15:50 .
-rw-r--r--  1 root root 169K Aug 10 15:50 unpredictable.csv
-rw-r--r--  1 root root  78M Aug 10 15:50 train.parquet
-rw-r--r--  1 root root 223M Aug 10 15:50 test.parquet
-rw-r--r--  1 root root  16M Aug 10 15:49 scales.csv
drwx------  5 root root 4.0K Aug 10 15:49 drive
drwxr-xr-x 21 root root 4.0K Aug 10 14:43 LightGBM
drwxr-xr-x  1 root root 4.0K Aug 10 14:40 ..
drwxr-xr-x  1 root root 4.0K Jul 16 13:20 sample_data
drwxr-xr-x  4 root root 4.0K Jul 16 13:19 .config


In [10]:
# Load the training data and rename the date and target columns to "ds" and "y".
dataset = (
    pd.read_parquet("./train.parquet")
    .rename(columns={"date":"ds", "sold_quantity":"y"})
)
dataset

Unnamed: 0,ds,sku,y,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available,item_domain_id,item_id,site_id,product_id,product_family_id
0,2021-02-01,0,0,172.789993,REA,classic,cross_docking,free_shipping,0.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
1,2021-02-02,0,0,172.789993,REA,classic,cross_docking,free_shipping,0.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
2,2021-02-03,0,0,179.990005,REA,classic,cross_docking,free_shipping,872.650024,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
3,2021-02-04,0,0,179.990005,REA,classic,cross_docking,free_shipping,1440.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
4,2021-02-05,0,0,179.990005,REA,classic,cross_docking,free_shipping,1440.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37660274,2021-03-27,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
37660275,2021-03-28,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
37660276,2021-03-29,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
37660277,2021-03-30,660915,0,99.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,


In [11]:
scales = pd.read_csv("scales.csv")
unpredictable = pd.read_csv("unpredictable.csv")

# Keep only the rows whose sku is not listed in the unpredictable file.
dataset = dataset.loc[~dataset["sku"].isin(unpredictable["sku"])].reset_index(drop=True)
dataset

Unnamed: 0,ds,sku,y,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available,item_domain_id,item_id,site_id,product_id,product_family_id
0,2021-02-01,0,0,172.789993,REA,classic,cross_docking,free_shipping,0.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
1,2021-02-02,0,0,172.789993,REA,classic,cross_docking,free_shipping,0.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
2,2021-02-03,0,0,179.990005,REA,classic,cross_docking,free_shipping,872.650024,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
3,2021-02-04,0,0,179.990005,REA,classic,cross_docking,free_shipping,1440.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
4,2021-02-05,0,0,179.990005,REA,classic,cross_docking,free_shipping,1440.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36594391,2021-03-27,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
36594392,2021-03-28,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
36594393,2021-03-29,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
36594394,2021-03-30,660915,0,99.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,


In [12]:
# Encoding used for each categorical column: either the "default" handling or a
# (target column, encoder class, third entry) spec. The same spec is shared by all
# the columns encoded with CatBoostEncoder on "y".
_catboost_on_target = ("y", ce.CatBoostEncoder, None)

categorical_features = {
    "sku": _catboost_on_target,
    "currency": "default",
    "listing_type": "default",
    "shipping_logistic_type": "default",
    "shipping_payment": "default",
    "item_domain_id": _catboost_on_target,
    "item_id": _catboost_on_target,
    "site_id": "default",
    "product_id": _catboost_on_target,
    "product_family_id": _catboost_on_target,
}

# NOTE(review): "sold_quantity" is renamed to "y" when the dataset is loaded, so this
# entry may not match any column anymore; confirm.
exclude_features = ["fold", "sold_quantity"]

In [13]:
# Number of distinct values of each categorical column.
dataset[list(categorical_features)].nunique()

sku                       635633
currency                       4
listing_type                   2
shipping_logistic_type         3
shipping_payment               2
item_domain_id              8372
item_id                   497537
site_id                        3
product_id                 15237
product_family_id          28817
dtype: int64

In [14]:
# Number of distinct skus in each site.
dataset.groupby("site_id")["sku"].nunique()

site_id
MLA     55514
MLB    323896
MLM    256223
Name: sku, dtype: int64

***
## model config

In [15]:
# model configuration
# Base LightGBM parameters shared by every experiment; the objective and its
# specific parameter are added on a copy of this dict inside each search loop.
model_params = dict(
    metric="None",
    num_iterations=500,
    max_bin=63,
    bin_construct_sample_cnt=20_000_000,
    num_leaves=2**8 - 1,
    min_data_in_leaf=2**8 - 1,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=0.1,
    boost_from_average=False,
    force_row_wise=True,
    #device_type="gpu",
    #num_gpu=1,
)

# calendar features requested from the forecaster
time_features = ["week_day", "week_day_cos", "week_day_sin", "month_progress"]

# keyword arguments used to instantiate every LightGBMForecaster
model_kwargs = dict(
    model_params=model_params,
    time_features=time_features,
    exclude_features=exclude_features,
    categorical_features=categorical_features,
    ts_uid_columns=["sku"],
)

***
## Model assessment: huber loss

In [16]:
# Builds one forecaster per site with its train/validation features already prepared,
# so that the search loops only have to copy it, set the parameters and fit.
base_models = list()

# Skus with rows on both sides of the split date. The set does not depend on the site,
# so it is computed once here instead of re-querying the full dataset on every iteration.
skus_both = set(dataset.query("ds <= '2021-03-01'").sku) & set(dataset.query("ds > '2021-03-01'").sku)

for site in ["MLA","MLB","MLM"]:
    
    print("-"*75)
    print(f"Preparing model for site: {site}")
    model = LightGBMForecaster(**model_kwargs)

    tic = time.time()
    train_data = dataset.query("site_id == @site & sku in @skus_both").reset_index(drop=True)
    # rows after the split date are used for validation
    valid_idx = train_data.query("ds > '2021-03-01'").index
    model.prepare_features(train_data, valid_idx)
    # downcast the prepared feature matrices to lower the memory footprint
    model.train_features = reduce_mem_usage(model.train_features, verbose=True)
    model.valid_features = reduce_mem_usage(model.valid_features, verbose=True)
    base_models.append(model)
    gc.collect()
    tac = time.time()
    print(f"Elapsed time for processing features: {(tac-tic)/60} min.")

---------------------------------------------------------------------------
Preparing model for site: MLA
Mem. usage decreased to 142.60 Mb (42.9% reduction)
Mem. usage decreased to 109.13 Mb (51.0% reduction)
Elapsed time for processing features: 0.5590566992759705 min.
---------------------------------------------------------------------------
Preparing model for site: MLB
Mem. usage decreased to 931.03 Mb (40.6% reduction)
Mem. usage decreased to 645.92 Mb (51.0% reduction)
Elapsed time for processing features: 2.6765679836273195 min.
---------------------------------------------------------------------------
Preparing model for site: MLM
Mem. usage decreased to 801.00 Mb (38.5% reduction)
Mem. usage decreased to 509.43 Mb (51.0% reduction)
Elapsed time for processing features: 2.018024524052938 min.


In [17]:
# Grid search over the alpha parameter of the huber objective: one model per site is
# trained for every alpha and the best validation rmsse of each is recorded.
sites = ["MLA","MLB","MLM"]

# The validation rows of a site are the same for every alpha, so the RMSSE evaluators
# are built once here instead of re-filtering the full dataset inside the search loop.
evaluators = list()
for site in sites:
    train_data = dataset.query("site_id == @site & sku in @skus_both").reset_index(drop=True)
    valid_idx = train_data.query("ds > '2021-03-01'").index
    evaluators.append(RMSSE(train_data.loc[valid_idx, ["sku","y"]], scales))

results = list()

for alpha in np.arange(0.2, 2.01, 0.2):

    # work on a copy so the shared base configuration is never modified
    _model_params = dict(model_params)
    _model_params["objective"] = "huber"
    _model_params["alpha"] = alpha

    models_by_site = list()

    for idx,site in enumerate(sites):
        
        print("-"*75)
        print(f"Training model for site: {site}")
        # copy the prepared base model so it can be reused for the next alpha
        model = deepcopy(base_models[idx])
        model.set_params(_model_params)

        tic = time.time()
        model.fit(fit_kwargs={"verbose_eval":50, "feval":evaluators[idx].evaluate})
        models_by_site.append(model)
        tac = time.time()
        print(f"Elapsed time for training the model: {(tac-tic)/60} min.")

    errors = list()
    for site,model in zip(sites,models_by_site):
        error = model.model.model.best_score["valid_0"]["rmsse"]
        errors.append(error)
        print(site, f"error: {error}")

    # one row per alpha: the per-site errors followed by their average
    row = {"alpha":alpha}
    row.update({f"rmsse_{site}":error for site,error in zip(sites,errors)})
    row["rmsse_AVG"] = np.mean(errors)
    results.append(row)

    # release the fitted models before moving to the next alpha
    del models_by_site
    gc.collect()

---------------------------------------------------------------------------
Training model for site: MLA
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 1515336, number of used features: 14
Training until validation scores don't improve for 30 rounds
[50]	valid_0's rmsse: 0.837888
[100]	valid_0's rmsse: 0.826624
[150]	valid_0's rmsse: 0.818954
[200]	valid_0's rmsse: 0.813477
[250]	valid_0's rmsse: 0.809757
[300]	valid_0's rmsse: 0.807328
[350]	valid_0's rmsse: 0.805413
[400]	valid_0's rmsse: 0.803903
[450]	valid_0's rmsse: 0.802727
[500]	valid_0's rmsse: 0.801752
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmsse: 0.801752
Elapsed time for training the model: 2.0044432282447815 min.
---------------------------------------------------------------------------
Training model for site: MLB
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 9024166, number of used features: 14
Training unti

In [19]:
# Results of the search, best (lowest) average rmsse first.
pd.DataFrame(results).sort_values(by="rmsse_AVG")

Unnamed: 0,alpha,rmsse_MLA,rmsse_MLB,rmsse_MLM,rmsse_AVG
5,1.2,0.773727,0.827653,0.757361,0.786247
4,1.0,0.773872,0.828191,0.757241,0.786434
6,1.4,0.774022,0.827604,0.757809,0.786478
7,1.6,0.774106,0.827517,0.75846,0.786695
8,1.8,0.774182,0.827577,0.758727,0.786829
9,2.0,0.774295,0.8277,0.759127,0.787041
3,0.8,0.774786,0.829925,0.758036,0.787583
2,0.6,0.778498,0.833971,0.760988,0.791153
1,0.4,0.785907,0.842851,0.76786,0.798873
0,0.2,0.801752,0.861092,0.783388,0.815411


***
## Model assessment: tweedie loss

In [21]:
# Grid search over the variance power of the tweedie objective: one model per site is
# trained for every power and the best validation rmsse of each is recorded.
sites = ["MLA","MLB","MLM"]

# The validation rows of a site are the same for every power, so the RMSSE evaluators
# are built once here instead of re-filtering the full dataset inside the search loop.
evaluators = list()
for site in sites:
    train_data = dataset.query("site_id == @site & sku in @skus_both").reset_index(drop=True)
    valid_idx = train_data.query("ds > '2021-03-01'").index
    evaluators.append(RMSSE(train_data.loc[valid_idx, ["sku","y"]], scales))

results = list()

# NOTE: float-step arange; per the results table of this notebook the grid is 1.0, 1.1, ..., 1.8
for power in np.arange(1.0, 1.9, 0.1):

    # work on a copy so the shared base configuration is never modified
    _model_params = dict(model_params)
    _model_params["objective"] = "tweedie"
    _model_params["tweedie_variance_power"] = power

    models_by_site = list()

    for idx,site in enumerate(sites):
        
        print("-"*75)
        print(f"Training model for site: {site}")
        # copy the prepared base model so it can be reused for the next power
        model = deepcopy(base_models[idx])
        model.set_params(_model_params)

        tic = time.time()
        model.fit(fit_kwargs={"verbose_eval":50, "feval":evaluators[idx].evaluate})
        models_by_site.append(model)
        tac = time.time()
        print(f"Elapsed time for training the model: {(tac-tic)/60} min.")

    errors = list()
    for site,model in zip(sites,models_by_site):
        error = model.model.model.best_score["valid_0"]["rmsse"]
        errors.append(error)
        print(site, f"error: {error}")

    # one row per power: the per-site errors followed by their average
    row = {"power":power}
    row.update({f"rmsse_{site}":error for site,error in zip(sites,errors)})
    row["rmsse_AVG"] = np.mean(errors)
    results.append(row)

    # release the fitted models before moving to the next power
    del models_by_site
    gc.collect()

---------------------------------------------------------------------------
Training model for site: MLA
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 1515336, number of used features: 14
Training until validation scores don't improve for 30 rounds
[50]	valid_0's rmsse: 0.784647
[100]	valid_0's rmsse: 0.7824
Early stopping, best iteration is:
[75]	valid_0's rmsse: 0.781962
Elapsed time for training the model: 0.40308866103490193 min.
---------------------------------------------------------------------------
Training model for site: MLB
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 9024166, number of used features: 14
Training until validation scores don't improve for 30 rounds
[50]	valid_0's rmsse: 0.848195
[100]	valid_0's rmsse: 0.83674
[150]	valid_0's rmsse: 0.835146
[200]	valid_0's rmsse: 0.834788
[250]	valid_0's rmsse: 0.83472
[300]	valid_0's rmsse: 0.834705
Early stopping, best iteration is:

In [22]:
# Results of the search, best (lowest) average rmsse first.
pd.DataFrame(results).sort_values(by="rmsse_AVG")

Unnamed: 0,power,rmsse_MLA,rmsse_MLB,rmsse_MLM,rmsse_AVG
6,1.6,0.778674,0.831727,0.765566,0.791989
7,1.7,0.778715,0.831915,0.765418,0.792016
8,1.8,0.778741,0.832147,0.765804,0.792231
5,1.5,0.778819,0.831644,0.766254,0.792239
4,1.4,0.779144,0.831644,0.767906,0.792898
3,1.3,0.779806,0.83218,0.770832,0.794272
2,1.2,0.780562,0.8327,0.774321,0.795861
1,1.1,0.781211,0.833423,0.776249,0.796961
0,1.0,0.781962,0.834688,0.778918,0.798523


***