***
## notebook config

In [1]:
# Environment switch: when True, the following cells mount Google Drive, install
# dependencies and download the dataset; when False, local relative paths are used.
ON_COLAB = False

In [2]:
if ON_COLAB:
    from google.colab import drive
    from google.colab import files
    drive.mount('/content/drive', force_remount=True)
    
    !pip install --upgrade kaggle > /dev/null 2>&1
    !mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
    
    !free -h
    
    !pip install --upgrade category_encoders > /dev/null 2>&1
    !pip install --upgrade tsforest > /dev/null 2>&1
    !pip install --upgrade lightgbm > /dev/null 2>&1
    !pip install --upgrade optuna > /dev/null 2>&1

In [3]:
if ON_COLAB:
    !kaggle datasets download -d mavillan/meli-2021 --force --unzip
    !ls -halt
    input_path = "./"
    print("input_path:", input_path)
    subs_path = "/content/drive/MyDrive/meli2021/subs"
    print("subs_path:", subs_path)
    results_path = "/content/drive/MyDrive/meli2021/results"
    print("results_path:", results_path)
else:
    input_path = "../data"
    print("input_path:", input_path)
    subs_path = "../subs"
    print("subs_path:", subs_path)
    results_path = "../results"
    print("results_path:", results_path)

input_path: ../data
subs_path: ../subs
results_path: ../results


***

In [4]:
from pathlib import Path

import category_encoders as ce
import dill
import numpy as np
import pandas as pd
import yaml

# Seed NumPy's global random generator so NumPy-based randomness is repeatable across runs.
np.random.seed(2)

***
## data loading

In [5]:
# Load the SKU lists and auxiliary tables used to build the training sets.
# Every file is resolved against `input_path`, so the cell follows the same location
# setting as the CSV/parquet reads in both the local and the Colab configuration.
# NOTE(review): yaml.FullLoader accepts more than plain data; if these files only hold
# plain lists, yaml.safe_load is the more conservative choice — confirm before switching.
with open(f"{input_path}/skus_assess_m1.yaml", "r") as file:
    skus_assess_m1 = yaml.load(file, Loader=yaml.FullLoader)
print(f"len(skus_assess_m1): {len(skus_assess_m1)}")

with open(f"{input_path}/skus_assess_m2.yaml", "r") as file:
    skus_assess_m2 = yaml.load(file, Loader=yaml.FullLoader)
print(f"len(skus_assess_m2): {len(skus_assess_m2)}")

# SKUs listed in this table are excluded from the stage-2 training frame.
unpredictable = pd.read_csv(f"{input_path}/unpredictable.csv")
print(f"len(unpredictable): {len(unpredictable)}")

# Only the `sku` column of the test file is kept, as a NumPy array.
skus_for_test = pd.read_csv(f"{input_path}/test_data.csv").sku.values
print(f"len(skus_for_test): {len(skus_for_test)}")

len(skus_assess_m1): 482635
len(skus_assess_m2): 272130
len(unpredictable): 25314
len(skus_for_test): 551472


***
## training encoder for stage1 model

In [6]:
# Stage-1 training frame: assessment SKUs only, dates up to `limit_date`,
# and only rows with minutes_active > 0.
limit_date = "2021-03-01"

train_stg1 = (
    pd.read_parquet(f"{input_path}/train-m1.parquet")
    .query("sku in @skus_assess_m1")
    .rename(columns={"date": "ds", "sold_quantity": "y"})
    .sort_values(by=["sku", "ds"])
    .query("ds <= @limit_date and minutes_active > 0")
    .reset_index(drop=True)
)

In [7]:
# Size of the stage-1 training frame, in millions of rows.
train_stg1.shape[0] / 1e6

11.357927

In [8]:
# Number of distinct SKUs left in the stage-1 training frame.
train_stg1["sku"].nunique()

482635

In [9]:
# Columns handed to the encoder in the next step.
categ_cols = [
    "sku",
    "item_domain_id",
    "item_domain_id_glob",
    "item_id",
    "product_id",
    "product_id_glob",
    "product_family_id",
    "product_family_id_glob",
]

# Cast each of them to pandas' categorical dtype, one column at a time.
for col in categ_cols:
    train_stg1[col] = train_stg1[col].astype("category")

# Per column: number of distinct values, then the fraction of missing values.
display(
    train_stg1[categ_cols].nunique(),
    train_stg1[categ_cols].isna().sum(axis=0) / len(train_stg1),
)

sku                       482635
item_domain_id              8132
item_domain_id_glob         3536
item_id                   381376
product_id                 11182
product_id_glob            10503
product_family_id          23713
product_family_id_glob     22114
dtype: int64

sku                       0.000000
item_domain_id            0.000000
item_domain_id_glob       0.000000
item_id                   0.000000
product_id                0.967004
product_id_glob           0.967004
product_family_id         0.887731
product_family_id_glob    0.887731
dtype: float64

In [10]:
%%time
# Fit a category_encoders GLMMEncoder on the categorical columns of the stage-1 frame,
# using `y` as the target. This is slow: the recorded run of this cell took about
# 40 minutes of wall time.
encoder = ce.GLMMEncoder(verbose=True, cols=categ_cols)
encoder.fit(train_stg1[categ_cols], train_stg1["y"])

  elif pd.api.types.is_categorical(cols):


CPU times: user 39min 59s, sys: 27.3 s, total: 40min 27s
Wall time: 40min 27s


GLMMEncoder(cols=['sku', 'item_domain_id', 'item_domain_id_glob', 'item_id',
                  'product_id', 'product_id_glob', 'product_family_id',
                  'product_family_id_glob'],
            verbose=True)

In [11]:
# Sanity check: apply the fitted encoder back to the training columns and inspect the output.
encoder.transform(train_stg1.loc[:, categ_cols])

Unnamed: 0,sku,item_domain_id,item_domain_id_glob,item_id,product_id,product_id_glob,product_family_id,product_family_id_glob
0,-0.588666,-0.305412,-0.336626,-0.990623,-1.258883,-1.196536,-0.446041,-0.441191
1,-0.588666,-0.305412,-0.336626,-0.990623,-1.258883,-1.196536,-0.446041,-0.441191
2,-0.588666,-0.305412,-0.336626,-0.990623,-1.258883,-1.196536,-0.446041,-0.441191
3,-0.588666,-0.305412,-0.336626,-0.990623,-1.258883,-1.196536,-0.446041,-0.441191
4,-0.588666,-0.305412,-0.336626,-0.990623,-1.258883,-1.196536,-0.446041,-0.441191
...,...,...,...,...,...,...,...,...
11357922,0.943782,28.868654,24.170060,0.118177,-1.258883,-1.196536,-0.446041,-0.441191
11357923,0.943782,28.868654,24.170060,0.118177,-1.258883,-1.196536,-0.446041,-0.441191
11357924,0.943782,28.868654,24.170060,0.118177,-1.258883,-1.196536,-0.446041,-0.441191
11357925,0.943782,28.868654,24.170060,0.118177,-1.258883,-1.196536,-0.446041,-0.441191


In [12]:
# Persist the fitted stage-1 encoder with dill.
encoder_stg1_path = Path("../encoders/encoder-stg1.dill")
# Create the target folder first so a missing directory cannot discard a long-running fit.
encoder_stg1_path.parent.mkdir(parents=True, exist_ok=True)
with encoder_stg1_path.open("wb") as file:
    # The `with` block closes the file on exit; no explicit close() is needed.
    dill.dump(encoder, file)

***
## training encoder for stage2 model

In [19]:
# Stage-2 training frame: every SKU not listed in `unpredictable`, no date cut-off,
# and only rows with minutes_active > 0.
train_stg2 = (
    pd.read_parquet(f"{input_path}/train-m1.parquet")
    .query("sku not in @unpredictable.sku")
    # (a further `sku in @skus_for_test` restriction is left switched off)
    .rename(columns={"date": "ds", "sold_quantity": "y"})
    .sort_values(by=["sku", "ds"])
    .loc[lambda frame: frame["minutes_active"] > 0]
    .reset_index(drop=True)
)

In [20]:
# Size of the stage-2 training frame, in millions of rows.
train_stg2.shape[0] / 1e6

27.549525

In [21]:
# Number of distinct SKUs left in the stage-2 training frame.
train_stg2["sku"].nunique()

635602

In [22]:
# Columns handed to the encoder in the next step.
categ_cols = [
    "sku",
    "item_domain_id",
    "item_domain_id_glob",
    "item_id",
    "product_id",
    "product_id_glob",
    "product_family_id",
    "product_family_id_glob",
]

# Cast each of them to pandas' categorical dtype, one column at a time.
for col in categ_cols:
    train_stg2[col] = train_stg2[col].astype("category")

# Per column: number of distinct values, then the fraction of missing values.
display(
    train_stg2[categ_cols].nunique(),
    train_stg2[categ_cols].isna().sum(axis=0) / len(train_stg2),
)

sku                       635602
item_domain_id              8372
item_domain_id_glob         3586
item_id                   497506
product_id                 15227
product_id_glob            14073
product_family_id          28815
product_family_id_glob     26600
dtype: int64

sku                       0.000000
item_domain_id            0.000002
item_domain_id_glob       0.000002
item_id                   0.000000
product_id                0.963261
product_id_glob           0.963261
product_family_id         0.883680
product_family_id_glob    0.883680
dtype: float64

In [23]:
%%time
# Fit a new GLMMEncoder on the categorical columns of the stage-2 frame, using `y` as
# the target; this rebinds `encoder`, replacing the stage-1 object in the namespace.
# This is slow: the recorded run of this cell took about 1 h 5 min of wall time.
encoder = ce.GLMMEncoder(verbose=True, cols=categ_cols)
encoder.fit(train_stg2[categ_cols], train_stg2["y"])

  elif pd.api.types.is_categorical(cols):


CPU times: user 1h 3min 52s, sys: 1min 2s, total: 1h 4min 55s
Wall time: 1h 4min 55s


GLMMEncoder(cols=['sku', 'item_domain_id', 'item_domain_id_glob', 'item_id',
                  'product_id', 'product_id_glob', 'product_family_id',
                  'product_family_id_glob'],
            verbose=True)

In [24]:
# Sanity check: apply the fitted encoder back to the training columns and inspect the output.
encoder.transform(train_stg2.loc[:, categ_cols])

Unnamed: 0,sku,item_domain_id,item_domain_id_glob,item_id,product_id,product_id_glob,product_family_id,product_family_id_glob
0,-1.287728,-0.414751,-0.470194,-1.359623,-1.447952,-1.443393,-1.510371,-1.518757
1,-1.287728,-0.414751,-0.470194,-1.359623,-1.447952,-1.443393,-1.510371,-1.518757
2,-1.287728,-0.414751,-0.470194,-1.359623,-1.447952,-1.443393,-1.510371,-1.518757
3,-1.287728,-0.414751,-0.470194,-1.359623,-1.447952,-1.443393,-1.510371,-1.518757
4,-1.287728,-0.414751,-0.470194,-1.359623,-1.447952,-1.443393,-1.510371,-1.518757
...,...,...,...,...,...,...,...,...
27549520,0.551952,22.557327,18.523302,0.501309,-1.447952,-1.443393,-0.485357,-0.494855
27549521,0.551952,22.557327,18.523302,0.501309,-1.447952,-1.443393,-0.485357,-0.494855
27549522,0.551952,22.557327,18.523302,0.501309,-1.447952,-1.443393,-0.485357,-0.494855
27549523,0.551952,22.557327,18.523302,0.501309,-1.447952,-1.443393,-0.485357,-0.494855


In [25]:
# Persist the fitted stage-2 encoder with dill.
encoder_stg2_path = Path("../encoders/encoder-stg2.dill")
# Create the target folder first so a missing directory cannot discard a long-running fit.
encoder_stg2_path.parent.mkdir(parents=True, exist_ok=True)
with encoder_stg2_path.open("wb") as file:
    # The `with` block closes the file on exit; no explicit close() is needed.
    dill.dump(encoder, file)

***