***
## notebook config

In [1]:
# Execution-environment switch: True enables the Google Colab branches of the
# setup cells (Drive mount, pip installs, Kaggle download); False selects the
# local relative paths.
ON_COLAB = False

In [2]:
# Colab-only bootstrap; the whole cell is a no-op when ON_COLAB is False.
if ON_COLAB:
    from google.colab import drive
    from google.colab import files
    # Mount Google Drive (forcing a remount); the Kaggle token is copied from it below.
    drive.mount('/content/drive', force_remount=True)
    
    # Install the Kaggle CLI, then copy the API token from Drive into ~/.kaggle/
    # and restrict it to owner read/write (chmod 600).
    !pip install --upgrade kaggle > /dev/null 2>&1
    !mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
    
    # Print the machine's memory usage in human-readable units.
    !free -h
    
    # NOTE(review): these installs are unpinned (--upgrade) and both stdout and
    # stderr are sent to /dev/null, so a failed install is silent here.
    !pip install --upgrade category_encoders > /dev/null 2>&1
    !pip install --upgrade tsforest > /dev/null 2>&1
    !pip install --upgrade lightgbm > /dev/null 2>&1
    !pip install --upgrade optuna > /dev/null 2>&1

In [3]:
if ON_COLAB:
    !kaggle datasets download -d mavillan/meli-2021 --force --unzip
    !ls -halt
    input_path = "./"
    print("input_path:", input_path)
    subs_path = "/content/drive/MyDrive/meli2021/subs"
    print("subs_path:", subs_path)
    results_path = "/content/drive/MyDrive/meli2021/results"
    print("results_path:", results_path)
else:
    input_path = "../data"
    print("input_path:", input_path)
    subs_path = "../subs"
    print("subs_path:", subs_path)
    results_path = "../results"
    print("results_path:", results_path)

input_path: ../data
subs_path: ../subs
results_path: ../results


***

In [4]:
import numpy as np
import pandas as pd
import category_encoders as ce
import yaml
import dill

# Seed NumPy's global random state so NumPy-based randomness in later cells is
# repeatable across runs.
np.random.seed(2)

***
## data loading

In [5]:
# Load the SKU collections and helper tables used by the cells below.
# Every input is read from `input_path`, so this cell follows the environment
# selected in the config section instead of assuming "../data".
# The `with` blocks close the files on exit; no explicit close() is needed.
# NOTE(review): yaml.load with FullLoader is kept unchanged; yaml.safe_load
# would be the stricter choice if these files could come from an untrusted source.
with open(f"{input_path}/skus_assess_m1.yaml", "r") as file:
    skus_assess_m1 = yaml.load(file, Loader=yaml.FullLoader)
print(f"len(skus_assess_m1): {len(skus_assess_m1)}")

with open(f"{input_path}/skus_assess_m2.yaml", "r") as file:
    skus_assess_m2 = yaml.load(file, Loader=yaml.FullLoader)
print(f"len(skus_assess_m2): {len(skus_assess_m2)}")

# Table whose `sku` column lists the SKUs excluded from the stage-2 training frame.
unpredictable = pd.read_csv(f"{input_path}/unpredictable.csv")
print(f"len(unpredictable): {len(unpredictable)}")

# SKU values taken from the `sku` column of the test file.
skus_for_test = pd.read_csv(f"{input_path}/test_data.csv").sku.values
print(f"len(skus_for_test): {len(skus_for_test)}")

len(skus_assess_m1): 482635
len(skus_assess_m2): 272130
len(unpredictable): 25283
len(skus_for_test): 551472


***
## training encoder for stage1 model

In [7]:
# Last date (inclusive) kept in the stage-1 training frame.
limit_date = "2021-03-01"

# Stage-1 training frame: rows of train-m1 whose SKU is in skus_assess_m1,
# with date/sold_quantity renamed to ds/y, ordered by (sku, ds), and limited
# to rows on or before limit_date that have minutes_active > 0.
train_stg1 = (
    pd.read_parquet(f"{input_path}/train-m1.parquet")
    .query("sku in @skus_assess_m1")
    .rename(columns={"date": "ds", "sold_quantity": "y"})
    .sort_values(by=["sku", "ds"])
    .query("ds <= @limit_date and minutes_active > 0")
    .reset_index(drop=True)
)

In [9]:
# Stage-1 training rows, in millions.
train_stg1.shape[0] / 1e6

11.357927

In [10]:
# Number of distinct SKUs in the stage-1 training frame.
train_stg1["sku"].nunique()

482635

In [11]:
# Columns handed to the encoder in the following cells.
categ_cols = [
    "sku",
    "item_domain_id",
    "item_id",
    "product_id",
    "product_family_id",
]

# Cast each of them to pandas' categorical dtype, one column at a time.
for col in categ_cols:
    train_stg1[col] = train_stg1[col].astype("category")

# Distinct values per column, then the fraction of missing values per column.
display(train_stg1[categ_cols].nunique())
display(train_stg1[categ_cols].isna().mean())

sku                  482635
item_domain_id         8132
item_id              381376
product_id            11182
product_family_id     23713
dtype: int64

sku                  0.000000
item_domain_id       0.000000
item_id              0.000000
product_id           0.967004
product_family_id    0.887731
dtype: float64

In [12]:
%%time
encoder = ce.GLMMEncoder(verbose=True, cols=categ_cols)
encoder.fit(train_stg1[categ_cols], train_stg1["y"])

  elif pd.api.types.is_categorical(cols):


GLMMEncoder(cols=['sku', 'item_domain_id', 'item_id', 'product_id',
                  'product_family_id'],
            verbose=True)

In [13]:
# Apply the fitted encoder to the training columns to inspect the encoded values.
encoder.transform(train_stg1.loc[:, categ_cols])

Unnamed: 0,sku,item_domain_id,item_id,product_id,product_family_id
0,-0.588666,-0.305412,-0.990623,-1.258883,-0.446041
1,-0.588666,-0.305412,-0.990623,-1.258883,-0.446041
2,-0.588666,-0.305412,-0.990623,-1.258883,-0.446041
3,-0.588666,-0.305412,-0.990623,-1.258883,-0.446041
4,-0.588666,-0.305412,-0.990623,-1.258883,-0.446041
...,...,...,...,...,...
11357922,0.943782,28.868654,0.118177,-1.258883,-0.446041
11357923,0.943782,28.868654,0.118177,-1.258883,-0.446041
11357924,0.943782,28.868654,0.118177,-1.258883,-0.446041
11357925,0.943782,28.868654,0.118177,-1.258883,-0.446041


In [15]:
# Serialize the fitted stage-1 encoder with dill.
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): the output directory is hard-coded relative to the notebook,
# unlike input_path / subs_path / results_path — confirm this is intended on Colab.
with open("../encoders/encoder-stg1.dill", "wb") as file:
    dill.dump(encoder, file)

***
## training encoder for stage2 model

In [6]:
# Stage-2 training frame: every row of train-m1 whose SKU is not listed in
# unpredictable.sku, with date/sold_quantity renamed to ds/y, ordered by
# (sku, ds), keeping only rows with minutes_active > 0. No date cut-off is
# applied here.
train_stg2 = (
    pd.read_parquet(f"{input_path}/train-m1.parquet")
    .query("sku not in @unpredictable.sku")
    # Disabled alternative filter, kept for reference:
    #.query("sku in @skus_for_test")
    .rename(columns={"date": "ds", "sold_quantity": "y"})
    .sort_values(by=["sku", "ds"])
    .query("minutes_active > 0")
    .reset_index(drop=True)
)

In [7]:
# Stage-2 training rows, in millions.
train_stg2.shape[0] / 1e6

27.549554

In [8]:
# Number of distinct SKUs in the stage-2 training frame.
train_stg2["sku"].nunique()

635631

In [10]:
# Columns handed to the encoder in the following cells.
categ_cols = [
    "sku",
    "item_domain_id",
    "item_id",
    "product_id",
    "product_family_id",
]

# Cast each of them to pandas' categorical dtype, one column at a time.
for col in categ_cols:
    train_stg2[col] = train_stg2[col].astype("category")

# Distinct values per column, then the fraction of missing values per column.
display(train_stg2[categ_cols].nunique())
display(train_stg2[categ_cols].isna().mean())

sku                  635631
item_domain_id         8372
item_id              497535
product_id            15237
product_family_id     28816
dtype: int64

sku                  0.000000
item_domain_id       0.000002
item_id              0.000000
product_id           0.963260
product_family_id    0.883679
dtype: float64

In [11]:
%%time
encoder = ce.GLMMEncoder(verbose=True, cols=categ_cols)
encoder.fit(train_stg2[categ_cols], train_stg2["y"])

  elif pd.api.types.is_categorical(cols):


CPU times: user 47min 25s, sys: 28.7 s, total: 47min 54s
Wall time: 47min 54s


GLMMEncoder(cols=['sku', 'item_domain_id', 'item_id', 'product_id',
                  'product_family_id'],
            verbose=True)

In [12]:
# Apply the fitted encoder to the training columns to inspect the encoded values.
encoder.transform(train_stg2.loc[:, categ_cols])

Unnamed: 0,sku,item_domain_id,item_id,product_id,product_family_id
0,-1.287854,-0.414753,-1.359761,-1.4508,-1.510405
1,-1.287854,-0.414753,-1.359761,-1.4508,-1.510405
2,-1.287854,-0.414753,-1.359761,-1.4508,-1.510405
3,-1.287854,-0.414753,-1.359761,-1.4508,-1.510405
4,-1.287854,-0.414753,-1.359761,-1.4508,-1.510405
...,...,...,...,...,...
27549549,0.551831,22.557324,0.501171,-1.4508,-0.485575
27549550,0.551831,22.557324,0.501171,-1.4508,-0.485575
27549551,0.551831,22.557324,0.501171,-1.4508,-0.485575
27549552,0.551831,22.557324,0.501171,-1.4508,-0.485575


In [13]:
# Serialize the fitted stage-2 encoder with dill.
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): the output directory is hard-coded relative to the notebook,
# unlike input_path / subs_path / results_path — confirm this is intended on Colab.
with open("../encoders/encoder-stg2.dill", "wb") as file:
    dill.dump(encoder, file)

***