In [1]:
import numpy as np
import pandas as pd
from scipy import interpolate
from scipy import stats
import time
from tqdm import tqdm
import optuna

# Lift the column limit so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

In [2]:
# Directory the input CSV/parquet files are read from (relative path).
input_path = "../data"

***

In [3]:
def ranked_probability_score(y_true, y_pred):
    """
    Mean ranked probability score over a batch of forecasts.

    Both inputs are accumulated along the last axis; for every row the squared
    differences between the two cumulative vectors are summed, and the per-row
    sums are averaged.

    Input
        y_true: array-like of shape (N, D), one row per observation.
                A single 1-D vector of length D is treated as one observation.
        y_pred: array-like with the same layout as y_true.
    Output
        Scalar: the average over rows of sum((cumsum(y_true) - cumsum(y_pred))**2).
    """
    # atleast_2d lets a lone 1-D vector (or nested lists) through; 2-D arrays
    # pass unchanged, so existing callers get exactly the same result.
    true_cdf = np.atleast_2d(y_true).cumsum(axis=1)
    pred_cdf = np.atleast_2d(y_pred).cumsum(axis=1)
    return ((true_cdf - pred_cdf) ** 2).sum(axis=1).mean()

def scoring_function(y_true, y_pred):
    """
    Score per-day probabilities against the observed (1-based) target day.

    Input
        y_true: array-like of N integers (shape (N,) or (N, 1)); entry i is the
                1-based column of y_pred that is the true outcome for row i.
        y_pred: array-like of floats of shape (N, D) (D = 30 in this notebook);
                row i holds the predicted probability of each day.
    Output
        Scalar ranked probability score (see ranked_probability_score).
    """
    y_pred = np.asarray(y_pred)
    # Flatten so that an (N, 1) column is not broadcast against the row index
    # into an (N, N) fancy index, which would mark the wrong cells; np.asarray
    # also lets plain Python lists through (`list - 1` is a TypeError).
    target_days = np.asarray(y_true).ravel()

    y_true_one_hot = np.zeros_like(y_pred, dtype=np.float64)
    # Days are 1-based, columns are 0-based.
    y_true_one_hot[np.arange(len(target_days)), target_days - 1] = 1
    return ranked_probability_score(y_true_one_hot, y_pred)

***
## Loading data

In [4]:
# Auxiliary tables stored next to the training data.
scales = pd.read_csv(f"{input_path}/scales.csv")
skus_for_assess = pd.read_csv(f"{input_path}/skus_for_assess.csv")

# Training history restricted to the SKUs listed in `skus_for_assess`, with
# the `date` / `sold_quantity` columns renamed to `ds` / `y`.
dataset = (
    pd.read_parquet(f"{input_path}/train.parquet")
    .query("sku in @skus_for_assess.sku")
    .rename(columns={"date": "ds", "sold_quantity": "y"})
    .reset_index(drop=True)
)

In [5]:
# Artefacts of an earlier model run ("lgbm-m1", judging by the file names).
# `train_errors` is handed to the InventoryDaysPredictor constructor;
# `oof` is passed to its fit(), which reads the `sku` and `y_pred` columns.
train_errors = pd.read_csv("../results/train_errors_lgbm-m1.csv")
oof = pd.read_csv("../results/oof_preds_lgbm-m1.csv")

***
## Distribution tuning

In [6]:
class InventoryDaysPredictor():
    """
    Turns per-day demand predictions into a distribution over the day
    (1..30) on which a given stock level runs out.

    fit() builds, for every SKU, (a) the mean/std of the historical target
    `y` and (b) an interpolator from cumulative predicted demand to day
    number. predict() reads a point estimate of the stock-out day off that
    interpolator, and predict_proba() spreads it over days 1..30 with a
    generalized normal density (scipy.stats.gennorm).
    """

    def __init__(self, train, train_errors):
        """
        Input
            train: DataFrame with at least the columns `sku` and `y`.
            train_errors: DataFrame kept on the instance; no method of this
                class reads it.
        """
        # Deep copies so later changes to the caller's frames do not leak in.
        self.train = train.copy(deep=True)
        self.train_errors = train_errors.copy(deep=True)

    @staticmethod
    def _fill_degenerate(values):
        """Replace NaN and zero entries of a Series by the Series mean."""
        # Filling NaNs with the mean leaves the mean unchanged, so doing the
        # NaN and the zero replacement in one pass matches doing them in turn.
        return values.mask(values.isna() | (values == 0), values.mean())

    def fit(self, preds):
        """
        Input
            preds: DataFrame with columns `sku` and `y_pred`; each SKU must
                contribute exactly 30 rows (one per day 1..30).
                NOTE(review): rows are assumed to be in day order within each
                SKU -- confirm against the file that produces them.
        Sets
            self.parameters: {"mean": {sku: float}, "std": {sku: float}}
            self.predictors: {sku: interp1d mapping cumulative demand -> day}
        """
        parameters = self.train.groupby('sku')['y'].agg(['mean', 'std'])
        # NaN or zero statistics would make std/mean undefined or collapse the
        # distribution, so they fall back to the average over all SKUs.
        parameters["mean"] = self._fill_degenerate(parameters["mean"])
        # Note: the fallback is the mean of the per-SKU stds, not a pooled std.
        parameters["std"] = self._fill_degenerate(parameters["std"])

        self.parameters = parameters.to_dict()

        predictors = dict()
        days = np.arange(1,31)
        for sku,df in tqdm(preds.groupby("sku")):
            cumpred = df.y_pred.values.cumsum()
            # Outside the covered demand range the interpolator returns
            # -inf / +inf, which predict() clips to day 1 / day 30.
            interp = interpolate.interp1d(cumpred, days, bounds_error=False, fill_value=(-np.inf,np.inf))
            predictors[sku] = interp
        self.predictors = predictors

    def predict(self, sku, stock):
        """
        Output
            (days_to_stockout, std_days): the interpolated stock-out day
            clipped to [1, 30], and the SKU's std/mean ratio of `y`.
        """
        mean = self.parameters['mean'][sku]
        std = self.parameters['std'][sku]
        days_to_stockout = float(np.clip(self.predictors[sku](stock), 1, 30))
        std_days = std/mean
        return days_to_stockout,std_days

    def predict_proba(self, sku, stock, dist_kwargs, lambda1, lambda2):
        """
        Input
            dist_kwargs: extra keyword arguments for stats.gennorm.pdf
                (e.g. {"beta": ...}).
            lambda1, lambda2: the density's scale is
                std_days * lambda1 * days_to_stockout**lambda2.
        Output
            np.array of shape 30 summing to 1: probability of each day 1..30.
        """
        days_to_stockout,std_days = self.predict(sku, stock)
        scale = std_days * (lambda1*(days_to_stockout**lambda2))
        days = np.arange(1,31)
        probs = stats.gennorm.pdf(days, loc=days_to_stockout, scale=scale, **dist_kwargs)
        total = np.sum(probs)
        # Fall back to uniform when the density cannot be normalised: all
        # zeros (underflow), or NaN/inf, which scipy yields for a scale that
        # is zero, negative or NaN. Testing only `== 0` would let NaNs through.
        if not np.isfinite(total) or total <= 0:
            return np.ones(30) / 30
        return probs/total

In [7]:
# Per-SKU mean/std of `y` are computed only from rows with ds <= '2021-03-01';
# fit() then builds one demand->day interpolator per SKU found in `oof`.
predictor = InventoryDaysPredictor(dataset.query("ds <= '2021-03-01'"), train_errors)
predictor.fit(oof)

100%|██████████| 496797/496797 [00:59<00:00, 8378.14it/s]


In [9]:
# Validation set scored by `objective`, which reads its `sku`,
# `target_stock` and `inventory_days` columns.
valid = pd.read_csv(f"{input_path}/validation_seed2.csv")

def objective(trial):
    """
    Optuna objective: ranked probability score on `valid` for one setting of
    the distribution hyper-parameters.

    Input
        trial: optuna trial used to sample
            beta    -- shape parameter passed to the gennorm density,
            lambda1 -- multiplicative factor of the scale,
            lambda2 -- exponent of days_to_stockout in the scale,
            each drawn uniformly from [0.1, 2].
    Output
        Scalar score returned by scoring_function (lower is better).
    """
    # suggest_float with default arguments samples uniformly, like the
    # deprecated suggest_uniform.
    beta = trial.suggest_float("beta", 0.1, 2)
    lambda1 = trial.suggest_float("lambda1", 0.1, 2.)
    lambda2 = trial.suggest_float("lambda2", 0.1, 2.)

    # Walk the rows of `valid` in file order so that prediction i lines up with
    # valid.inventory_days.values[i]. Iterating over groupby("sku") instead
    # yields SKU-sorted, one-per-SKU predictions, which only match the targets
    # when `valid` happens to be sorted by SKU with a single row each.
    preds_proba = np.array([
        predictor.predict_proba(
            sku,
            stock=stock,
            dist_kwargs={"beta":beta},
            lambda1=lambda1,
            lambda2=lambda2,
        )
        for sku, stock in zip(valid["sku"].values, valid["target_stock"].values)
    ])

    rps = scoring_function(valid["inventory_days"].values, preds_proba)
    return rps

In [10]:
# TPESampler is what create_study uses by default; passing it explicitly with
# a seed makes the sequence of suggested parameters repeatable across runs
# (with n_jobs=1). The number of completed trials still depends on the timeout.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stops at whichever comes first: 10000 trials or 36000 s.
study.optimize(objective, n_trials=10000, timeout=36000, n_jobs=1) # 10-hrs

[32m[I 2021-08-21 00:17:29,970][0m A new study created in memory with name: no-name-68310064-166d-4efa-a1e8-ec4eca3c06c3[0m
[32m[I 2021-08-21 00:19:50,252][0m Trial 0 finished with value: 4.267283156474176 and parameters: {'beta': 0.9414920526416849, 'lambda1': 1.527365220916897, 'lambda2': 1.5224512669471926}. Best is trial 0 with value: 4.267283156474176.[0m
[32m[I 2021-08-21 00:22:00,092][0m Trial 1 finished with value: 4.262710740693368 and parameters: {'beta': 1.7845280443580125, 'lambda1': 0.7816960869786289, 'lambda2': 1.8048102148056553}. Best is trial 1 with value: 4.262710740693368.[0m
[32m[I 2021-08-21 00:24:18,549][0m Trial 2 finished with value: 3.972105383643666 and parameters: {'beta': 0.6174985057043849, 'lambda1': 0.2893304823479643, 'lambda2': 1.4052453773422047}. Best is trial 2 with value: 3.972105383643666.[0m
[32m[I 2021-08-21 00:28:07,086][0m Trial 3 finished with value: 4.07150138524541 and parameters: {'beta': 0.6609908007189903, 'lambda1': 0.4179

In [11]:
# The 20 trials with the lowest objective value (ascending sort on `value`).
study.trials_dataframe().sort_values("value").head(20)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_beta,params_lambda1,params_lambda2,state
155,155,3.924111,2021-08-21 07:51:55.495875,2021-08-21 07:54:56.253475,0 days 00:03:00.757600,0.634505,0.458712,0.967726,COMPLETE
196,196,3.924205,2021-08-21 09:45:37.509854,2021-08-21 09:48:18.851541,0 days 00:02:41.341687,0.58568,0.441999,0.951513,COMPLETE
204,204,3.924494,2021-08-21 10:07:16.435821,2021-08-21 10:09:58.841961,0 days 00:02:42.406140,0.532011,0.324181,0.992352,COMPLETE
131,131,3.9248,2021-08-21 06:44:05.054232,2021-08-21 06:46:49.397127,0 days 00:02:44.342895,0.5414,0.31586,1.048542,COMPLETE
116,116,3.924819,2021-08-21 06:03:08.587958,2021-08-21 06:05:51.307363,0 days 00:02:42.719405,0.535528,0.253455,1.08431,COMPLETE
111,111,3.924951,2021-08-21 05:49:27.119562,2021-08-21 05:52:10.935239,0 days 00:02:43.815677,0.620008,0.394614,1.020233,COMPLETE
174,174,3.924971,2021-08-21 08:46:06.419781,2021-08-21 08:48:49.212537,0 days 00:02:42.792756,0.646814,0.528842,0.869632,COMPLETE
197,197,3.924975,2021-08-21 09:48:18.852475,2021-08-21 09:51:00.290400,0 days 00:02:41.437925,0.582123,0.466146,0.932394,COMPLETE
113,113,3.925002,2021-08-21 05:54:55.556621,2021-08-21 05:57:39.911013,0 days 00:02:44.354392,0.601549,0.42058,1.004548,COMPLETE
198,198,3.925212,2021-08-21 09:51:00.291320,2021-08-21 09:53:43.262359,0 days 00:02:42.971039,0.57982,0.465504,0.933417,COMPLETE


***