In [1]:
import numpy as np
import pandas as pd
from scipy import interpolate
from scipy import stats
import time
from tqdm import tqdm
import optuna
import yaml

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

In [2]:
# Directory holding the input files read below (scales.csv, skus_for_assess.csv,
# train.parquet, validation_seed2.csv), relative to the working directory.
input_path = "../data"

***

In [3]:
def ranked_probability_score(y_true, y_pred):
    """
    Mean ranked probability score (RPS) of a batch of forecasts.

    For every row the cumulative sums of `y_true` and `y_pred` along the day
    axis are compared, the squared differences are summed over the days, and
    the per-row sums are averaged.

    Input
        y_true: array-like of shape (N, D) or (D,). Target distribution over
            the D days of each row.
        y_pred: array-like of shape (N, D) or (D,). Predicted probability of
            each day.
    Output
        Float. Mean score over the N rows; a 1-D input is scored as one row.
    """
    # atleast_2d lets a single forecast of shape (D,) go through the same
    # axis=1 reductions as a batch of shape (N, D).
    y_true = np.atleast_2d(np.asarray(y_true))
    y_pred = np.atleast_2d(np.asarray(y_pred))
    cdf_gap = y_true.cumsum(axis=1) - y_pred.cumsum(axis=1)
    return (cdf_gap**2).sum(axis=1).mean()

def scoring_function(y_true, y_pred):
    """
    Ranked probability score of predicted day distributions against the
    observed day numbers.

    Input
        y_true: List / array of Ints of shape N or Nx1. 1-based day number
            observed for each row.
        y_pred: List / array of float of shape NxD (D = 30 in this notebook).
            Contain the prob for each day
    Output
        Float. Value of `ranked_probability_score` for the one-hot encoded
        `y_true` and `y_pred`.
    Raises
        ValueError if `y_true` contains a day number below 1.
    """
    y_pred = np.asarray(y_pred)
    # Flatten so that a plain list, a 1-D array and an Nx1 column all yield
    # exactly one column index per row (an Nx1 index would broadcast against
    # the row index and mark an NxN grid of cells instead).
    day_index = np.asarray(y_true).reshape(-1) - 1
    # A day number of 0 would become index -1 and silently mark the last day.
    if (day_index < 0).any():
        raise ValueError("y_true must contain 1-based day numbers (>= 1)")
    y_true_one_hot = np.zeros_like(y_pred, dtype=np.float64)
    y_true_one_hot[np.arange(len(day_index)), day_index] = 1
    return ranked_probability_score(y_true_one_hot, y_pred)

***
## loading data

In [4]:
# Auxiliary tables shipped with the input data.
scales = pd.read_csv(f"{input_path}/scales.csv")
skus_for_assess = pd.read_csv(f"{input_path}/skus_for_assess.csv")

# Training history, restricted to the skus listed in `skus_for_assess` and with
# the date / target columns renamed to `ds` / `y`.
dataset = (
    pd.read_parquet(f"{input_path}/train.parquet")
    .loc[lambda frame: frame["sku"].isin(skus_for_assess["sku"])]
    .rename(columns={"date": "ds", "sold_quantity": "y"})
    .reset_index(drop=True)
)

In [5]:
# Artifacts of the "lgbm-m1" run stored under ../results
# (presumably written by an earlier modelling notebook — confirm).
# `train_errors` is only handed to InventoryDaysPredictor, which keeps a copy of it.
train_errors = pd.read_csv("../results/train_errors_lgbm-m1.csv")
# `oof` must provide `sku` and `y_pred` columns: InventoryDaysPredictor.fit groups
# by the former and cumulates the latter.
oof = pd.read_csv("../results/oof_preds_lgbm-m1.csv")

***
## distribution tuning

In [6]:
class InventoryDaysPredictor():
    """
    Estimates on which day the stock of a sku runs out, both as a point
    estimate and as a probability distribution over days 1..horizon.

    `fit` builds, per sku, an interpolator that maps a stock level to the day
    at which the cumulative predicted sales reach it. `predict_proba` centres
    a generalized normal distribution on that day.
    """

    def __init__(self, train, train_errors, horizon=30):
        """
        Input
            train: DataFrame with columns `sku` and `y`.
            train_errors: DataFrame. Stored as a copy; not read by any method
                of this class.
            horizon: Int. Number of days covered by the predictions.
        """
        self.train = train.copy(deep=True)
        self.train_errors = train_errors.copy(deep=True)
        self.horizon = int(horizon)

    @staticmethod
    def _impute_invalid(values):
        """Return a copy of `values` with NaN entries, then zero entries, replaced by the series mean."""
        values = values.copy()
        # the mean is re-evaluated for the second step, i.e. after the NaNs were filled
        values[values.isna()] = values.mean()
        values[values == 0] = values.mean()
        return values

    def fit(self, preds):
        """
        Compute the per-sku statistics of `train.y` and the per-sku
        stock -> day interpolators.

        Input
            preds: DataFrame with columns `sku` and `y_pred`. Every sku needs
                exactly `horizon` rows (one per day); rows are presumably in
                chronological order — confirm against the producer of `preds`.
        """
        parameters = self.train.groupby('sku')['y'].agg(['mean', 'std'])
        # NaN or zero means / stds are replaced by the average over all skus,
        # so that std/mean in `predict` stays defined.
        parameters['mean'] = self._impute_invalid(parameters['mean'])
        parameters['std'] = self._impute_invalid(parameters['std'])
        self.parameters = parameters.to_dict()

        predictors = dict()
        days = np.arange(1, self.horizon + 1)
        for sku,df in tqdm(preds.groupby("sku")):
            cumpred = df.y_pred.values.cumsum()
            # A stock below / above the cumulative range maps to -inf / +inf,
            # which `predict` clips to day 1 / day `horizon`.
            predictors[sku] = interpolate.interp1d(
                cumpred, days, bounds_error=False, fill_value=(-np.inf,np.inf))
        self.predictors = predictors

    def predict(self, sku, stock):
        """
        Input
            sku: key of a sku seen by `fit` (KeyError otherwise).
            stock: stock level to be depleted.
        Output
            (days_to_stockout, std_days): the interpolated day at which the
            cumulative predicted sales reach `stock`, clipped to
            [1, horizon], and std/mean of the sku's training `y`.
        """
        mean = self.parameters['mean'][sku]
        std = self.parameters['std'][sku]
        days_to_stockout = float(np.clip(self.predictors[sku](stock), a_min=1, a_max=self.horizon))
        std_days = std/mean
        return days_to_stockout,std_days

    def predict_proba(self, sku, stock, dist_kwargs, lambda1, lambda2):
        """
        Input
            sku, stock: forwarded to `predict`.
            dist_kwargs: Dict of extra keyword arguments for
                `scipy.stats.gennorm.pdf` (e.g. {"beta": ...}).
            lambda1, lambda2: Floats. The scale of the distribution is
                std_days * lambda1 * days_to_stockout**lambda2.
        Output
            np.array of shape `horizon` summing to 1. Uniform when the density
            cannot be normalised (its sum is zero or not finite).
        """
        days_to_stockout,std_days = self.predict(sku, stock)
        scale = std_days * (lambda1*(days_to_stockout**lambda2))
        days = np.arange(1, self.horizon + 1)
        probs = stats.gennorm.pdf(days, loc=days_to_stockout, scale=scale, **dist_kwargs)
        total = np.sum(probs)
        # A NaN location or scale makes every density NaN; comparing the sum
        # with 0 alone would let that NaN vector through.
        if not np.isfinite(total) or total <= 0:
            return np.full(self.horizon, 1.0 / self.horizon)
        return probs/total

In [8]:
# Validation set; `objective` reads its `sku`, `target_stock` and `inventory_days` columns.
valid = pd.read_csv(f"{input_path}/validation_seed2.csv")

def objective(trial):
    """
    Optuna objective: score of the stock-out day distributions on `valid`.

    Samples the shape of the generalized normal distribution (`beta`), the two
    scale hyper-parameters (`lambda1`, `lambda2`) and a multiplicative
    correction of the out-of-fold predictions (`corr_factor`), fits an
    `InventoryDaysPredictor` with them and scores its probabilities against
    `valid.inventory_days`.

    Reads the notebook-level `oof`, `dataset`, `train_errors` and `valid`.

    Input
        trial: optuna trial used to sample the hyper-parameters.
    Output
        Float. Value returned by `scoring_function`.
    """
    # suggest_float is the non-deprecated equivalent of suggest_uniform;
    # parameter names and ranges are kept as they were.
    beta = trial.suggest_float("beta", 0.1, 1.)
    lambda1 = trial.suggest_float("lambda1", 0.1, 2.)
    lambda2 = trial.suggest_float("lambda2", 0.1, 2.)
    corr_factor = trial.suggest_float("corr_factor", 1., 1.36)

    # scale a copy so the notebook-level `oof` stays untouched between trials
    _oof = oof.copy(deep=True)
    _oof["y_pred"] = corr_factor*_oof["y_pred"]
    predictor = InventoryDaysPredictor(dataset.query("ds <= '2021-03-01'"), train_errors)
    predictor.fit(_oof)

    # Predict row by row, in the row order of `valid`, so that row i of
    # `preds_proba` always lines up with valid.inventory_days[i]. Iterating
    # valid.groupby("sku") would yield the skus in sorted order, which matches
    # the row order only when `valid` is already sorted by sku.
    preds_proba = np.array([
        predictor.predict_proba(
            sku,
            stock=stock,
            dist_kwargs={"beta":beta},
            lambda1=lambda1,
            lambda2=lambda2,
        )
        for sku, stock in zip(valid["sku"].values, valid["target_stock"].values)
    ])

    rps = scoring_function(valid.inventory_days.values, preds_proba)
    return rps

In [9]:
# Seed the sampler so the sequence of suggested hyper-parameters is repeatable.
# TPESampler is what create_study uses by default for a single objective, so
# only the seeding changes, not the search algorithm.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
# Stops at whichever limit is hit first: 10000 trials or 12 hours.
study.optimize(objective, n_trials=10000, timeout=12 * 60 * 60, n_jobs=1) # 12-hrs

[32m[I 2021-08-24 23:55:44,151][0m A new study created in memory with name: no-name-093884a8-1ee6-4e19-84e7-80d8b007b598[0m
100%|██████████| 496797/496797 [00:57<00:00, 8621.15it/s]
[32m[I 2021-08-24 23:59:00,992][0m Trial 0 finished with value: 4.0133629383405385 and parameters: {'beta': 0.8487962437732475, 'lambda1': 1.5753361009766287, 'lambda2': 0.3467721935141598, 'corr_factor': 1.2405759526317681}. Best is trial 0 with value: 4.0133629383405385.[0m
100%|██████████| 496797/496797 [00:58<00:00, 8432.53it/s]
[32m[I 2021-08-25 00:02:31,790][0m Trial 1 finished with value: 4.100841803165583 and parameters: {'beta': 0.42577820406268685, 'lambda1': 1.0234466324837155, 'lambda2': 1.1623865668087083, 'corr_factor': 1.3302995724122562}. Best is trial 0 with value: 4.0133629383405385.[0m
100%|██████████| 496797/496797 [00:56<00:00, 8780.42it/s]
[32m[I 2021-08-25 00:05:49,184][0m Trial 2 finished with value: 4.034868256009756 and parameters: {'beta': 0.3923711092390012, 'lambda1':

In [10]:
# The 20 trials with the lowest objective value (best first, since the study minimizes).
study.trials_dataframe().sort_values("value").head(20)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_beta,params_corr_factor,params_lambda1,params_lambda2,state
144,144,3.883853,2021-08-25 08:48:13.241799,2021-08-25 08:52:55.884042,0 days 00:04:42.642243,0.560762,1.20253,0.547647,0.867899,COMPLETE
137,137,3.88389,2021-08-25 08:09:56.361399,2021-08-25 08:15:14.115247,0 days 00:05:17.753848,0.570422,1.204522,0.52479,0.850746,COMPLETE
182,182,3.883934,2021-08-25 11:46:21.686370,2021-08-25 11:50:46.682796,0 days 00:04:24.996426,0.531934,1.192431,0.489705,0.833911,COMPLETE
150,150,3.884334,2021-08-25 09:15:54.314901,2021-08-25 09:20:25.618509,0 days 00:04:31.303608,0.558041,1.188219,0.492349,0.899743,COMPLETE
149,149,3.884353,2021-08-25 09:11:29.632094,2021-08-25 09:15:54.313614,0 days 00:04:24.681520,0.566027,1.193197,0.496275,0.901008,COMPLETE
151,151,3.884391,2021-08-25 09:20:25.622646,2021-08-25 09:26:15.881617,0 days 00:05:50.258971,0.566073,1.188428,0.505835,0.89148,COMPLETE
152,152,3.884554,2021-08-25 09:26:15.893201,2021-08-25 09:31:45.967342,0 days 00:05:30.074141,0.564081,1.186089,0.484461,0.900332,COMPLETE
166,166,3.884697,2021-08-25 10:31:02.747565,2021-08-25 10:35:22.008819,0 days 00:04:19.261254,0.531705,1.209883,0.521591,0.873778,COMPLETE
180,180,3.88499,2021-08-25 11:37:49.770996,2021-08-25 11:42:08.652542,0 days 00:04:18.881546,0.53499,1.190358,0.482532,0.81514,COMPLETE
167,167,3.885043,2021-08-25 10:35:22.012156,2021-08-25 10:40:55.067433,0 days 00:05:33.055277,0.538172,1.212774,0.544028,0.87136,COMPLETE


In [14]:
# Persist the best hyper-parameters found by the study as block-style YAML.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("../config/dist_hparams.yml", "w") as file:
    yaml.dump(study.best_params, file, default_flow_style=False)

***