***
## notebook config

In [1]:
ON_COLAB = True

In [2]:
if ON_COLAB:
    from google.colab import drive
    from google.colab import files
    drive.mount('/content/drive', force_remount=True)
    
    !pip install --upgrade kaggle > /dev/null 2>&1
    !mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
    
    !free -h
    
    !pip install --upgrade category_encoders > /dev/null 2>&1
    !pip install --upgrade jsonlines > /dev/null 2>&1
    !pip install --upgrade pandarallel > /dev/null 2>&1

Mounted at /content/drive
              total        used        free      shared  buff/cache   available
Mem:            35G        970M         31G        1.1M        2.3G         33G
Swap:            0B          0B          0B


In [3]:
if ON_COLAB:
    !kaggle datasets download -d mavillan/meli-2021 --force --unzip
    !ls -halt
    input_path = "./"
    print("input_path:", input_path)
    output_path = "/content/drive/MyDrive/meli2021/data"
    print("output_path:", output_path)
else:
    input_path = "../data"
    print("input_path:", input_path)
    output_path = "../data"
    print("output_path:", output_path)

Downloading meli-2021.zip to /content
 97% 299M/309M [00:01<00:00, 231MB/s]
100% 309M/309M [00:01<00:00, 213MB/s]
total 518M
drwxr-xr-x 1 root root 4.0K Aug 22 00:25 .
-rw-r--r-- 1 root root 5.7M Aug 22 00:25 validation_seed23.csv
-rw-r--r-- 1 root root 5.7M Aug 22 00:25 validation_seed2.csv
-rw-r--r-- 1 root root 5.7M Aug 22 00:25 validation_seed19.csv
-rw-r--r-- 1 root root 5.7M Aug 22 00:25 validation_seed17.csv
-rw-r--r-- 1 root root 169K Aug 22 00:25 unpredictable.csv
-rw-r--r-- 1 root root  78M Aug 22 00:25 train_data.parquet
-rw-r--r-- 1 root root  78M Aug 22 00:25 train.parquet
-rw-r--r-- 1 root root 4.9M Aug 22 00:25 test_data.csv
-rw-r--r-- 1 root root 223M Aug 22 00:25 test.parquet
-rw-r--r-- 1 root root 3.3M Aug 22 00:25 skus_for_assess.csv
-rw-r--r-- 1 root root  16M Aug 22 00:25 scales.csv
-rw-r--r-- 1 root root  94M Aug 22 00:25 sample_submission.csv
drwx------ 5 root root 4.0K Aug 22 00:24 drive
drwxr-xr-x 1 root root 4.0K Aug 21 23:59 ..
drwxr-xr-x 1 root root 4.0K Aug

***

In [4]:
import numpy as np
import pandas as pd
import jsonlines
from tqdm import tqdm
from joblib import Parallel, delayed
import category_encoders as ce
from pandarallel import pandarallel

pd.set_option('display.max_columns', None)
pandarallel.initialize()

  import pandas.util.testing as tm


INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes
    
    Parameters
    ----------
    df: pandas.Dataframe
    verbose: Boolean
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

***
## preparing train data

In [6]:
train = pd.read_parquet(f"{input_path}/train_data.parquet")
train["date"] = pd.to_datetime(train["date"])
train["weekday"] = (train.date.dt.weekday+1).astype(str)
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,weekday
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,2
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,3
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,4
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000,5
...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767,3
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333,3
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633,3
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667,3


In [7]:
train.groupby("sku")["sold_quantity"].mean()

sku
0         0.186441
1         0.372881
2         0.043478
3         0.135593
4         0.661017
            ...   
660911    0.322034
660912    0.237288
660913    0.169492
660914    0.389831
660915    0.745763
Name: sold_quantity, Length: 660916, dtype: float64

In [8]:
train["tmp"] = train.eval("sold_quantity*minutes_active")
q_mean = (train.groupby("sku")["tmp"].sum() / train.groupby("sku")["minutes_active"].sum()).reset_index(name="q_mean")
train = train.merge(q_mean, how="inner", on="sku")
train.drop("tmp", axis=1, inplace=True)

train["tmp"]  = train.eval("minutes_active * (sold_quantity - q_mean)**2")
q_std = (train.groupby("sku")["tmp"].sum() / train.groupby("sku")["minutes_active"].sum()).reset_index(name="q_std")
train = train.merge(q_std, how="inner", on="sku")
train.drop("tmp", axis=1, inplace=True)

In [9]:
train.loc[:,["currency","listing_type","shipping_logistic_type","shipping_payment"]].nunique()

currency                  4
listing_type              2
shipping_logistic_type    3
shipping_payment          2
dtype: int64

In [10]:
encoder = ce.OneHotEncoder(cols=["weekday","listing_type","shipping_logistic_type","shipping_payment"], use_cat_names=True)
train = encoder.fit_transform(train)
train

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type_classic,listing_type_premium,shipping_logistic_type_fulfillment,shipping_logistic_type_cross_docking,shipping_logistic_type_drop_off,shipping_payment_free_shipping,shipping_payment_paid_shipping,minutes_active,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,q_mean,q_std
0,464801,2021-02-01,0,156.78,REA,1,0,1,0,0,1,0,1440.000000,1,0,0,0,0,0,0,0.254237,0.562482
1,464801,2021-02-02,0,156.78,REA,1,0,1,0,0,1,0,1440.000000,0,1,0,0,0,0,0,0.254237,0.562482
2,464801,2021-02-03,0,156.78,REA,1,0,1,0,0,1,0,1440.000000,0,0,1,0,0,0,0,0.254237,0.562482
3,464801,2021-02-04,0,156.78,REA,1,0,1,0,0,1,0,1440.000000,0,0,0,1,0,0,0,0.254237,0.562482
4,464801,2021-02-05,1,156.78,REA,1,0,1,0,0,1,0,1440.000000,0,0,0,0,1,0,0,0.254237,0.562482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,1,0,0,0,1,1,0,267.710767,0,0,1,0,0,0,0,0.000000,0.000000
37660275,6707,2021-03-31,0,26999.00,ARG,1,0,0,1,0,1,0,266.083333,0,0,1,0,0,0,0,0.000000,0.000000
37660276,170355,2021-03-31,0,3400.00,ARG,1,0,0,0,1,0,1,0.252633,0,0,1,0,0,0,0,0.000000,0.000000
37660277,246568,2021-03-31,0,6289.00,ARG,1,0,1,0,0,1,0,135.416667,0,0,1,0,0,0,0,0.000000,0.000000


In [11]:
test = pd.read_csv(f"{input_path}/test_data.csv")
train = pd.merge(train, test, how="inner", on="sku")
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type_classic,listing_type_premium,shipping_logistic_type_fulfillment,shipping_logistic_type_cross_docking,shipping_logistic_type_drop_off,shipping_payment_free_shipping,shipping_payment_paid_shipping,minutes_active,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,q_mean,q_std,target_stock
0,464801,2021-02-01,0,156.78,REA,1,0,1,0,0,1,0,1440.000000,1,0,0,0,0,0,0,0.254237,0.562482,3
1,464801,2021-02-02,0,156.78,REA,1,0,1,0,0,1,0,1440.000000,0,1,0,0,0,0,0,0.254237,0.562482,3
2,464801,2021-02-03,0,156.78,REA,1,0,1,0,0,1,0,1440.000000,0,0,1,0,0,0,0,0.254237,0.562482,3
3,464801,2021-02-04,0,156.78,REA,1,0,1,0,0,1,0,1440.000000,0,0,0,1,0,0,0,0.254237,0.562482,3
4,464801,2021-02-05,1,156.78,REA,1,0,1,0,0,1,0,1440.000000,0,0,0,0,1,0,0,0.254237,0.562482,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31284422,129187,2021-03-31,0,22057.00,ARG,1,0,0,0,1,1,0,267.710767,0,0,1,0,0,0,0,0.000000,0.000000,8
31284423,6707,2021-03-31,0,26999.00,ARG,1,0,0,1,0,1,0,266.083333,0,0,1,0,0,0,0,0.000000,0.000000,30
31284424,170355,2021-03-31,0,3400.00,ARG,1,0,0,0,1,0,1,0.252633,0,0,1,0,0,0,0,0.000000,0.000000,3
31284425,246568,2021-03-31,0,6289.00,ARG,1,0,1,0,0,1,0,135.416667,0,0,1,0,0,0,0,0.000000,0.000000,2


In [None]:
cols_sum = [
    "listing_type_classic",
    "listing_type_premium",
    "shipping_logistic_type_fulfillment",
    "shipping_logistic_type_cross_docking",
    "shipping_logistic_type_drop_off",
    "shipping_payment_free_shipping",
    "shipping_payment_paid_shipping",
    "weekday_1",
    "weekday_2",
    "weekday_3",
    "weekday_4",
    "weekday_5",
    "weekday_6",
    "weekday_7",    
]
cols_mean = [
    "minutes_active",
]

def compute_features(df):
    target_stock = df.target_stock.values[0]
    start_index = df.index.min()
    end_index = df.index.max()

    if len(df) < 30:
        return None

    inventory_days = list()
    sum_values = list()
    mean_values = list()

    for i in df.index:
        if i+29 > end_index: break

        df_slice = df.loc[i:i+29]
        cumsum = df_slice.sold_quantity.values.cumsum()
        idxs = np.argwhere(cumsum >= target_stock)
        if len(idxs)==0:
            idays = np.inf
        else:
            idays = np.min(idxs) + 1

        inventory_days.append(idays)
        sum_values.append(df_slice[cols_sum].sum(axis=0).values)
        mean_values.append(df_slice[cols_mean].mean(axis=0).values)

    df = df.head(len(inventory_days)).copy()
    df["inventory_days"] = inventory_days
    df[cols_sum] = np.asarray(sum_values)
    df[cols_mean] = np.asarray(mean_values)
    
    return df

with Parallel(n_jobs=-1) as parallel:
    delayed_func = delayed(compute_features)
    all_dfs = parallel(delayed_func(df) for _,df in tqdm(train.groupby("sku")))

 92%|█████████▏| 506685/551472 [41:41<1:09:21, 10.76it/s]

In [None]:
train = pd.concat(filter(lambda x: x is not None, all_dfs), ignore_index=True)
train

In [None]:
# todo: impute zero q_mean & q_std with stats from item_domain_id

 23%|██▎       | 125784/551472 [44:41<2:42:12, 43.74it/s]

In [None]:
all_records = list()
with jsonlines.open('../data/items_static_metadata_full.jl') as reader:
    for obj in tqdm(reader):
        all_records.append(obj)
        
metadata = pd.DataFrame(all_records)
metadata.drop(["item_title"], axis=1, inplace=True)
metadata

In [None]:
train = pd.merge(train, metadata, how="inner", on="sku")
train

In [None]:
train = reduce_mem_usage(train, verbose=True)
train.to_parquet("../data/train-m2.parquet", index=False)

***
## preparing test data

In [None]:
cols = ["sku","current_price", "currency", "listing_type", "shipping_logistic_type", 
        "shipping_payment", "minutes_active", "available", "item_domain_id", 
        "item_id", "site_id", "product_id", "product_family_id"]

def get_train_feats(df, look_back=7):
    row = dict()
    for col in cols:
        counts = df.tail(look_back).loc[:,col].value_counts()
        if len(counts) == 0:
            row[col] = None 
        else:
            row[col] = counts.index[0]  
    return row

In [None]:
all_rows = list()
    
with Parallel(n_jobs=6) as parallel:
    delayed_func = delayed(get_train_feats)
    all_rows = parallel(delayed_func(df) for _,df in tqdm(train.groupby("sku")))

train_feats = pd.DataFrame(all_rows)
train_feats

In [None]:
test = pd.read_csv("../data/test_data.csv")
test = pd.merge(test, train_feats, how="left", on="sku")
test

In [None]:
all_dfs = list()

for date in pd.date_range("2021-04-01", "2021-04-30"):
    _test = test.copy(deep=True)
    _test["date"] = date
    all_dfs.append(_test)
    
test = pd.concat(all_dfs, ignore_index=True)
test

In [None]:
test = reduce_mem_usage(test, verbose=True)
test.to_parquet("../data/test-m2.parquet", index=False)

***