**TODO:**
- currency conversion
- fix SKUs with USD as currency

In [1]:
import numpy as np
import pandas as pd
import jsonlines
from tqdm import tqdm
from joblib import Parallel, delayed

***
## preparing train data

In [2]:
# Load the raw training data from parquet.
train = pd.read_parquet("../data/train_data.parquet")
# Parse the `date` column into pandas datetimes (assigned in place on the loaded frame).
train["date"] = pd.to_datetime(train["date"])
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000
...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667


In [3]:
# Stream the JSON-lines metadata file into memory, one record per line.
with jsonlines.open('../data/items_static_metadata_full.jl') as reader:
    all_records = [obj for obj in tqdm(reader)]

# Tabulate the records without the title column, then write a CSV copy to disk.
metadata = pd.DataFrame(all_records).drop(columns=["item_title"])
metadata.to_csv("../data/metadata.csv", index=False)
metadata

660916it [00:03, 179466.05it/s]


Unnamed: 0,item_domain_id,item_id,site_id,sku,product_id,product_family_id
0,MLB-SNEAKERS,492155,MLB,0,,MLB15832732
1,MLB-SURFBOARD_RACKS,300279,MLB,1,,
2,MLM-NECKLACES,69847,MLM,2,,
3,MLM-RINGS,298603,MLM,3,,
4,MLB-WEBCAMS,345949,MLB,4,,
...,...,...,...,...,...,...
660911,MLB-CELLPHONE_PARTS,320792,MLB,660911,,
660912,MLM-AUTOMOTIVE_EMBLEMS,90441,MLM,660912,,
660913,MLB-SOFA_AND_FUTON_COVERS,202580,MLB,660913,,
660914,MLB-SNEAKERS,490874,MLB,660914,,


In [4]:
# Attach the static item metadata to every daily training row.
# - how="inner": rows whose sku has no metadata entry are dropped.
# - validate="many_to_one": fail loudly if `sku` is duplicated in `metadata`,
#   instead of silently multiplying the matching training rows.
train = pd.merge(train, metadata, how="inner", on="sku", validate="many_to_one")
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,site_id,product_id,product_family_id
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767,MLA-RADIO_BASE_STATIONS,408559,MLA,,
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333,MLA-PRINTERS,193784,MLA,MLA15159034,MLA15159032
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633,MLA-WRISTWATCHES,110276,MLA,,
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667,MLA-HARD_DRIVES_AND_SSDS,456892,MLA,MLA15697725,MLA15697724


***
## preparing test data

In [5]:
# Columns carried over from the training history into the test rows.
cols = [
    "sku","current_price", "currency", "listing_type", 
    "shipping_logistic_type", "shipping_payment", "minutes_active", 
    "item_domain_id", "item_id", "site_id", "product_id",
    "product_family_id"
]

def get_train_feats(df, look_back=28):
    """Collapse the latest rows of one SKU's history into a single feature record.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one SKU containing every column listed in ``cols``. When a
        ``date`` column is present it is used to order the rows chronologically;
        otherwise the existing row order is used as-is.
    look_back : int, default 28
        Number of most recent rows to summarise.

    Returns
    -------
    dict
        Maps each name in ``cols`` to the first mode of that column within the
        window. Modes are computed per column, so the record may combine
        values that never occurred together on a single row.
    """
    # Do not depend on the caller's row order: `tail` only yields the most
    # recent rows when the frame is sorted by date. The sort is skipped when the
    # rows are already ordered; a stable sort keeps same-day rows in input order.
    if "date" in df.columns and not df["date"].is_monotonic_increasing:
        df = df.sort_values("date", kind="stable")

    record = df.tail(look_back)[cols].mode()
    # `mode` returns one row per tied mode; row 0 holds the first mode of each column.
    return record.to_dict(orient="records")[0]

In [6]:
# Compute one feature record per SKU, spreading the per-group work over six jobs.
delayed_func = delayed(get_train_feats)
with Parallel(n_jobs=6) as parallel:
    all_rows = parallel(
        delayed_func(sku_frame)
        for _sku, sku_frame in tqdm(train.groupby("sku"))
    )

# Tabulate the records, restore an integer sku and turn NaN markers into None.
train_feats = (
    pd.DataFrame(all_rows)
    .astype({"sku": int})
    .replace({np.nan: None})
)

# risky assumption: every SKU is given the full 1440 minutes of activity
train_feats["minutes_active"] = 1440.

train_feats

100%|██████████| 660916/660916 [30:13<00:00, 364.48it/s] 


Unnamed: 0,sku,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,site_id,product_id,product_family_id
0,0,179.99,REA,classic,fulfillment,free_shipping,1440.0,MLB-SNEAKERS,492155.0,MLB,,MLB15832732
1,1,135.90,REA,premium,fulfillment,free_shipping,1440.0,MLB-SURFBOARD_RACKS,300279.0,MLB,,
2,2,219.00,MEX,premium,drop_off,paid_shipping,1440.0,MLM-NECKLACES,69847.0,MLM,,
3,3,399.00,MEX,premium,fulfillment,free_shipping,1440.0,MLM-RINGS,298603.0,MLM,,
4,4,118.00,REA,premium,fulfillment,free_shipping,1440.0,MLB-WEBCAMS,345949.0,MLB,,
...,...,...,...,...,...,...,...,...,...,...,...,...
660911,660911,12.90,REA,classic,fulfillment,paid_shipping,1440.0,MLB-CELLPHONE_PARTS,320792.0,MLB,,
660912,660912,99.00,MEX,classic,fulfillment,paid_shipping,1440.0,MLM-AUTOMOTIVE_EMBLEMS,90441.0,MLM,,
660913,660913,128.08,REA,premium,fulfillment,free_shipping,1440.0,MLB-SOFA_AND_FUTON_COVERS,202580.0,MLB,,
660914,660914,52.99,REA,premium,fulfillment,paid_shipping,1440.0,MLB-SNEAKERS,490874.0,MLB,,


In [7]:
# Share of missing values per column (mean of the boolean NA mask).
train_feats.isna().mean(axis=0)

sku                       0.000000
current_price             0.000000
currency                  0.000000
listing_type              0.000000
shipping_logistic_type    0.000000
shipping_payment          0.000000
minutes_active            0.000000
item_domain_id            0.000005
item_id                   0.000000
site_id                   0.000000
product_id                0.955111
product_family_id         0.874138
dtype: float64

In [8]:
# Load the test SKUs and attach the per-SKU features derived from the training data.
test = pd.read_csv("../data/test_data.csv")
# - how="left": every test row is kept; SKUs without features get missing values.
# - validate="many_to_one": fail loudly if `train_feats` holds more than one row
#   for a sku, instead of silently duplicating test rows.
test = pd.merge(test, train_feats, how="left", on="sku", validate="many_to_one")
test

Unnamed: 0,sku,target_stock,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,site_id,product_id,product_family_id
0,464801,3,169.99,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,344151.0,MLB,MLB9838512,MLB9838510
1,645793,4,164.99,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,438135.0,MLB,,MLB9838510
2,99516,8,22.90,REA,premium,fulfillment,paid_shipping,1440.0,MLB-ADHESIVE_TAPES,221252.0,MLB,,
3,538100,8,28.10,REA,premium,fulfillment,paid_shipping,1440.0,MLB-SCHOOL_AND_OFFICE_GLUES,62099.0,MLB,,
4,557191,10,49.90,REA,premium,fulfillment,paid_shipping,1440.0,MLB-DECORATIVE_VINYLS,168198.0,MLB,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
551467,129187,8,22057.00,ARG,classic,drop_off,free_shipping,1440.0,MLA-RADIO_BASE_STATIONS,408559.0,MLA,,
551468,6707,30,26999.00,ARG,classic,cross_docking,free_shipping,1440.0,MLA-PRINTERS,193784.0,MLA,MLA15159034,MLA15159032
551469,170355,3,3400.00,ARG,classic,drop_off,paid_shipping,1440.0,MLA-WRISTWATCHES,110276.0,MLA,,
551470,246568,2,6289.00,ARG,classic,fulfillment,free_shipping,1440.0,MLA-HARD_DRIVES_AND_SSDS,456892.0,MLA,MLA15697725,MLA15697724


In [9]:
# Replicate the test rows once for each day from 2021-04-01 to 2021-04-30,
# tagging every copy with its date, then stack the copies into one frame.
all_dfs = [
    test.assign(date=day)
    for day in pd.date_range("2021-04-01", "2021-04-30")
]

test = pd.concat(all_dfs, ignore_index=True)
test

Unnamed: 0,sku,target_stock,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,site_id,product_id,product_family_id,date
0,464801,3,169.99,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,344151.0,MLB,MLB9838512,MLB9838510,2021-04-01
1,645793,4,164.99,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,438135.0,MLB,,MLB9838510,2021-04-01
2,99516,8,22.90,REA,premium,fulfillment,paid_shipping,1440.0,MLB-ADHESIVE_TAPES,221252.0,MLB,,,2021-04-01
3,538100,8,28.10,REA,premium,fulfillment,paid_shipping,1440.0,MLB-SCHOOL_AND_OFFICE_GLUES,62099.0,MLB,,,2021-04-01
4,557191,10,49.90,REA,premium,fulfillment,paid_shipping,1440.0,MLB-DECORATIVE_VINYLS,168198.0,MLB,,,2021-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16544155,129187,8,22057.00,ARG,classic,drop_off,free_shipping,1440.0,MLA-RADIO_BASE_STATIONS,408559.0,MLA,,,2021-04-30
16544156,6707,30,26999.00,ARG,classic,cross_docking,free_shipping,1440.0,MLA-PRINTERS,193784.0,MLA,MLA15159034,MLA15159032,2021-04-30
16544157,170355,3,3400.00,ARG,classic,drop_off,paid_shipping,1440.0,MLA-WRISTWATCHES,110276.0,MLA,,,2021-04-30
16544158,246568,2,6289.00,ARG,classic,fulfillment,free_shipping,1440.0,MLA-HARD_DRIVES_AND_SSDS,456892.0,MLA,MLA15697725,MLA15697724,2021-04-30


***
## saving the results

In [10]:
# Extend the module search path with ../utils so the helper module below can be imported.
import sys
sys.path.append("../utils")
# Keep this import after the path change above; presumably `memory` lives in ../utils (confirm).
from memory import reduce_mem_usage

In [11]:
# Pass the merged training frame through the project's memory helper
# (presumably a dtype downcast -- confirm in the `memory` module), then write
# the result to parquet without the index.
train = reduce_mem_usage(train, verbose=True)
train.to_parquet("../data/train-m1.parquet", index=False)

Mem. usage decreased to 3519.73 Mb (18.3% reduction)


In [12]:
# Pass the date-expanded test frame through the project's memory helper
# (presumably a dtype downcast -- confirm in the `memory` module), then write
# the result to parquet without the index.
test = reduce_mem_usage(test, verbose=True)
test.to_parquet("../data/test-m1.parquet", index=False)

Mem. usage decreased to 1420.00 Mb (19.6% reduction)


***