In [1]:
import numpy as np
import pandas as pd
import jsonlines
from tqdm import tqdm
from joblib import Parallel, delayed

In [2]:
# Currency -> USD multipliers (mean over the period of train+test).
# Each constant scales `current_price` for the rows whose `currency`
# column holds the matching code.
MEX_TO_USD = 0.050  # applied to rows with currency == 'MEX'
ARG_TO_USD = 0.011  # applied to rows with currency == 'ARG'
REA_TO_USD = 0.18   # applied to rows with currency == 'REA'

***
## preparing train data

In [3]:
# Load the raw training observations from parquet.
train = pd.read_parquet("../data/train_data.parquet")
# Parse `date` into pandas datetimes (in place, so the large frame is not copied).
train["date"] = pd.to_datetime(train["date"])
# Last expression: show the (truncated) frame as the cell output.
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000
...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667


In [4]:
# Express every price in USD: scale `current_price` by the multiplier that
# matches each row's currency code. Rows with any other code are untouched.
usd_rate_by_currency = {"ARG": ARG_TO_USD, "REA": REA_TO_USD, "MEX": MEX_TO_USD}
for currency_code, usd_rate in usd_rate_by_currency.items():
    in_currency = train["currency"] == currency_code
    train.loc[in_currency, "current_price"] *= usd_rate

# issue with particular gradient
# NOTE(review): overwrites the price at index label 29044628 with the one at
# 29044627; this relies on the frame still carrying the index it was loaded
# with — confirm the labels are still the intended rows if the input changes.
train.loc[29044628, "current_price"] = train.loc[29044627, "current_price"]

# The currency code is no longer needed once all prices share one unit.
train.drop(columns=["currency"], inplace=True)

In [5]:
# Read every record of the JSON-lines metadata file into memory (with a
# progress bar), then build a frame from them without the `item_title` column.
with jsonlines.open('../data/items_static_metadata_full.jl') as reader:
    all_records = [obj for obj in tqdm(reader)]

metadata = pd.DataFrame(all_records).drop(columns=["item_title"])

def fix_item_domain_id(x):
    """Return the second dash-separated field of ``x``.

    For a value such as ``"MLB-SNEAKERS"`` this yields ``"SNEAKERS"``.

    Parameters
    ----------
    x : object
        Usually a string; missing values (``None`` / ``NaN``) are tolerated.

    Returns
    -------
    str or None
        The text between the first and the second ``"-"`` (or up to the end
        of the string), or ``None`` when ``x`` is not a string or contains
        no ``"-"``.
    """
    try:
        return x.split("-")[1]
    except (AttributeError, IndexError):
        # AttributeError: x has no .split (None, NaN, numbers, ...).
        # IndexError: x contains no "-", so there is no second field.
        # Only these are caught so KeyboardInterrupt / SystemExit still
        # propagate instead of being silently turned into None.
        return None

def fix_product_id(x):
    """Return ``x`` without its first three characters.

    For a value such as ``"MLB9838512"`` this yields ``"9838512"``.

    Parameters
    ----------
    x : object
        Usually a string; missing values (``None`` / ``NaN``) are tolerated.

    Returns
    -------
    str or None
        ``x[3:]`` when ``x`` supports slicing, otherwise ``None``.
    """
    try:
        return x[3:]
    except TypeError:
        # Raised when x is not subscriptable (None, NaN, numbers, ...).
        # Only TypeError is caught so KeyboardInterrupt / SystemExit still
        # propagate instead of being silently turned into None.
        return None
    
def fix_product_family_id(x):
    """Return ``x`` without its first three characters.

    For a value such as ``"MLB15832732"`` this yields ``"15832732"``.

    Parameters
    ----------
    x : object
        Usually a string; missing values (``None`` / ``NaN``) are tolerated.

    Returns
    -------
    str or None
        ``x[3:]`` when ``x`` supports slicing, otherwise ``None``.
    """
    try:
        return x[3:]
    except TypeError:
        # Raised when x is not subscriptable (None, NaN, numbers, ...).
        # Only TypeError is caught so KeyboardInterrupt / SystemExit still
        # propagate instead of being silently turned into None.
        return None

# Derive the "_glob" variants of the id columns: each new column is the
# source column passed element-wise through its fixer function.
glob_columns = {
    "item_domain_id_glob": ("item_domain_id", fix_item_domain_id),
    "product_id_glob": ("product_id", fix_product_id),
    "product_family_id_glob": ("product_family_id", fix_product_family_id),
}
for target_col, (source_col, fixer) in glob_columns.items():
    metadata[target_col] = metadata[source_col].apply(fixer)

# Persist the enriched metadata (without the index) and display it.
metadata.to_csv("../data/metadata.csv", index=False)
metadata

660916it [00:03, 195861.55it/s]


Unnamed: 0,item_domain_id,item_id,site_id,sku,product_id,product_family_id,item_domain_id_glob,product_id_glob,product_family_id_glob
0,MLB-SNEAKERS,492155,MLB,0,,MLB15832732,SNEAKERS,,15832732
1,MLB-SURFBOARD_RACKS,300279,MLB,1,,,SURFBOARD_RACKS,,
2,MLM-NECKLACES,69847,MLM,2,,,NECKLACES,,
3,MLM-RINGS,298603,MLM,3,,,RINGS,,
4,MLB-WEBCAMS,345949,MLB,4,,,WEBCAMS,,
...,...,...,...,...,...,...,...,...,...
660911,MLB-CELLPHONE_PARTS,320792,MLB,660911,,,CELLPHONE_PARTS,,
660912,MLM-AUTOMOTIVE_EMBLEMS,90441,MLM,660912,,,AUTOMOTIVE_EMBLEMS,,
660913,MLB-SOFA_AND_FUTON_COVERS,202580,MLB,660913,,,SOFA_AND_FUTON_COVERS,,
660914,MLB-SNEAKERS,490874,MLB,660914,,,SNEAKERS,,


In [6]:
# Number of distinct values in each metadata column.
metadata.nunique()

item_domain_id              8408
item_id                   517896
site_id                        3
sku                       660916
product_id                 15863
product_family_id          29600
item_domain_id_glob         3595
product_id_glob            14631
product_family_id_glob     27272
dtype: int64

In [7]:
# Attach the metadata columns to every training row, matching on `sku`;
# rows whose sku is absent from `metadata` are dropped (inner join).
train = train.merge(metadata, on="sku", how="inner")
train

Unnamed: 0,sku,date,sold_quantity,current_price,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,site_id,product_id,product_family_id,item_domain_id_glob,product_id_glob,product_family_id_glob
0,464801,2021-02-01,0,28.2204,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510,NEBULIZERS,9838512,9838510
1,464801,2021-02-02,0,28.2204,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510,NEBULIZERS,9838512,9838510
2,464801,2021-02-03,0,28.2204,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510,NEBULIZERS,9838512,9838510
3,464801,2021-02-04,0,28.2204,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510,NEBULIZERS,9838512,9838510
4,464801,2021-02-05,1,28.2204,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510,NEBULIZERS,9838512,9838510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,242.6270,classic,drop_off,free_shipping,267.710767,MLA-RADIO_BASE_STATIONS,408559,MLA,,,RADIO_BASE_STATIONS,,
37660275,6707,2021-03-31,0,296.9890,classic,cross_docking,free_shipping,266.083333,MLA-PRINTERS,193784,MLA,MLA15159034,MLA15159032,PRINTERS,15159034,15159032
37660276,170355,2021-03-31,0,37.4000,classic,drop_off,paid_shipping,0.252633,MLA-WRISTWATCHES,110276,MLA,,,WRISTWATCHES,,
37660277,246568,2021-03-31,0,69.1790,classic,fulfillment,free_shipping,135.416667,MLA-HARD_DRIVES_AND_SSDS,456892,MLA,MLA15697725,MLA15697724,HARD_DRIVES_AND_SSDS,15697725,15697724


***
## preparing test data

In [8]:
# Columns carried from the training history into the per-sku feature record.
# The order here is the key order of the dict returned by `get_train_feats`.
cols = [
    "sku",
    "current_price",
    "listing_type",
    "shipping_logistic_type",
    "shipping_payment",
    "minutes_active",
    "item_domain_id",
    "item_id",
    "site_id",
    "product_id",
    "product_family_id",
    "item_domain_id_glob",
    "product_family_id_glob",
    "product_id_glob",
]


def get_train_feats(df, look_back=28):
    """Summarise the last ``look_back`` rows of ``df`` as one record.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows holding at least the columns listed in ``cols``.
        NOTE(review): presumably the rows of a single sku in chronological
        order, so that ``tail`` picks the most recent days — confirm.
    look_back : int, default 28
        Number of trailing rows to consider.

    Returns
    -------
    dict
        Column name -> value taken from the first row of the column-wise
        mode of the trailing rows.
        NOTE(review): when columns have different numbers of modes the
        shorter ones appear to be padded with NaN, which may turn integer
        columns into floats — verify against the pandas documentation.
    """
    recent_rows = df.tail(look_back)
    column_modes = recent_rows[cols].mode()
    records = column_modes.to_dict(orient="records")
    return records[0]

In [9]:
# Compute one feature record per sku, fanning the per-sku frames out to
# 6 joblib workers; tqdm tracks how many groups have been dispatched.
sku_groups = (sku_frame for _, sku_frame in tqdm(train.groupby("sku")))
with Parallel(n_jobs=6) as parallel:
    all_rows = parallel(delayed(get_train_feats)(sku_frame) for sku_frame in sku_groups)

# Assemble the records, force `sku` to int and swap NaN for None.
train_feats = pd.DataFrame(all_rows)
train_feats["sku"] = train_feats["sku"].astype(int)
train_feats = train_feats.replace({np.nan: None})

train_feats

100%|██████████| 660916/660916 [27:41<00:00, 397.82it/s]


Unnamed: 0,sku,current_price,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,site_id,product_id,product_family_id,item_domain_id_glob,product_family_id_glob,product_id_glob
0,0,32.3982,classic,fulfillment,free_shipping,1440.0,MLB-SNEAKERS,492155.0,MLB,,MLB15832732,SNEAKERS,15832732,
1,1,24.4620,premium,fulfillment,free_shipping,1440.0,MLB-SURFBOARD_RACKS,300279.0,MLB,,,SURFBOARD_RACKS,,
2,2,10.9500,premium,drop_off,paid_shipping,1440.0,MLM-NECKLACES,69847.0,MLM,,,NECKLACES,,
3,3,19.9500,premium,fulfillment,free_shipping,0.0,MLM-RINGS,298603.0,MLM,,,RINGS,,
4,4,21.2400,premium,fulfillment,free_shipping,0.0,MLB-WEBCAMS,345949.0,MLB,,,WEBCAMS,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660911,660911,2.3220,classic,fulfillment,paid_shipping,1440.0,MLB-CELLPHONE_PARTS,320792.0,MLB,,,CELLPHONE_PARTS,,
660912,660912,4.9500,classic,fulfillment,paid_shipping,1440.0,MLM-AUTOMOTIVE_EMBLEMS,90441.0,MLM,,,AUTOMOTIVE_EMBLEMS,,
660913,660913,23.0544,premium,fulfillment,free_shipping,1440.0,MLB-SOFA_AND_FUTON_COVERS,202580.0,MLB,,,SOFA_AND_FUTON_COVERS,,
660914,660914,9.5382,premium,fulfillment,paid_shipping,1440.0,MLB-SNEAKERS,490874.0,MLB,,,SNEAKERS,,


In [10]:
# Fraction of missing values per feature column.
train_feats.isna().mean()

sku                       0.000000
current_price             0.000000
listing_type              0.000000
shipping_logistic_type    0.000000
shipping_payment          0.000000
minutes_active            0.000000
item_domain_id            0.000005
item_id                   0.000000
site_id                   0.000000
product_id                0.955111
product_family_id         0.874138
item_domain_id_glob       0.000005
product_family_id_glob    0.874138
product_id_glob           0.955111
dtype: float64

In [11]:
# Load the test skus and attach the per-sku features; the left join keeps
# every test row even when no features exist for its sku.
test = pd.read_csv("../data/test_data.csv").merge(train_feats, how="left", on="sku")
test

Unnamed: 0,sku,target_stock,current_price,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,site_id,product_id,product_family_id,item_domain_id_glob,product_family_id_glob,product_id_glob
0,464801,3,30.5982,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151.0,MLB,MLB9838512,MLB9838510,NEBULIZERS,9838510,9838512
1,645793,4,29.6982,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,438135.0,MLB,,MLB9838510,NEBULIZERS,9838510,
2,99516,8,4.1220,premium,fulfillment,paid_shipping,1440.000000,MLB-ADHESIVE_TAPES,221252.0,MLB,,,ADHESIVE_TAPES,,
3,538100,8,5.0580,premium,fulfillment,paid_shipping,1440.000000,MLB-SCHOOL_AND_OFFICE_GLUES,62099.0,MLB,,,SCHOOL_AND_OFFICE_GLUES,,
4,557191,10,8.9820,premium,fulfillment,paid_shipping,0.000000,MLB-DECORATIVE_VINYLS,168198.0,MLB,,,DECORATIVE_VINYLS,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551467,129187,8,242.6270,classic,drop_off,free_shipping,267.710767,MLA-RADIO_BASE_STATIONS,408559.0,MLA,,,RADIO_BASE_STATIONS,,
551468,6707,30,296.9890,classic,cross_docking,free_shipping,266.083333,MLA-PRINTERS,193784.0,MLA,MLA15159034,MLA15159032,PRINTERS,15159032,15159034
551469,170355,3,37.4000,classic,drop_off,paid_shipping,0.252633,MLA-WRISTWATCHES,110276.0,MLA,,,WRISTWATCHES,,
551470,246568,2,69.1790,classic,fulfillment,free_shipping,135.416667,MLA-HARD_DRIVES_AND_SSDS,456892.0,MLA,MLA15697725,MLA15697724,HARD_DRIVES_AND_SSDS,15697724,15697725


In [12]:
# Replicate the test frame once per calendar day from 2021-04-01 through
# 2021-04-30, tagging each copy with its day in a new `date` column.
all_dfs = [
    test.assign(date=date)
    for date in pd.date_range("2021-04-01", "2021-04-30")
]

# Stack the daily copies into one long frame with a fresh 0..n-1 index.
test = pd.concat(all_dfs, ignore_index=True)
test

Unnamed: 0,sku,target_stock,current_price,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,site_id,product_id,product_family_id,item_domain_id_glob,product_family_id_glob,product_id_glob,date
0,464801,3,30.5982,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,344151.0,MLB,MLB9838512,MLB9838510,NEBULIZERS,9838510,9838512,2021-04-01
1,645793,4,29.6982,classic,fulfillment,free_shipping,1440.000000,MLB-NEBULIZERS,438135.0,MLB,,MLB9838510,NEBULIZERS,9838510,,2021-04-01
2,99516,8,4.1220,premium,fulfillment,paid_shipping,1440.000000,MLB-ADHESIVE_TAPES,221252.0,MLB,,,ADHESIVE_TAPES,,,2021-04-01
3,538100,8,5.0580,premium,fulfillment,paid_shipping,1440.000000,MLB-SCHOOL_AND_OFFICE_GLUES,62099.0,MLB,,,SCHOOL_AND_OFFICE_GLUES,,,2021-04-01
4,557191,10,8.9820,premium,fulfillment,paid_shipping,0.000000,MLB-DECORATIVE_VINYLS,168198.0,MLB,,,DECORATIVE_VINYLS,,,2021-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16544155,129187,8,242.6270,classic,drop_off,free_shipping,267.710767,MLA-RADIO_BASE_STATIONS,408559.0,MLA,,,RADIO_BASE_STATIONS,,,2021-04-30
16544156,6707,30,296.9890,classic,cross_docking,free_shipping,266.083333,MLA-PRINTERS,193784.0,MLA,MLA15159034,MLA15159032,PRINTERS,15159032,15159034,2021-04-30
16544157,170355,3,37.4000,classic,drop_off,paid_shipping,0.252633,MLA-WRISTWATCHES,110276.0,MLA,,,WRISTWATCHES,,,2021-04-30
16544158,246568,2,69.1790,classic,fulfillment,free_shipping,135.416667,MLA-HARD_DRIVES_AND_SSDS,456892.0,MLA,MLA15697725,MLA15697724,HARD_DRIVES_AND_SSDS,15697724,15697725,2021-04-30


***
## saving the results

In [13]:
import sys
sys.path.append("../utils")
from memory import reduce_mem_usage

In [14]:
# Pass the merged train frame through the project helper `reduce_mem_usage`
# (from the `memory` module under ../utils, not visible here — presumably it
# downcasts column dtypes; verify), then write the result to parquet without
# the index.
train = reduce_mem_usage(train, verbose=True)
train.to_parquet("../data/train-m1.parquet", index=False)

Mem. usage decreased to 4094.38 Mb (16.2% reduction)


In [15]:
# Pass the expanded test frame through the project helper `reduce_mem_usage`
# (from the `memory` module under ../utils, not visible here — presumably it
# downcasts column dtypes; verify), then write the result to parquet without
# the index.
test = reduce_mem_usage(test, verbose=True)
test.to_parquet("../data/test-m1.parquet", index=False)

Mem. usage decreased to 1672.44 Mb (17.2% reduction)


***