In [1]:
import numpy as np
import pandas as pd
import jsonlines
from tqdm import tqdm
from joblib import Parallel, delayed

***
## preparing train data

In [2]:
train = pd.read_parquet("../data/train_data.parquet")
train["date"] = pd.to_datetime(train["date"])
train["available"] = 1
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767,1
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333,1
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633,1
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667,1


In [3]:
def fill_empty(df):
    df = pd.merge(date_range, df, how="left", on="date")
    df["sold_quantity"] = df.sold_quantity.fillna(0).astype(int)
    df["minutes_active"] = df.sold_quantity.fillna(0).astype(int)
    df["available"] = df.available.fillna(0).astype(int)
    df[cols_to_fill] = df[cols_to_fill].fillna(method="ffill").fillna(method="bfill")
    df["sku"] = df["sku"].astype(int)
    df["sold_quantity"] = df["sold_quantity"].astype(int)
    df["available"] = df["available"].astype(int)
    return df

In [4]:
# fills with missing dates
date_range = pd.DataFrame(pd.date_range("2021-02-01", "2021-03-31"), columns=["date"])

cols_to_fill = [
    "sku", "current_price", "currency", "listing_type", 
    "shipping_logistic_type", "shipping_payment",
]

with Parallel(n_jobs=-1) as parallel:
    delayed_func = delayed(fill_empty)
    all_dfs = parallel(delayed_func(df) for _,df in tqdm(train.groupby("sku")))
    
train = pd.concat(all_dfs, ignore_index=True)
train

100%|██████████| 660916/660916 [29:56<00:00, 367.84it/s]  


Unnamed: 0,date,sku,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available
0,2021-02-01,0,0,172.79,REA,classic,cross_docking,free_shipping,0,1
1,2021-02-02,0,0,172.79,REA,classic,cross_docking,free_shipping,0,1
2,2021-02-03,0,0,179.99,REA,classic,cross_docking,free_shipping,0,1
3,2021-02-04,0,0,179.99,REA,classic,cross_docking,free_shipping,0,1
4,2021-02-05,0,0,179.99,REA,classic,cross_docking,free_shipping,0,1
...,...,...,...,...,...,...,...,...,...,...
38994039,2021-03-27,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0,1
38994040,2021-03-28,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0,1
38994041,2021-03-29,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0,1
38994042,2021-03-30,660915,0,99.99,MEX,classic,fulfillment,paid_shipping,0,1


In [5]:
all_records = list()
with jsonlines.open('../data/items_static_metadata_full.jl') as reader:
    for obj in tqdm(reader):
        all_records.append(obj)
        
metadata = pd.DataFrame(all_records)
metadata.drop(["item_title"], axis=1, inplace=True)
metadata

660916it [00:06, 105300.84it/s]


Unnamed: 0,item_domain_id,item_id,site_id,sku,product_id,product_family_id
0,MLB-SNEAKERS,492155,MLB,0,,MLB15832732
1,MLB-SURFBOARD_RACKS,300279,MLB,1,,
2,MLM-NECKLACES,69847,MLM,2,,
3,MLM-RINGS,298603,MLM,3,,
4,MLB-WEBCAMS,345949,MLB,4,,
...,...,...,...,...,...,...
660911,MLB-CELLPHONE_PARTS,320792,MLB,660911,,
660912,MLM-AUTOMOTIVE_EMBLEMS,90441,MLM,660912,,
660913,MLB-SOFA_AND_FUTON_COVERS,202580,MLB,660913,,
660914,MLB-SNEAKERS,490874,MLB,660914,,


In [6]:
train = pd.merge(train, metadata, how="inner", on="sku")
train

Unnamed: 0,date,sku,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available,item_domain_id,item_id,site_id,product_id,product_family_id
0,2021-02-01,0,0,172.79,REA,classic,cross_docking,free_shipping,0,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
1,2021-02-02,0,0,172.79,REA,classic,cross_docking,free_shipping,0,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
2,2021-02-03,0,0,179.99,REA,classic,cross_docking,free_shipping,0,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
3,2021-02-04,0,0,179.99,REA,classic,cross_docking,free_shipping,0,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
4,2021-02-05,0,0,179.99,REA,classic,cross_docking,free_shipping,0,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38994039,2021-03-27,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994040,2021-03-28,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994041,2021-03-29,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994042,2021-03-30,660915,0,99.99,MEX,classic,fulfillment,paid_shipping,0,1,MLM-SURGICAL_MASKS,423179,MLM,,


***
## preparing test data

In [28]:
cols = ["sku","current_price", "currency", "listing_type", "shipping_logistic_type", 
        "shipping_payment", "minutes_active", "available", "item_domain_id", 
        "item_id", "site_id", "product_id", "product_family_id"]

def get_train_feats(df, look_back=14):
    return df.tail(look_back).loc[:,cols].mode(axis=0)

In [29]:
all_rows = list()
    
with Parallel(n_jobs=-1) as parallel:
    delayed_func = delayed(get_train_feats)
    all_rows = parallel(delayed_func(df) for _,df in tqdm(train.groupby("sku")))

train_feats = pd.concat(all_rows, ignore_index=True)

100%|██████████| 660916/660916 [22:13<00:00, 495.62it/s]  


In [31]:
test = pd.read_csv("../data/test_data.csv")
test = pd.merge(test, train_feats, how="left", on="sku")
test

Unnamed: 0,sku,target_stock,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available,item_domain_id,item_id,site_id,product_id,product_family_id
0,464801,3,169.99,REA,classic,fulfillment,free_shipping,0.0,1.0,MLB-NEBULIZERS,344151.0,MLB,MLB9838512,MLB9838510
1,645793,4,164.99,REA,classic,fulfillment,free_shipping,0.0,1.0,MLB-NEBULIZERS,438135.0,MLB,,MLB9838510
2,99516,8,22.90,REA,premium,fulfillment,paid_shipping,0.0,1.0,MLB-ADHESIVE_TAPES,221252.0,MLB,,
3,538100,8,28.10,REA,premium,fulfillment,paid_shipping,0.0,1.0,MLB-SCHOOL_AND_OFFICE_GLUES,62099.0,MLB,,
4,557191,10,49.90,REA,premium,fulfillment,paid_shipping,0.0,1.0,MLB-DECORATIVE_VINYLS,168198.0,MLB,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551467,129187,8,22057.00,ARG,classic,drop_off,free_shipping,0.0,0.0,MLA-RADIO_BASE_STATIONS,408559.0,MLA,,
551468,6707,30,26999.00,ARG,classic,cross_docking,free_shipping,0.0,0.0,MLA-PRINTERS,193784.0,MLA,MLA15159034,MLA15159032
551469,170355,3,3400.00,ARG,classic,drop_off,paid_shipping,0.0,0.0,MLA-WRISTWATCHES,110276.0,MLA,,
551470,246568,2,6289.00,ARG,classic,fulfillment,free_shipping,0.0,0.0,MLA-HARD_DRIVES_AND_SSDS,456892.0,MLA,MLA15697725,MLA15697724


In [51]:
all_dfs = list()

for date in pd.date_range("2021-04-01", "2021-04-30"):
    _test = test.copy(deep=True)
    _test["date"] = date
    all_dfs.append(_test)
    
test = pd.concat(all_dfs, ignore_index=True)
test

Unnamed: 0,sku,target_stock,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available,item_domain_id,item_id,site_id,product_id,product_family_id,date
0,464801,3,169.990005,REA,classic,fulfillment,free_shipping,0.0,1.0,MLB-NEBULIZERS,344151.0,MLB,MLB9838512,MLB9838510,2021-04-01
1,645793,4,164.990005,REA,classic,fulfillment,free_shipping,0.0,1.0,MLB-NEBULIZERS,438135.0,MLB,,MLB9838510,2021-04-01
2,99516,8,22.900000,REA,premium,fulfillment,paid_shipping,0.0,1.0,MLB-ADHESIVE_TAPES,221252.0,MLB,,,2021-04-01
3,538100,8,28.100000,REA,premium,fulfillment,paid_shipping,0.0,1.0,MLB-SCHOOL_AND_OFFICE_GLUES,62099.0,MLB,,,2021-04-01
4,557191,10,49.900002,REA,premium,fulfillment,paid_shipping,0.0,1.0,MLB-DECORATIVE_VINYLS,168198.0,MLB,,,2021-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16544155,129187,8,22057.000000,ARG,classic,drop_off,free_shipping,0.0,0.0,MLA-RADIO_BASE_STATIONS,408559.0,MLA,,,2021-04-30
16544156,6707,30,26999.000000,ARG,classic,cross_docking,free_shipping,0.0,0.0,MLA-PRINTERS,193784.0,MLA,MLA15159034,MLA15159032,2021-04-30
16544157,170355,3,3400.000000,ARG,classic,drop_off,paid_shipping,0.0,0.0,MLA-WRISTWATCHES,110276.0,MLA,,,2021-04-30
16544158,246568,2,6289.000000,ARG,classic,fulfillment,free_shipping,0.0,0.0,MLA-HARD_DRIVES_AND_SSDS,456892.0,MLA,MLA15697725,MLA15697724,2021-04-30


***
## saving the results

In [52]:
def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes
    
    Parameters
    ----------
    df: pandas.Dataframe
    verbose: Boolean
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [53]:
train = reduce_mem_usage(train, verbose=True)
train.to_parquet("../data/train.parquet", index=False)

Mem. usage decreased to 3607.20 Mb (0.0% reduction)


In [54]:
test = reduce_mem_usage(test, verbose=True)
test.to_parquet("../data/test.parquet", index=False)

Mem. usage decreased to 1483.11 Mb (0.0% reduction)


***