In [1]:
import numpy as np
import pandas as pd
import jsonlines
from tqdm import tqdm

***

In [2]:
# Load the raw daily records; every row that exists in the source file is
# flagged with available=1 (rows synthesised later for missing days get 0).
train = (
    pd.read_parquet("../data/train_data.parquet")
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"]),
        available=1,
    )
)
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767,1
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333,1
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633,1
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667,1


In [3]:
# Give every SKU exactly one row per calendar day of the training window.
# Days with no record in the raw data get sold_quantity=0 and available=0;
# the descriptive columns are copied from the closest known day of the same
# SKU (previous day first, then the following day for gaps at the start).
date_range = pd.DataFrame(pd.date_range("2021-02-01", "2021-03-31"), columns=["date"])
all_dfs = list()

# NOTE(review): minutes_active is carried over onto days flagged available=0 —
# confirm this is intended rather than filling those days with 0.
cols_to_fill = [
    "sku", "current_price", "currency", "listing_type", 
    "shipping_logistic_type", "shipping_payment", "minutes_active"
]

for sku,df in tqdm(train.groupby("sku")):
    # Left merge against the full calendar: days absent from this SKU's
    # records appear as rows whose non-date columns are all NaN.
    df = pd.merge(date_range, df, how="left", on="date")
    df["sold_quantity"] = df["sold_quantity"].fillna(0).astype(int)
    df["available"] = df["available"].fillna(0).astype(int)
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    df[cols_to_fill] = df[cols_to_fill].ffill().bfill()
    # The merge turned sku into float (NaN on the new rows); restore int.
    df["sku"] = df["sku"].astype(int)
    all_dfs.append(df)

train = pd.concat(all_dfs, ignore_index=True)
train

100%|██████████| 660916/660916 [1:16:58<00:00, 143.09it/s] 


Unnamed: 0,date,sku,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available
0,2021-02-01,0,0,172.79,REA,classic,cross_docking,free_shipping,0.00,1
1,2021-02-02,0,0,172.79,REA,classic,cross_docking,free_shipping,0.00,1
2,2021-02-03,0,0,179.99,REA,classic,cross_docking,free_shipping,872.65,1
3,2021-02-04,0,0,179.99,REA,classic,cross_docking,free_shipping,1440.00,1
4,2021-02-05,0,0,179.99,REA,classic,cross_docking,free_shipping,1440.00,1
...,...,...,...,...,...,...,...,...,...,...
38994039,2021-03-27,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0.00,1
38994040,2021-03-28,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0.00,1
38994041,2021-03-29,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0.00,1
38994042,2021-03-30,660915,0,99.99,MEX,classic,fulfillment,paid_shipping,0.00,1


In [4]:
# Read the static item metadata (one JSON object per line) into a dataframe,
# leaving out the free-text item_title column.
with jsonlines.open('../data/items_static_metadata_full.jl') as reader:
    all_records = [record for record in tqdm(reader)]

metadata = pd.DataFrame(all_records).drop(columns=["item_title"])
metadata

660916it [00:04, 142379.75it/s]


Unnamed: 0,item_domain_id,item_id,site_id,sku,product_id,product_family_id
0,MLB-SNEAKERS,492155,MLB,0,,MLB15832732
1,MLB-SURFBOARD_RACKS,300279,MLB,1,,
2,MLM-NECKLACES,69847,MLM,2,,
3,MLM-RINGS,298603,MLM,3,,
4,MLB-WEBCAMS,345949,MLB,4,,
...,...,...,...,...,...,...
660911,MLB-CELLPHONE_PARTS,320792,MLB,660911,,
660912,MLM-AUTOMOTIVE_EMBLEMS,90441,MLM,660912,,
660913,MLB-SOFA_AND_FUTON_COVERS,202580,MLB,660913,,
660914,MLB-SNEAKERS,490874,MLB,660914,,


In [5]:
# Attach the static item metadata to every daily row; the inner join keeps
# only SKUs present in both frames.
dataset = train.merge(metadata, how="inner", on="sku")
dataset

Unnamed: 0,date,sku,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available,item_domain_id,item_id,site_id,product_id,product_family_id
0,2021-02-01,0,0,172.79,REA,classic,cross_docking,free_shipping,0.00,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
1,2021-02-02,0,0,172.79,REA,classic,cross_docking,free_shipping,0.00,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
2,2021-02-03,0,0,179.99,REA,classic,cross_docking,free_shipping,872.65,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
3,2021-02-04,0,0,179.99,REA,classic,cross_docking,free_shipping,1440.00,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
4,2021-02-05,0,0,179.99,REA,classic,cross_docking,free_shipping,1440.00,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38994039,2021-03-27,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0.00,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994040,2021-03-28,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0.00,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994041,2021-03-29,660915,0,79.99,MEX,classic,fulfillment,paid_shipping,0.00,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994042,2021-03-30,660915,0,99.99,MEX,classic,fulfillment,paid_shipping,0.00,1,MLM-SURGICAL_MASKS,423179,MLM,,


In [6]:
# Print the column dtypes and memory footprint of the merged frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38994044 entries, 0 to 38994043
Data columns (total 15 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   date                    datetime64[ns]
 1   sku                     int64         
 2   sold_quantity           int64         
 3   current_price           float64       
 4   currency                object        
 5   listing_type            object        
 6   shipping_logistic_type  object        
 7   shipping_payment        object        
 8   minutes_active          float64       
 9   available               int64         
 10  item_domain_id          object        
 11  item_id                 int64         
 12  site_id                 object        
 13  product_id              object        
 14  product_family_id       object        
dtypes: datetime64[ns](1), float64(2), int64(4), object(8)
memory usage: 4.6+ GB


***

In [7]:
def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes

    Columns of dtype int16/int32/int64 are cast to the narrowest signed
    integer type (int8, int16, int32, int64) whose range contains the
    column's minimum and maximum, bounds included. float64 columns are cast
    to float32 when their minimum and maximum fit the float32 range (this
    reduces precision). Columns of any other dtype, and float columns that
    are already 4 bytes wide or narrower, are left untouched.

    The dataframe is modified in place and is also returned.

    Parameters
    ----------
    df: pandas.Dataframe
    verbose: Boolean
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        # min()/max() skip NaN; on an empty or all-NaN column they return NaN,
        # every comparison below is then False and the column is left as is.
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first, so the first fit wins.
            # Bounds are inclusive: a column whose max is exactly 127 fits int8.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > np.dtype(np.float32).itemsize:
            # Only narrow 8-byte floats; float16/float32 columns are never
            # widened by a function whose purpose is to save memory.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [8]:
# Downcast numeric columns (reduce_mem_usage modifies the frame in place and
# returns it), then persist the result without writing the index.
dataset = reduce_mem_usage(dataset, verbose=True)
dataset.to_parquet("../data/dataset.parquet", index=False)

Mem. usage decreased to 3681.57 Mb (22.7% reduction)


***