***
## notebook config

In [1]:
ON_COLAB = False

In [2]:
if ON_COLAB:
    from google.colab import drive
    from google.colab import files
    drive.mount('/content/drive', force_remount=True)
    
    !pip install --upgrade kaggle > /dev/null 2>&1
    !mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
    
    !free -h
    
    !pip install --upgrade category_encoders > /dev/null 2>&1
    !pip install --upgrade jsonlines > /dev/null 2>&1
    !pip install --upgrade pandarallel > /dev/null 2>&1

In [3]:
if ON_COLAB:
    !kaggle datasets download -d mavillan/meli-2021 --force --unzip
    !ls -halt
    input_path = "."
    print("input_path:", input_path)
    output_path = "/content/drive/MyDrive/meli2021/data"
    print("output_path:", output_path)
else:
    input_path = "../data"
    print("input_path:", input_path)
    output_path = "../data"
    print("output_path:", output_path)

input_path: ../data
output_path: ../data


***

In [4]:
import gc
import numpy as np
import pandas as pd
import jsonlines
from tqdm import tqdm
from joblib import Parallel, delayed
import category_encoders as ce
from pandarallel import pandarallel

import sys
sys.path.append("../utils")
from memory import reduce_mem_usage

pd.set_option('display.max_columns', None)
pandarallel.initialize()

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


***
## preparing train data

In [5]:
train = pd.read_parquet(f"{input_path}/train_data.parquet")
train["date"] = pd.to_datetime(train["date"])
train["weekday"] = (train.date.dt.weekday+1).astype(str)
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,weekday
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,1
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,2
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,3
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000,4
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000,5
...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767,3
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333,3
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633,3
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667,3


In [6]:
metadata = pd.read_csv(f"{input_path}/metadata.csv")
metadata

Unnamed: 0,item_domain_id,item_id,site_id,sku,product_id,product_family_id
0,MLB-SNEAKERS,492155,MLB,0,,MLB15832732
1,MLB-SURFBOARD_RACKS,300279,MLB,1,,
2,MLM-NECKLACES,69847,MLM,2,,
3,MLM-RINGS,298603,MLM,3,,
4,MLB-WEBCAMS,345949,MLB,4,,
...,...,...,...,...,...,...
660911,MLB-CELLPHONE_PARTS,320792,MLB,660911,,
660912,MLM-AUTOMOTIVE_EMBLEMS,90441,MLM,660912,,
660913,MLB-SOFA_AND_FUTON_COVERS,202580,MLB,660913,,
660914,MLB-SNEAKERS,490874,MLB,660914,,


In [7]:
df = train[["sku","currency"]].drop_duplicates()

Unnamed: 0,sku,currency
0,464801,REA
59,645793,REA
118,77402,REA
177,58546,REA
236,99516,REA
...,...,...
37660274,129187,ARG
37660275,6707,ARG
37660276,170355,ARG
37660277,246568,ARG


In [15]:
df = df.reset_index(drop=True)

In [10]:
train.sku.nunique()

660916

In [13]:
df[df.duplicated(subset=["sku","currency"])]

Unnamed: 0,sku,currency


In [17]:
df.sku.value_counts()

1169      2
527447    2
464801    1
312677    1
622123    1
         ..
459981    1
6373      1
222746    1
167234    1
49718     1
Name: sku, Length: 660916, dtype: int64

In [18]:
df.query("sku == 1169")

Unnamed: 0,sku,currency
497579,1169,DOL
497580,1169,MEX


In [19]:
df.query("sku == 527447")

Unnamed: 0,sku,currency
503451,527447,MEX
503452,527447,DOL


In [22]:
train.query("currency == 'DOL'").sku.unique()

array([560040, 195517, 105701,   1169, 527447])

In [8]:
train["tmp"] = train.eval("sold_quantity*minutes_active")
q_mean = (train.groupby("sku")["tmp"].sum() / train.groupby("sku")["minutes_active"].sum()).reset_index(name="q_mean")
train = train.merge(q_mean, how="inner", on="sku")
train.drop("tmp", axis=1, inplace=True)

train["tmp"]  = train.eval("minutes_active * (sold_quantity - q_mean)**2")
q_std = (train.groupby("sku")["tmp"].sum() / train.groupby("sku")["minutes_active"].sum()).reset_index(name="q_std")
train = train.merge(q_std, how="inner", on="sku")
train.drop("tmp", axis=1, inplace=True)

In [9]:
# imputation of zero q_mean & q_std
train = pd.merge(train, metadata.loc[:,["sku","item_domain_id"]], how="inner", on="sku")

df_mean_imp = (
    train
    .loc[:,["sku","item_domain_id","q_mean","q_std"]]
    .drop_duplicates()
    .query("q_mean > 0")
    .groupby("item_domain_id")["q_mean"]
    .mean()
    .reset_index(name="q_mean_imp")
)
df_std_imp = (
    train
    .loc[:,["sku","item_domain_id","q_mean","q_std"]]
    .drop_duplicates()
    .query("q_std > 0")
    .groupby("item_domain_id")["q_std"]
    .median()
    .reset_index(name="q_std_imp")
)

train = (
    train
    .merge(df_mean_imp, how="left", on="item_domain_id")
    .merge(df_std_imp, how="left", on="item_domain_id")
)

idx = train[train.q_mean_imp.isna()].index
train.loc[idx, "q_mean_imp"] = df_mean_imp.q_mean_imp.median()
idx = train[train.q_std_imp.isna()].index
train.loc[idx, "q_std_imp"] = df_std_imp.q_std_imp.median()

idx = train[train.q_mean == 0].index
train.loc[idx, "q_mean"] = train.loc[idx, "q_mean_imp"]
idx = train[train.q_std == 0].index
train.loc[idx, "q_std"] = train.loc[idx, "q_std_imp"]

train.drop(["q_mean_imp","q_std_imp","item_domain_id"], axis=1, inplace=True)

In [10]:
train = reduce_mem_usage(train, verbose=True)
gc.collect()

Mem. usage decreased to 3833.42 Mb (19.6% reduction)


51

In [11]:
train.loc[:,["currency","listing_type","shipping_logistic_type","shipping_payment"]].nunique()

currency                  4
listing_type              2
shipping_logistic_type    3
shipping_payment          2
dtype: int64

In [12]:
encoder = ce.OneHotEncoder(cols=["weekday","listing_type","shipping_logistic_type","shipping_payment"], use_cat_names=True)
train = encoder.fit_transform(train)
train

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type_classic,listing_type_premium,shipping_logistic_type_fulfillment,shipping_logistic_type_cross_docking,shipping_logistic_type_drop_off,shipping_payment_free_shipping,shipping_payment_paid_shipping,minutes_active,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,q_mean,q_std
0,464801,2021-02-01,0,156.779999,REA,1,0,1,0,0,1,0,1440.000000,1,0,0,0,0,0,0,0.254237,0.562482
1,464801,2021-02-02,0,156.779999,REA,1,0,1,0,0,1,0,1440.000000,0,1,0,0,0,0,0,0.254237,0.562482
2,464801,2021-02-03,0,156.779999,REA,1,0,1,0,0,1,0,1440.000000,0,0,1,0,0,0,0,0.254237,0.562482
3,464801,2021-02-04,0,156.779999,REA,1,0,1,0,0,1,0,1440.000000,0,0,0,1,0,0,0,0.254237,0.562482
4,464801,2021-02-05,1,156.779999,REA,1,0,1,0,0,1,0,1440.000000,0,0,0,0,1,0,0,0.254237,0.562482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.000000,ARG,1,0,0,0,1,1,0,267.710754,0,0,1,0,0,0,0,0.763131,0.698453
37660275,6707,2021-03-31,0,26999.000000,ARG,1,0,0,1,0,1,0,266.083344,0,0,1,0,0,0,0,2.063710,1.428639
37660276,170355,2021-03-31,0,3400.000000,ARG,1,0,0,0,1,0,1,0.252633,0,0,1,0,0,0,0,0.813597,0.442502
37660277,246568,2021-03-31,0,6289.000000,ARG,1,0,1,0,0,1,0,135.416672,0,0,1,0,0,0,0,2.432324,1.715082


In [13]:
test = pd.read_csv(f"{input_path}/test_data.csv")
train = pd.merge(train, test, how="inner", on="sku")
train = reduce_mem_usage(train, verbose=True)
gc.collect()
train

Mem. usage decreased to 1849.78 Mb (62.7% reduction)


Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type_classic,listing_type_premium,shipping_logistic_type_fulfillment,shipping_logistic_type_cross_docking,shipping_logistic_type_drop_off,shipping_payment_free_shipping,shipping_payment_paid_shipping,minutes_active,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,q_mean,q_std,target_stock
0,464801,2021-02-01,0,156.779999,REA,1,0,1,0,0,1,0,1440.000000,1,0,0,0,0,0,0,0.254237,0.562482,3
1,464801,2021-02-02,0,156.779999,REA,1,0,1,0,0,1,0,1440.000000,0,1,0,0,0,0,0,0.254237,0.562482,3
2,464801,2021-02-03,0,156.779999,REA,1,0,1,0,0,1,0,1440.000000,0,0,1,0,0,0,0,0.254237,0.562482,3
3,464801,2021-02-04,0,156.779999,REA,1,0,1,0,0,1,0,1440.000000,0,0,0,1,0,0,0,0.254237,0.562482,3
4,464801,2021-02-05,1,156.779999,REA,1,0,1,0,0,1,0,1440.000000,0,0,0,0,1,0,0,0.254237,0.562482,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31284422,129187,2021-03-31,0,22057.000000,ARG,1,0,0,0,1,1,0,267.710754,0,0,1,0,0,0,0,0.763131,0.698453,8
31284423,6707,2021-03-31,0,26999.000000,ARG,1,0,0,1,0,1,0,266.083344,0,0,1,0,0,0,0,2.063710,1.428639,30
31284424,170355,2021-03-31,0,3400.000000,ARG,1,0,0,0,1,0,1,0.252633,0,0,1,0,0,0,0,0.813597,0.442502,3
31284425,246568,2021-03-31,0,6289.000000,ARG,1,0,1,0,0,1,0,135.416672,0,0,1,0,0,0,0,2.432324,1.715082,2


In [14]:
cols_sum = [
    "listing_type_classic",
    "listing_type_premium",
    "shipping_logistic_type_fulfillment",
    "shipping_logistic_type_cross_docking",
    "shipping_logistic_type_drop_off",
    "shipping_payment_free_shipping",
    "shipping_payment_paid_shipping",
    "weekday_1",
    "weekday_2",
    "weekday_3",
    "weekday_4",
    "weekday_5",
    "weekday_6",
    "weekday_7",    
]
cols_mean = [
    "minutes_active",
]

def compute_features(df):
    target_stock = df.target_stock.values[0]
    start_index = df.index.min()
    end_index = df.index.max()

    if len(df) < 30:
        return None

    inventory_days = list()
    sum_values = list()
    mean_values = list()

    for i in df.index:
        if i+29 > end_index: break

        df_slice = df.loc[i:i+29]
        cumsum = df_slice.sold_quantity.values.cumsum()
        idxs = np.argwhere(cumsum >= target_stock)
        if len(idxs)==0:
            idays = np.inf
        else:
            idays = np.min(idxs) + 1

        inventory_days.append(idays)
        sum_values.append(df_slice[cols_sum].sum(axis=0).values)
        mean_values.append(df_slice[cols_mean].mean(axis=0).values)

    df = df.head(len(inventory_days)).copy()
    df["inventory_days"] = inventory_days
    df[cols_sum] = np.asarray(sum_values)
    df[cols_mean] = np.asarray(mean_values)
    
    return df

with Parallel(n_jobs=6) as parallel:
    delayed_func = delayed(compute_features)
    all_dfs = parallel(delayed_func(df) for _,df in tqdm(train.groupby("sku")))

100%|██████████| 551472/551472 [1:57:17<00:00, 78.36it/s]  


In [15]:
train = pd.concat(filter(lambda x: x is not None, all_dfs), ignore_index=True)
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type_classic,listing_type_premium,shipping_logistic_type_fulfillment,shipping_logistic_type_cross_docking,shipping_logistic_type_drop_off,shipping_payment_free_shipping,shipping_payment_paid_shipping,minutes_active,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,q_mean,q_std,target_stock,inventory_days
0,0,2021-02-01,0,172.789993,REA,30,0,0,30,0,30,0,1325.088257,5,5,4,4,4,4,4,0.215558,0.482631,1,inf
1,0,2021-02-02,0,172.789993,REA,30,0,0,30,0,30,0,1373.088257,4,5,5,4,4,4,4,0.215558,0.482631,1,inf
2,0,2021-02-03,0,179.990005,REA,30,0,0,30,0,30,0,1421.088257,4,4,5,5,4,4,4,0.215558,0.482631,1,inf
3,0,2021-02-04,0,179.990005,REA,30,0,0,30,0,30,0,1440.000000,4,4,4,5,5,4,4,0.215558,0.482631,1,inf
4,0,2021-02-05,0,179.990005,REA,30,0,0,30,0,30,0,1440.000000,4,4,4,4,5,5,4,0.215558,0.482631,1,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15524803,660914,2021-02-26,1,59.990002,REA,0,30,30,0,0,0,30,1440.000000,4,4,4,4,5,5,4,0.423169,0.759259,5,8.0
15524804,660914,2021-02-27,0,59.990002,REA,0,30,30,0,0,0,30,1440.000000,4,4,4,4,4,5,5,0.423169,0.759259,5,9.0
15524805,660914,2021-02-28,1,59.990002,REA,0,30,30,0,0,0,30,1440.000000,5,4,4,4,4,4,5,0.423169,0.759259,5,8.0
15524806,660914,2021-03-01,0,59.990002,REA,0,30,30,0,0,0,30,1440.000000,5,5,4,4,4,4,4,0.423169,0.759259,5,9.0


In [16]:
train = pd.merge(train, metadata, how="inner", on="sku")
train

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type_classic,listing_type_premium,shipping_logistic_type_fulfillment,shipping_logistic_type_cross_docking,shipping_logistic_type_drop_off,shipping_payment_free_shipping,shipping_payment_paid_shipping,minutes_active,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,q_mean,q_std,target_stock,inventory_days,item_domain_id,item_id,site_id,product_id,product_family_id
0,0,2021-02-01,0,172.789993,REA,30,0,0,30,0,30,0,1325.088257,5,5,4,4,4,4,4,0.215558,0.482631,1,inf,MLB-SNEAKERS,492155,MLB,,MLB15832732
1,0,2021-02-02,0,172.789993,REA,30,0,0,30,0,30,0,1373.088257,4,5,5,4,4,4,4,0.215558,0.482631,1,inf,MLB-SNEAKERS,492155,MLB,,MLB15832732
2,0,2021-02-03,0,179.990005,REA,30,0,0,30,0,30,0,1421.088257,4,4,5,5,4,4,4,0.215558,0.482631,1,inf,MLB-SNEAKERS,492155,MLB,,MLB15832732
3,0,2021-02-04,0,179.990005,REA,30,0,0,30,0,30,0,1440.000000,4,4,4,5,5,4,4,0.215558,0.482631,1,inf,MLB-SNEAKERS,492155,MLB,,MLB15832732
4,0,2021-02-05,0,179.990005,REA,30,0,0,30,0,30,0,1440.000000,4,4,4,4,5,5,4,0.215558,0.482631,1,inf,MLB-SNEAKERS,492155,MLB,,MLB15832732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15524803,660914,2021-02-26,1,59.990002,REA,0,30,30,0,0,0,30,1440.000000,4,4,4,4,5,5,4,0.423169,0.759259,5,8.0,MLB-SNEAKERS,490874,MLB,,
15524804,660914,2021-02-27,0,59.990002,REA,0,30,30,0,0,0,30,1440.000000,4,4,4,4,4,5,5,0.423169,0.759259,5,9.0,MLB-SNEAKERS,490874,MLB,,
15524805,660914,2021-02-28,1,59.990002,REA,0,30,30,0,0,0,30,1440.000000,5,4,4,4,4,4,5,0.423169,0.759259,5,8.0,MLB-SNEAKERS,490874,MLB,,
15524806,660914,2021-03-01,0,59.990002,REA,0,30,30,0,0,0,30,1440.000000,5,5,4,4,4,4,4,0.423169,0.759259,5,9.0,MLB-SNEAKERS,490874,MLB,,


In [17]:
train = reduce_mem_usage(train, verbose=True)
train.to_parquet("../data/train-m2.parquet", index=False)

Mem. usage decreased to 1569.39 Mb (49.0% reduction)


***
## preparing test data

In [5]:
train = pd.read_parquet(f"{input_path}/train-m2.parquet")
metadata = pd.read_csv(f"{input_path}/metadata.csv")
print(train.sku.nunique())

531832


In [6]:
# we know that all the skus reach the stock out in the time 30 days
train = train.query("inventory_days <= 30").reset_index(drop=True)
train.sku.nunique()

441834

In [7]:
currency_map = train[["sku","currency"]].drop_duplicates()
currency_map[currency_map.duplicated(subset=["sku"])]
currency_map.query("sku == 1169")

Unnamed: 0,sku,currency
18980,1169,DOL
18982,1169,MEX


In [8]:
cols_to_agg = [
    "current_price",
    "listing_type_classic",
    "listing_type_premium",
    "shipping_logistic_type_fulfillment",
    "shipping_logistic_type_cross_docking",
    "shipping_logistic_type_drop_off",
    "shipping_payment_free_shipping",
    "shipping_payment_paid_shipping",
    "minutes_active",
    "q_mean",
    "q_std",
    "target_stock",  
]
# todo: weighted mean giving more weight to more recent observations
test = train.groupby("sku")[cols_to_agg].mean().reset_index()
test

Unnamed: 0,sku,current_price,listing_type_classic,listing_type_premium,shipping_logistic_type_fulfillment,shipping_logistic_type_cross_docking,shipping_logistic_type_drop_off,shipping_payment_free_shipping,shipping_payment_paid_shipping,minutes_active,q_mean,q_std,target_stock
0,0,179.990005,30.0,0.0,10.0,20.0,0.0,30.000000,0.000000,1172.370605,0.215558,0.482631,1.0
1,1,135.899994,0.0,30.0,30.0,0.0,0.0,30.000000,0.000000,1057.419678,0.763534,0.945334,13.0
2,3,399.000000,0.0,30.0,30.0,0.0,0.0,30.000000,0.000000,594.850708,0.316329,0.216265,3.0
3,5,580.176636,0.0,30.0,30.0,0.0,0.0,30.000000,0.000000,1440.000000,0.453772,0.429372,12.0
4,6,490.000000,30.0,0.0,30.0,0.0,0.0,30.000000,0.000000,963.612732,0.619325,0.633861,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
441829,660908,28.726841,0.0,30.0,30.0,0.0,0.0,0.000000,30.000000,1362.611938,1.619959,1.634653,29.0
441830,660909,245.000000,30.0,0.0,30.0,0.0,0.0,0.000000,30.000000,1440.000000,0.372881,0.471129,2.0
441831,660911,13.144333,30.0,0.0,30.0,0.0,0.0,0.000000,30.000000,1041.214478,0.385074,0.487071,4.0
441832,660912,99.000000,30.0,0.0,30.0,0.0,0.0,0.000000,30.000000,1233.540894,0.254217,0.534028,2.0


In [9]:
days_count = {
    "weekday_1":0,
    "weekday_2":0,
    "weekday_3":0,
    "weekday_4":0,
    "weekday_5":0,
    "weekday_6":0,
    "weekday_7":0,
}

for dt in pd.date_range("2021-04-01", "2021-04-30"):
    weekday = dt.weekday()+1
    days_count[f"weekday_{weekday}"] += 1
display(days_count)

for col,value in days_count.items():
    test[col] = value

{'weekday_1': 4,
 'weekday_2': 4,
 'weekday_3': 4,
 'weekday_4': 5,
 'weekday_5': 5,
 'weekday_6': 4,
 'weekday_7': 4}

In [10]:
test = pd.merge(test, metadata, how="inner", on="sku")
test

Unnamed: 0,sku,current_price,listing_type_classic,listing_type_premium,shipping_logistic_type_fulfillment,shipping_logistic_type_cross_docking,shipping_logistic_type_drop_off,shipping_payment_free_shipping,shipping_payment_paid_shipping,minutes_active,q_mean,q_std,target_stock,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,item_domain_id,item_id,site_id,product_id,product_family_id
0,0,179.990005,30.0,0.0,10.0,20.0,0.0,30.000000,0.000000,1172.370605,0.215558,0.482631,1.0,4,4,4,5,5,4,4,MLB-SNEAKERS,492155,MLB,,MLB15832732
1,1,135.899994,0.0,30.0,30.0,0.0,0.0,30.000000,0.000000,1057.419678,0.763534,0.945334,13.0,4,4,4,5,5,4,4,MLB-SURFBOARD_RACKS,300279,MLB,,
2,3,399.000000,0.0,30.0,30.0,0.0,0.0,30.000000,0.000000,594.850708,0.316329,0.216265,3.0,4,4,4,5,5,4,4,MLM-RINGS,298603,MLM,,
3,5,580.176636,0.0,30.0,30.0,0.0,0.0,30.000000,0.000000,1440.000000,0.453772,0.429372,12.0,4,4,4,5,5,4,4,MLM-RADIO_FREQUENCY_MACHINES,124265,MLM,,
4,6,490.000000,30.0,0.0,30.0,0.0,0.0,30.000000,0.000000,963.612732,0.619325,0.633861,3.0,4,4,4,5,5,4,4,MLM-TABLET_CASES,345180,MLM,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441829,660908,28.726841,0.0,30.0,30.0,0.0,0.0,0.000000,30.000000,1362.611938,1.619959,1.634653,29.0,4,4,4,5,5,4,4,MLB-DATA_CABLES_AND_ADAPTERS,497085,MLB,,
441830,660909,245.000000,30.0,0.0,30.0,0.0,0.0,0.000000,30.000000,1440.000000,0.372881,0.471129,2.0,4,4,4,5,5,4,4,MLA-PENCILS,430327,MLA,,
441831,660911,13.144333,30.0,0.0,30.0,0.0,0.0,0.000000,30.000000,1041.214478,0.385074,0.487071,4.0,4,4,4,5,5,4,4,MLB-CELLPHONE_PARTS,320792,MLB,,
441832,660912,99.000000,30.0,0.0,30.0,0.0,0.0,0.000000,30.000000,1233.540894,0.254217,0.534028,2.0,4,4,4,5,5,4,4,MLM-AUTOMOTIVE_EMBLEMS,90441,MLM,,


In [11]:
test = reduce_mem_usage(test, verbose=True)
test.to_parquet("../data/test-m2.parquet", index=False)

Mem. usage decreased to 43.40 Mb (46.4% reduction)


***