In [1]:
import numpy as np
import pandas as pd
import jsonlines
from tqdm import tqdm

***
## transactional data

In [2]:
# Show which files are available under ../data before loading anything.
!ls ../data

dataset.parquet               test_data.csv
items_static_metadata_full.jl train_data.parquet
sample_submission.csv.gz


In [3]:
# Load the training table from parquet. The bare `data` on the last line makes
# the notebook render the frame as the cell output.
data = pd.read_parquet("../data/train_data.parquet")
data

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.000000
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.000000
...,...,...,...,...,...,...,...,...,...
37660274,129187,2021-03-31,0,22057.00,ARG,classic,drop_off,free_shipping,267.710767
37660275,6707,2021-03-31,0,26999.00,ARG,classic,cross_docking,free_shipping,266.083333
37660276,170355,2021-03-31,0,3400.00,ARG,classic,drop_off,paid_shipping,0.252633
37660277,246568,2021-03-31,0,6289.00,ARG,classic,fulfillment,free_shipping,135.416667


In [4]:
# Distinct SKUs in the training data, expressed in thousands.
data["sku"].nunique() / 1e3

660.916

In [5]:
# Currency mix across SKUs, in thousands. Only one row per sku is kept
# (the first one encountered), so each SKU is counted once.
(
    data.drop_duplicates(subset=["sku"])["currency"]
    .value_counts()
    .div(1e3)
)

REA    336.334
MEX    265.908
ARG     58.670
DOL      0.004
Name: currency, dtype: float64

In [6]:
# Listing-type mix across SKUs, in thousands. Only one row per sku is kept
# (the first one encountered), so each SKU is counted once.
(
    data.drop_duplicates(subset=["sku"])["listing_type"]
    .value_counts()
    .div(1e3)
)

premium    332.767
classic    328.149
Name: listing_type, dtype: float64

In [7]:
# Shipping-logistic-type mix across SKUs, in thousands. Only one row per sku is
# kept (the first one encountered), so each SKU is counted once.
(
    data.drop_duplicates(subset=["sku"])["shipping_logistic_type"]
    .value_counts()
    .div(1e3)
)

fulfillment      489.501
drop_off         104.131
cross_docking     67.284
Name: shipping_logistic_type, dtype: float64

In [8]:
# Shipping-payment mix across SKUs, in thousands. Only one row per sku is kept
# (the first one encountered), so each SKU is counted once.
(
    data.drop_duplicates(subset=["sku"])["shipping_payment"]
    .value_counts()
    .div(1e3)
)

paid_shipping    438.793
free_shipping    222.123
Name: shipping_payment, dtype: float64

In [9]:
# Number of non-null sku entries recorded on each date.
data["sku"].groupby(data["date"]).count()

date
2021-02-01    604230
2021-02-02    606383
2021-02-03    608270
2021-02-04    610721
2021-02-05    612461
2021-02-06    613405
2021-02-07    613950
2021-02-08    615477
2021-02-09    616976
2021-02-10    618612
2021-02-11    620325
2021-02-12    621937
2021-02-13    622608
2021-02-14    622964
2021-02-15    624165
2021-02-16    625438
2021-02-17    626922
2021-02-18    628333
2021-02-19    629555
2021-02-20    630067
2021-02-21    630470
2021-02-22    631777
2021-02-23    633111
2021-02-24    634601
2021-02-25    636062
2021-02-26    637263
2021-02-27    637768
2021-02-28    638166
2021-03-01    639351
2021-03-02    640562
2021-03-03    641856
2021-03-04    643137
2021-03-05    644620
2021-03-06    645072
2021-03-07    645432
2021-03-08    646451
2021-03-09    647382
2021-03-10    648410
2021-03-11    649447
2021-03-12    650309
2021-03-13    650732
2021-03-14    651006
2021-03-15    651728
2021-03-16    652524
2021-03-17    653317
2021-03-18    654154
2021-03-19    654845
2021-03-

In [10]:
# average number of rows per sku (total rows / distinct skus)
data.shape[0] / data["sku"].nunique()

56.9819447554606

In [11]:
# Distribution of how many dated rows each sku has.
data["date"].groupby(data["sku"]).count().describe()

count    660916.000000
mean         56.981945
std           8.094631
min           1.000000
25%          59.000000
50%          59.000000
75%          59.000000
max          59.000000
Name: date, dtype: float64

In [14]:
# Distribution of the per-sku mean of sold_quantity (averaged over that sku's rows).
data["sold_quantity"].groupby(data["sku"]).mean().describe()

count    660916.000000
mean          0.983020
std           6.988492
min           0.000000
25%           0.203390
50%           0.338983
75%           0.711864
max        2330.983051
Name: sold_quantity, dtype: float64

In [15]:
# Distribution of how many dated rows each sku has.
# NOTE(review): an earlier cell already computes this exact summary; this cell
# looks like a leftover duplicate and is a candidate for removal.
data.groupby("sku").date.count().describe()

count    660916.000000
mean         56.981945
std           8.094631
min           1.000000
25%          59.000000
50%          59.000000
75%          59.000000
max          59.000000
Name: date, dtype: float64

In [16]:
# fraction of null values per column (0.0 = no nulls, 1.0 = all null)
data.isna().mean()

sku                       0.0
date                      0.0
sold_quantity             0.0
current_price             0.0
currency                  0.0
listing_type              0.0
shipping_logistic_type    0.0
shipping_payment          0.0
minutes_active            0.0
dtype: float64

***
## test data

In [17]:
# Load the test file; later cells read its `sku` and `target_stock` columns.
test = pd.read_csv("../data/test_data.csv")

In [18]:
# Ratio of distinct test SKUs to distinct training SKUs.
test["sku"].nunique() / data["sku"].nunique()

0.8344055825551204

In [19]:
# True when every test sku also appears in the training data AND the training
# data has at least one sku that is absent from test (strict superset).
set(data["sku"]) > set(test["sku"])

True

In [20]:
# Summary statistics of the target_stock column in the test file.
test["target_stock"].describe()

count    551472.000000
mean         18.069472
std         122.711151
min           1.000000
25%           3.000000
50%           6.000000
75%          13.000000
max       32710.000000
Name: target_stock, dtype: float64

In [21]:
# Rows per sku, restricted to training rows whose sku appears in the test file.
(
    data[data["sku"].isin(test["sku"])]
    .groupby(["sku"])["date"]
    .count()
    .describe()
)

count    551472.000000
mean         56.728949
std           8.642053
min           1.000000
25%          59.000000
50%          59.000000
75%          59.000000
max          59.000000
Name: date, dtype: float64

In [22]:
# Total sold_quantity per sku, restricted to training rows whose sku appears in
# the test file.
(
    data[data["sku"].isin(test["sku"])]
    .groupby(["sku"])["sold_quantity"]
    .sum()
    .describe()
)

count    551472.000000
mean         57.920747
std         442.711305
min           0.000000
25%          10.000000
50%          19.000000
75%          40.000000
max      137528.000000
Name: sold_quantity, dtype: float64

***
## SKU metadata

In [23]:
# Read every record of the JSON-lines metadata file into memory; tqdm wraps the
# reader so a running record counter is shown while loading.
all_records = []

with jsonlines.open('../data/items_static_metadata_full.jl') as reader:
    all_records.extend(tqdm(reader))

660916it [00:03, 194753.41it/s]


In [24]:
# Build a frame with one row per loaded metadata record. The bare `metadata`
# on the last line makes the notebook render the frame as the cell output.
metadata = pd.DataFrame(all_records)
metadata

Unnamed: 0,item_domain_id,item_id,item_title,site_id,sku,product_id,product_family_id
0,MLB-SNEAKERS,492155,Tênis Masculino Olympikus Cyber Barato Promoçao,MLB,0,,MLB15832732
1,MLB-SURFBOARD_RACKS,300279,Suporte Rack Prancha Parede C/ Regulagem Horiz...,MLB,1,,
2,MLM-NECKLACES,69847,5 Collares Plateados Dama Gargantilla Choker -...,MLM,2,,
3,MLM-RINGS,298603,Lindo Anillo De Bella Crepusculo Twilight Prom...,MLM,3,,
4,MLB-WEBCAMS,345949,Webcam Com Microfone Hd 720p Knup Youtube Pc V...,MLB,4,,
...,...,...,...,...,...,...,...
660911,MLB-CELLPHONE_PARTS,320792,2 Parafuso Pentalobe Torx iPhone 5 5c 5s 6 6s ...,MLB,660911,,
660912,MLM-AUTOMOTIVE_EMBLEMS,90441,Emblema Circular Wolfsburg Edition 5cm,MLM,660912,,
660913,MLB-SOFA_AND_FUTON_COVERS,202580,Kit Capa De Sofá Coladinha Jogo 2e3 Lugares Ma...,MLB,660913,,
660914,MLB-SNEAKERS,490874,Calçado Sapatênis Clássico Baltic 234 Ollie,MLB,660914,,


In [25]:
# fraction of null values per column (0.0 = no nulls, 1.0 = all null)
metadata.isna().mean()

item_domain_id       0.000005
item_id              0.000000
item_title           0.000000
site_id              0.000000
sku                  0.000000
product_id           0.955111
product_family_id    0.874138
dtype: float64

In [26]:
# number of distinct values per column (nunique skips nulls by default)
metadata.nunique()

item_domain_id         8408
item_id              517896
item_title           478175
site_id                   3
sku                  660916
product_id            15863
product_family_id     29600
dtype: int64

***