In [4]:
import numpy as np
import pandas as pd
import scipy as sp
%matplotlib widget
import matplotlib.pyplot as plt


Se necesita de la dependencia pyarrow. (pip install pyarrow)

In [5]:
data_train = pd.read_parquet("Data/train_data.parquet",engine="pyarrow")
data_train['date'] =  pd.to_datetime(data_train['date'])


Exploramos data

In [6]:
print("Número de datos:",data_train.size)
print("Stock elements:" ,data_train.sku.unique().size)
data_train.sample(10)

Número de datos: 338942511
Stock elements: 660916


Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active
12226674,392709,2021-03-30,0,34.99,REA,classic,fulfillment,paid_shipping,0.0
12468538,617723,2021-03-21,0,23.66,REA,classic,cross_docking,paid_shipping,1440.0
29107438,639094,2021-02-07,0,426.51,MEX,premium,fulfillment,free_shipping,0.0
14959376,139685,2021-02-20,0,2.99,REA,classic,fulfillment,paid_shipping,1440.0
17640994,60108,2021-02-03,0,99.0,REA,classic,cross_docking,free_shipping,1440.0
5282511,309165,2021-02-09,0,119.99,REA,premium,fulfillment,free_shipping,1440.0
12496350,620234,2021-02-23,0,7.63,REA,classic,fulfillment,paid_shipping,0.0
12946239,290358,2021-03-05,0,8.41,REA,classic,fulfillment,paid_shipping,1440.0
4610160,61356,2021-02-20,0,139.0,REA,premium,fulfillment,free_shipping,0.0
10470378,613194,2021-02-23,0,599.0,REA,classic,fulfillment,free_shipping,1440.0


## Attributes 	Description
**date** : ranges from 1-2-21 to 31-3-21  
**sold_quantity** : 	number of units of the corresponding SKU that were sold on that particular date.  
**current_price** 	currency in which the price is expressed.  
**currency** 	point in time correct listing price.  
**listing_type** 	type of listing the SKU had for that particular date. Possible values are classic or premium and they relate to the exposure the items receive and the fee charged to the seller as a sales comission.  
**shipping_logistic_type** 	type of shipping method the SKU offered, for that particular date. Possible values are fulfillment, cross_docking and drop_off.  
**shipping_payment** 	whether the shipping for the offered SKU at that particular date was free or paid, from the buyer's perspective.  
**minutes_active** 	number of minutes the SKU was available for purchase on that particular date.  

## One `sku` analysis

In [7]:
filt_sku = data_train[data_train.sku == 464801 ]
filt_sku.head()

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.0
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.0
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.0
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.0
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.0


In [8]:
fig, ax = plt.subplots(figsize = (10,10))
ax.plot_date(filt_sku.date,filt_sku.sold_quantity,
             **dict(marker="o",linestyle=":",alpha=.8,mfc="none"))
ax.set_title("sku 464801")


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  ax.plot_date(filt_sku.date,filt_sku.sold_quantity,


Text(0.5, 1.0, 'sku 464801')

In [9]:
# -- what is the range of days for this sku?
T_sku464801 = filt_sku.date.max() - filt_sku.date.min()
print(f"Period is {T_sku464801.days} days")

# -- what is the total amount of sales for this sku?
tot_sold_sku464801 = filt_sku["sold_quantity"].sum()

print(f"This sku as a total of {tot_sold_sku464801} sales")

Period is 58 days
This sku as a total of 15 sales


## Analysis by `sku`

In [10]:
# -- Do we have the same amount of data by days for every sku?
#    Let's check the sku vs T[days] distribution

# tt = train_raw_DF.groupby(by=["sku","date"])
maxt = data_train.loc[:,["sku","date"]].groupby(by=["sku"]).max()
mint = data_train.loc[:,["sku","date"]].groupby(by=["sku"]).min()
T = maxt - mint

# -- convert T to int.
T.date = T.date.dt.days


In [11]:
_fs = 16
bin_edges = [0,5,10,15,20,25,30,35,40,45,50,55,56,57,58,59,60,70]
plt.hist(x=T.date,bins=bin_edges,
         histtype="step",
         **dict(lw=2,alpha=.8))
plt.xlabel("T [days]",fontsize=_fs)
plt.ylabel("Count sku",fontsize=_fs)
plt.yscale("log")
plt.ylim((1e3,1e6))
plt.grid()
plt.show()

## Simple correlation analysis

There exists a correlation dependence on the number of elements sold?  
We calculate the correlations by `sku`. Since there are roughly 660000 sku, a correlation matrix would be of size 660000^2 elements, not practical. What to do?

In [12]:
sku_choices = np.random.choice(len(data_train.sku.unique()),2) #would Chooose to sku at random

In [13]:
x = data_train[data_train.sku == sku_choices[0]]
y = data_train[data_train.sku == sku_choices[1]]

In [14]:
corr = np.correlate(x.sold_quantity,y.sold_quantity,mode = "full") # Correlate, this mode is not a simple correlation but a convolution.

In [15]:
fig,ax = plt.subplots(1)
days = np.arange(-x.date.size,x.date.size-1)
ax.plot(days,corr)
ax.set_ylabel("correlation")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0, 0.5, 'correlation')

In [16]:
sku_list = data_train.sku.unique()
sku_pivot = data_train.pivot(index="sku",columns="date",values="sold_quantity")