In [16]:
import pandas as pd
import dask.dataframe as dd
from dask.delayed import delayed
from zipfile import ZipFile
import hvplot.pandas
import hvplot.dask

In [17]:
def load_zipped_csv(file_path):
    zipped_file = [delayed(pd.read_csv)(file_path)]
    return dd.from_delayed(zipped_file)

## Load the data

In [18]:
df_order_products = load_zipped_csv("../data/order_products__prior.csv.zip")
df_orders = load_zipped_csv("../data/orders.csv.zip")
df_products = load_zipped_csv("../data/products.csv.zip")
df_aisles = load_zipped_csv("../data/aisles.csv.zip")
df_departments = load_zipped_csv("../data/departments.csv.zip")

In [19]:
df_orders = df_orders[df_orders["eval_set"] == "prior"]
df_orders['order_id'] = df_orders['order_id'].astype(int)

df_order_products = df_order_products.groupby("order_id")["product_id"].apply(list).compute().to_frame().reset_index()
df_order_products['order_id'] = df_order_products['order_id'].astype(int)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  df_order_products = df_order_products.groupby("order_id")["product_id"].apply(list).compute().to_frame().reset_index()


In [20]:
df = df_orders.join(df_order_products, on='order_id', rsuffix="_")
# df = df.join(df_products, on='product_id', rsuffix="_")
# df = df.join(df_aisles, on='aisle_id', rsuffix="_")
# df = df.join(df_departments, on='department_id', rsuffix="_")

## Joined dataset metadata

In [21]:
df = df.compute()
df.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_id_,product_id
0,2539329,1,prior,1,2,8,,2702341.0,"[20728, 40338, 30721, 19068, 16798]"
1,2398795,1,prior,2,3,7,15.0,2552868.0,"[43961, 26047, 44372]"
2,473747,1,prior,3,3,12,21.0,504378.0,"[28682, 22825, 23554, 30489, 18340, 9839, 2796..."
3,2254736,1,prior,4,4,7,29.0,2399707.0,[5451]
4,431534,1,prior,5,4,15,28.0,459361.0,"[48287, 33810, 2164, 27086, 31557, 12545]"


In [29]:
n_rows = len(df)
n_rows

3214874

In [30]:
for col in df:
    try:
        print(col, len(df[col].unique()))
    except:
        pass

order_id 3214874
user_id 206209
eval_set 1
order_number 99
order_dow 7
order_hour_of_day 24
days_since_prior_order 32
order_id_ 3021074


## Straight forward data visualization

In [32]:
df_order_hour = df.groupby("order_hour_of_day").order_id.count().reset_index()
df_order_hour['percentage of orders'] = df_order_hour['order_id']/n_rows
df_order_hour.hvplot.bar(x="order_hour_of_day", y="percentage of orders", title="Percentage of orders per hour")

In [33]:
df_ = df.groupby("user_id").days_since_prior_order.mean().reset_index()
df_.hvplot.hist("days_since_prior_order" , title="Average number of days between orders")

In [35]:
df.hvplot.hist("days_since_prior_order" , title="Average number of days between orders")

In [36]:
df.hvplot.hist("order_dow" , title="Day of the week order")