In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.delayed import delayed
from zipfile import ZipFile
import hvplot.pandas
import hvplot.dask

In [2]:
def load_zipped_csv(file_path):
    zipped_file = [delayed(pd.read_csv)(file_path)]
    return dd.from_delayed(zipped_file)

## Load the data

In [3]:
df_order_products__prior = load_zipped_csv("../data/order_products__prior.csv.zip")
df_orders = load_zipped_csv("../data/orders.csv.zip")
df_products = load_zipped_csv("../data/products.csv.zip")
df_aisles = load_zipped_csv("../data/aisles.csv.zip")
df_departments = load_zipped_csv("../data/departments.csv.zip")

In [4]:
df_orders = df_orders[df_orders["eval_set"] == "prior"]

In [26]:
df = df_orders.join(df_order_products__prior, on='order_id', how="inner", rsuffix="_")
df = df.join(df_products, on='product_id', rsuffix="_")
df = df.join(df_aisles, on='aisle_id', rsuffix="_")
df = df.join(df_departments, on='department_id', rsuffix="_")

## Datasets metadata

## Joined dataset metadata

In [27]:
df = df.compute()
df.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_id_,product_id,add_to_cart_order,reordered,product_id_,product_name,aisle_id,department_id,aisle_id_,aisle,department_id_,department
0,2539329,1,prior,1,2,8,,267793,23236,6,1,23237.0,Organic Chunky Tomato Sauce,81.0,15.0,82.0,baby accessories,16.0,dairy eggs
1,2398795,1,prior,2,3,7,15.0,253040,39409,7,1,39410.0,Gatherings Turkey & Cheese Snack Tray,1.0,20.0,2.0,specialty cheeses,21.0,missing
2,473747,1,prior,3,3,12,21.0,50089,15400,3,0,15401.0,Dry Sweeping Pad Refills With Febreze Lavender...,100.0,21.0,101.0,air fresheners candles,,
3,2254736,1,prior,4,4,7,29.0,237822,41787,12,0,41788.0,Organic Creamy Cashewmilk,91.0,16.0,92.0,baby food formula,17.0,household
4,431534,1,prior,5,4,15,28.0,45690,38293,22,0,38294.0,"Pizza, Margherita, Rice Crust",79.0,1.0,80.0,deodorants,2.0,other


In [28]:
n_rows = len(df)
n_rows

3214874

In [29]:
for col in df:
    print(col, len(df[col].unique()))

order_id 3214874
user_id 206209
eval_set 1
order_number 99
order_dow 7
order_hour_of_day 24
days_since_prior_order 32
order_id_ 338325
product_id 43614
add_to_cart_order 126
reordered 2
product_id_ 43614
product_name 43614
aisle_id 135
department_id 22
aisle_id_ 134
aisle 134
department_id_ 21
department 21


In [30]:
df[df['order_id_'] == 253040]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_id_,product_id,add_to_cart_order,reordered,product_id_,product_name,aisle_id,department_id,aisle_id_,aisle,department_id_,department
1,2398795,1,prior,2,3,7,15.0,253040,39409,7,1,39410.0,Gatherings Turkey & Cheese Snack Tray,1.0,20.0,2.0,specialty cheeses,21.0,missing
462956,2398794,27868,prior,15,6,15,8.0,253040,29432,6,1,29433.0,Deli Shaved Oven Roasted Turkey,96.0,20.0,97.0,baking supplies decor,21.0,missing
1005382,2398789,60417,prior,30,4,19,3.0,253040,34458,1,1,34459.0,Cavity Protection Regular Toothpaste Gel 6.4 O...,20.0,11.0,21.0,packaged cheese,12.0,meat seafood
1106194,2398790,66568,prior,7,3,18,24.0,253040,18564,2,1,18565.0,Dry Roasted Lightly Salted Edamame,117.0,19.0,118.0,first aid,20.0,deli
2612014,2398796,157255,prior,7,6,10,12.0,253040,23288,8,1,23289.0,Robusto! Parmesan & Romano Pasta Sauce,9.0,9.0,10.0,kitchen supplies,10.0,bulk
3239709,2398793,195278,prior,12,2,20,3.0,253040,25466,5,1,25467.0,Madagascar Pure Vanilla,17.0,13.0,18.0,bulk dried fruits vegetables,14.0,breakfast


## Straight forward data visualization

In [10]:
df_order_hour = df.groupby("order_hour_of_day").order_id.count().reset_index()
df_order_hour['percentage of orders'] = df_order_hour['order_id']/n_rows
df_order_hour.hvplot.bar(x="order_hour_of_day", y="percentage of orders", title="Percentage of orders per hour")

In [17]:
df_order_hour = df.groupby("user_id").days_since_prior_order.mean().reset_index()
df_order_hour.hvplot.hist("days_since_prior_order" , title="Average number of days between orders")

In [None]:
df_order_hour = df.groupby("order_id_").days_since_prior_order.mean().reset_index()
df_order_hour.hvplot.hist("days_since_prior_order" , title="Average number of days between orders")