In [36]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.delayed import delayed
from zipfile import ZipFile
import hvplot.pandas
import hvplot.dask
import random
from datetime import date, datetime, timedelta
from tqdm import tqdm

In [2]:
def load_zipped_csv(file_path):
    zipped_file = [delayed(pd.read_csv)(file_path)]
    return dd.from_delayed(zipped_file)

In [3]:
def add_total_number_of_days(df):
    df_to_join = df.groupby("user_id").days_since_prior_order.sum().reset_index().rename(columns={"days_since_prior_order":"total_days_on_platform"})
    df_to_join['user_id'] = df_to_join['user_id'].astype(int)
    df = pd.merge(df, df_to_join, on=["user_id"])
    return df

In [16]:
def add_initial_date(df):
    pass

## Load the data

In [4]:
df_order_products = load_zipped_csv("../data/order_products__prior.csv.zip")
df_orders = load_zipped_csv("../data/orders.csv.zip")
# df_products = load_zipped_csv("../data/products.csv.zip")
# df_aisles = load_zipped_csv("../data/aisles.csv.zip")
# df_departments = load_zipped_csv("../data/departments.csv.zip")

In [5]:
df_orders = df_orders[df_orders["eval_set"] == "prior"]
# df_orders = df_orders[df_orders["user_id"].isin([4, 5, 6, 7])]
df_orders['order_id'] = df_orders['order_id'].astype(int)


df_order_products = df_order_products.groupby("order_id")["product_id"].apply(list).compute().to_frame().reset_index()
df_order_products['order_id'] = df_order_products['order_id'].astype(int)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  df_order_products = df_order_products.groupby("order_id")["product_id"].apply(list).compute().to_frame().reset_index()


In [6]:
df = df_orders.merge(df_order_products, on=["order_id"]).compute()

# df = df.join(df_products, on='product_id', rsuffix="_")
# df = df.join(df_aisles, on='aisle_id', rsuffix="_")
# df = df.join(df_departments, on='department_id', rsuffix="_")

In [7]:
df = add_total_number_of_days(df)

## Joined dataset metadata

In [8]:
df.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,total_days_on_platform
0,2539329,1,prior,1,2,8,,"[196, 14084, 12427, 26088, 26405]",176.0
1,2398795,1,prior,2,3,7,15.0,"[196, 10258, 12427, 13176, 26088, 13032]",176.0
2,473747,1,prior,3,3,12,21.0,"[196, 12427, 10258, 25133, 30450]",176.0
3,2254736,1,prior,4,4,7,29.0,"[196, 12427, 10258, 25133, 26405]",176.0
4,431534,1,prior,5,4,15,28.0,"[196, 12427, 10258, 25133, 10326, 17122, 41787...",176.0


In [9]:
n_rows = len(df)
n_rows

3214874

In [10]:
for col in df:
    try:
        print(col, len(df[col].unique()))
    except:
        pass

order_id 3214874
user_id 206209
eval_set 1
order_number 99
order_dow 7
order_hour_of_day 24
days_since_prior_order 32
total_days_on_platform 366


## Straight forward data visualization

In [11]:
df_order_hour = df.groupby("order_hour_of_day").order_id.count().reset_index()
df_order_hour['percentage of orders'] = df_order_hour['order_id']/n_rows
df_order_hour.hvplot.bar(x="order_hour_of_day", y="percentage of orders", title="Percentage of orders per hour")

In [12]:
df_ = df.groupby("user_id").days_since_prior_order.mean().reset_index()
df_.hvplot.hist("days_since_prior_order" , title="Average number of days between orders")

In [13]:
df.hvplot.hist("order_dow" , title="Day of the week order")

In [14]:
df_ = df.groupby("user_id").order_number.count().reset_index()
df_.hvplot.hist("order_number" , title="Average number of orders per clients", bins=50)

In [15]:
df_ = df.groupby("user_id").days_since_prior_order.sum().reset_index()
df_.hvplot.hist("days_since_prior_order" , title="Number of days since clients uses Instacart", bins=50)

In [80]:
test = df[df['days_since_prior_order'].isna()]
dates = []

delta_days = df["days_since_prior_order"].to_list()
total_days = df["total_days_on_platform"].to_list()

for day, total_day in tqdm(zip(delta_days, total_days), total=len(df), position=0, leave=True):
    if(np.isnan(day)):
        dates.append(date.today() - timedelta(days=total_day))
    else:
         dates.append(dates[-1] + timedelta(days=day))

# for i in tqdm(range(len(test)), position=0, leave=True):
#     dates.append(date.today() - timedelta(days=test.iloc[i]['total_days_on_platform'] + random.randint(-7, 7)))
# test["date"] = dates

df["date"] = dates

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3214874/3214874 [00:05<00:00, 587176.80it/s]


In [81]:
df_ = df.groupby("date").order_number.count().reset_index()
df_.hvplot(y="order_number", x="date")

In [79]:
df_

Unnamed: 0,date,order_number
0,2021-09-16,1
1,2021-09-17,1
2,2021-09-18,1
3,2021-09-19,4
4,2021-09-20,8
...,...,...
724,2023-09-10,683
725,2023-09-11,702
726,2023-09-12,593
727,2023-09-13,601
