In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.delayed import delayed
from zipfile import ZipFile
import hvplot.pandas
import hvplot.dask
import random
from datetime import date, datetime, timedelta
from tqdm import tqdm

In [None]:
def load_zipped_csv(file_path):
    """Expose one CSV file as a dask DataFrame.

    The ``pd.read_csv`` call is wrapped with ``delayed`` and the resulting
    single delayed object is handed to ``dd.from_delayed``.

    Parameters
    ----------
    file_path : str
        Path passed unchanged to ``pd.read_csv``.
        NOTE(review): callers pass ``*.csv.zip`` paths, presumably relying on
        pandas inferring the compression from the extension - confirm.

    Returns
    -------
    dask.dataframe.DataFrame
        Built from exactly one delayed read.
        NOTE(review): no ``meta`` is supplied, so dask may evaluate the read
        once up front to infer the columns - confirm against the dask docs.
    """
    lazy_read = delayed(pd.read_csv)(file_path)
    return dd.from_delayed([lazy_read])

In [None]:
def add_total_number_of_days(df):
    """Attach each user's summed ``days_since_prior_order`` to every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``days_since_prior_order`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` merged on ``user_id`` with the per-user sum,
        exposed as the extra column ``total_days_on_platform``.
    """
    per_user_total = (
        df.groupby("user_id")["days_since_prior_order"]
        .sum()
        .rename("total_days_on_platform")
        .reset_index()
    )
    # Cast the key to int before joining it back onto the rows.
    per_user_total = per_user_total.astype({"user_id": int})
    return df.merge(per_user_total, on=["user_id"])

In [None]:
def add_initial_date(df):
    """Unimplemented placeholder: ignores ``df`` and returns ``None``."""
    return None

## Load the data

In [None]:
# Open every source table through load_zipped_csv; the paths are relative to
# the directory the notebook is run from.
(
    df_order_products,
    df_orders,
    df_products,
    df_aisles,
    df_departments,
) = map(
    load_zipped_csv,
    (
        "../data/order_products__prior.csv.zip",
        "../data/orders.csv.zip",
        "../data/products.csv.zip",
        "../data/aisles.csv.zip",
        "../data/departments.csv.zip",
    ),
)

In [None]:
# Keep only the rows whose eval_set is "prior" and force order_id to int so
# it can serve as the merge key.
df_orders = df_orders[df_orders["eval_set"] == "prior"]
# Debug aid (disabled): restrict the run to a handful of users.
# df_orders = df_orders[df_orders["user_id"].isin([4, 5, 6, 7])]
df_orders = df_orders.assign(order_id=df_orders["order_id"].astype(int))


# Collapse the order/product rows into one row per order holding the list of
# its product ids. The result is computed here, so from this point on
# df_order_products is an in-memory pandas frame.
df_order_products = (
    df_order_products.groupby("order_id")["product_id"]
    .apply(list)
    .compute()
    .reset_index()
    .astype({"order_id": int})
)

In [None]:
# Attach the per-order product list to every order and materialise the result.
df = (
    df_orders
    .merge(df_order_products, how="inner", on=["order_id"])
    .compute()
)

# Disabled enrichment joins, kept for reference.
# df = df.join(df_products, on='product_id', rsuffix="_")
# df = df.join(df_aisles, on='aisle_id', rsuffix="_")
# df = df.join(df_departments, on='department_id', rsuffix="_")

In [None]:
# Add the per-user total_days_on_platform column.
# NOTE: run this cell once per fresh `df`; running it again merges a second
# copy of the column, which pandas disambiguates with _x/_y suffixes.
df = df.pipe(add_total_number_of_days)

## Joined dataset metadata

In [None]:
# Preview the first five rows (the default size of head()).
df.head()

In [None]:
# Row count of the joined frame.
n_rows = df.shape[0]
n_rows

In [None]:
# Print the number of distinct values found in every column.
for col in df:
    try:
        print(col, len(df[col].unique()))
    except TypeError:
        # unique() needs hashable values; product_id holds Python lists, so it
        # cannot be counted this way. Report the skip instead of hiding it,
        # and let any other kind of error surface normally.
        print(col, "skipped: values are not hashable")

## Plots on orders

In [None]:
# Share of all orders placed in each hour of the day.
df_order_hour = df.groupby("order_hour_of_day").order_id.count().reset_index()
# Scale to 0-100 so the values match the "percentage" label, and divide by the
# current row count so the cell does not depend on a variable from another cell.
df_order_hour['percentage of orders'] = 100 * df_order_hour['order_id'] / len(df)
df_order_hour.hvplot.bar(x="order_hour_of_day", y="percentage of orders", title="Percentage of orders per hour")

In [None]:
# One value per user: the mean of that user's days_since_prior_order.
mean_gap_by_user = df.groupby("user_id")["days_since_prior_order"].mean()
df_ = mean_gap_by_user.reset_index()
df_.hvplot.hist(y="days_since_prior_order", title="Average number of days between orders")

In [None]:
# Histogram of the order_dow column over all orders.
df.hvplot.hist(
    y="order_dow",
    title="Day of the week order",
)

In [None]:
# One value per user: how many orders that user has in `df`.
df_ = df.groupby("user_id").order_number.count().reset_index()
# The plotted quantity is a per-user count, not an average, so the title says so.
df_.hvplot.hist("order_number", title="Number of orders per client", bins=50)

In [None]:
# One value per user: the sum of that user's days_since_prior_order.
total_gap_by_user = df.groupby("user_id")["days_since_prior_order"].sum()
df_ = total_gap_by_user.reset_index()
df_.hvplot.hist(
    y="days_since_prior_order",
    title="Number of days since clients uses Instacart",
    bins=50,
)

In [None]:
# Rebuild an approximate calendar date for every order from the day gaps.
#
# A row whose days_since_prior_order is NaN starts a new run and is dated
# today - (total_days_on_platform - 2) days. Every other row is dated relative
# to the row directly above it, so the result is only meaningful while the
# rows of one user are contiguous, in order, and begin with the NaN row.
# NOTE(review): the "- 2" offset is not explained anywhere in the notebook -
# confirm the intent.

# Only referenced by the disabled alternative kept further down.
test = df[df['days_since_prior_order'].isna()]
dates = []

delta_days = df["days_since_prior_order"].to_list()
total_days = df["total_days_on_platform"].to_list()

# Read the clock once: every run is then anchored to the same day even if the
# loop is still running at midnight, and the call is not repeated per row.
today = date.today()

for day, total_day in tqdm(zip(delta_days, total_days), total=len(df), position=0, leave=True):
    if np.isnan(day):
        dates.append(today - timedelta(days=total_day - 2))
    elif not dates:
        # Without a preceding NaN row there is no date to build on.
        raise ValueError(
            "the first row has a days_since_prior_order value; "
            "each run of rows must start with a NaN row"
        )
    else:
        dates.append(dates[-1] + timedelta(days=day))

# for i in tqdm(range(len(test)), position=0, leave=True):
#     dates.append(date.today() - timedelta(days=test.iloc[i]['total_days_on_platform'] + random.randint(-7, 7)))
# test["date"] = dates

df["date"] = dates

In [None]:
# Number of orders per reconstructed date, limited to the 120 days before today.
df_ = df.groupby("date").order_number.count().reset_index()
df_["date"] = pd.to_datetime(df_["date"])
today = date.today()
order_day = df_["date"].dt.date
# Inclusive lower bound: today-120 .. today-1 is exactly 120 days, matching the
# title (a strict ">" on both sides would keep only 119).
df_ = df_[(order_day < today) & (order_day >= today - timedelta(days=120))]
df_.hvplot(y="order_number", x="date", title="Number of orders in the last 120 days")

In [None]:
# Smoothed daily order counts for the 7 days before today.
df_ = df.groupby("date").order_number.count().reset_index()
df_["date"] = pd.to_datetime(df_["date"])
# Resample to one row per calendar day, then smooth with a rolling mean.
# NOTE(review): the rolling window is 5 days while the plot is titled
# "Last 7 days average" - confirm which of the two is intended.
df_ = (
    df_.resample("1d", on="date")
    .sum()
    .fillna(0)
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index()
)
today = date.today()
order_day = df_["date"].dt.date
in_last_week = (order_day < today) & (order_day > today - timedelta(days=8))
# assign() builds a new frame, so the string conversion is not written into a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
df_ = df_[in_last_week].assign(date=lambda frame: frame["date"].astype(str))
df_.hvplot.bar(y="order_number", x="date", title="Last 7 days average")

## Plots on products

In [None]:
# Flatten the per-order product lists into one long list of product ids.
l = []
for basket in tqdm(df["product_id"].to_list()):
    l.extend(basket)

In [None]:
# Count how often each product id occurs in `l`, most frequent first, then
# attach the product, aisle and department attributes.
# The counts are named explicitly: the cells below read a "count" column, which
# value_counts() only produces by itself on pandas >= 2.0.
df_product_occurrence = (
    pd.DataFrame(l, columns=["product_id"])
    .value_counts()
    .reset_index(name="count")
)
df_product_occurrence = df_product_occurrence.merge(df_products.compute(), on=["product_id"])
df_product_occurrence = df_product_occurrence.merge(df_aisles.compute(), on=["aisle_id"])
df_product_occurrence = df_product_occurrence.merge(df_departments.compute(), on=["department_id"])
df_product_occurrence

In [None]:
# Bar chart of the first five rows of the occurrence table.
df_product_occurrence.head(5).hvplot.bar(
    y="count",
    x="product_name",
    title="Top 5 products",
)

In [None]:
# Sum the product counts per aisle and rank the aisles from most to least frequent.
df_top_aisles = (
    df_product_occurrence.groupby("aisle")["count"]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)
df_top_aisles.head(5).hvplot.bar(y="count", x="aisle", title="Top 5 aisles")

In [None]:
# Sum the product counts per department and rank the departments from most to
# least frequent.
df_top_departments = (
    df_product_occurrence.groupby("department")["count"]
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
)
df_top_departments.head(5).hvplot.bar(y="count", x="department", title="Top 5 departments")