In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Use seaborn's white-grid theme for every figure drawn in this notebook.
sns.set_theme(style='whitegrid')

# Folder holding the raw CSV exports, relative to the notebook's working dir.
DATA_DIR = Path('../data/raw')

# `created_at` is parsed as a datetime in both tables because the later cells
# use the `.dt` accessor on it to derive monthly periods.
users = pd.read_csv(
    DATA_DIR.joinpath('users.csv'),
    parse_dates=['created_at'],
)
orders = pd.read_csv(
    DATA_DIR.joinpath('orders.csv'),
    parse_dates=['created_at'],
)

In [None]:
# Cohort retention: for every signup month, the percentage of that month's
# users who placed a completed order N months after signing up.

# Signup cohort = calendar month of the user's `created_at`.
users['cohort_month'] = users['created_at'].dt.to_period('M')

# Only completed orders count as activity. `.copy()` gives an independent
# frame so adding `order_month` does not write into a slice of `orders`.
completed = orders[orders['status'] == 'completed'].copy()
completed['order_month'] = completed['created_at'].dt.to_period('M')

# Attach each order's signup cohort, then compute months since signup.
# - Both frames carry an `id` column, so the merged frame exposes them as
#   `id_x` (order) and `id_y` (user) through pandas' default suffixes.
# - Rows missing either month are dropped: they cannot be placed on the grid,
#   and subtracting periods with a NaT yields NaT, which has no `.n`.
# - The month difference uses vectorized year/month arithmetic rather than a
#   per-row Python `apply` over offset objects; the result is an integer.
merged = (
    completed
    .merge(users[['id', 'cohort_month']], left_on='user_id', right_on='id')
    .dropna(subset=['order_month', 'cohort_month'])
    .assign(
        period_number=lambda df: (
            (df['order_month'].dt.year - df['cohort_month'].dt.year) * 12
            + (df['order_month'].dt.month - df['cohort_month'].dt.month)
        )
    )
)

# Denominator: number of users (non-null `id`) that signed up in each month.
cohort_sizes = users.groupby('cohort_month')['id'].count().rename('cohort_size')

# Numerator: distinct ordering users per (cohort, months-since-signup) cell.
# NOTE: after `reset_index()` the `user_id` column holds that distinct COUNT,
# not an identifier; the name is kept so existing references keep working.
cohort_data = merged.groupby(['cohort_month', 'period_number'])['user_id'].nunique().reset_index()
cohort_data = cohort_data.merge(cohort_sizes, on='cohort_month')
cohort_data['retention_rate'] = (cohort_data['user_id'] / cohort_data['cohort_size'] * 100).round(2)

# Wide layout for the heatmap: one row per cohort, one column per period.
cohort_pivot = cohort_data.pivot_table(
    index='cohort_month', columns='period_number', values='retention_rate'
)
# Convert the Period index to plain strings (e.g. '2023-01') for axis labels.
cohort_pivot.index = cohort_pivot.index.astype(str)
cohort_pivot.head()

In [None]:
# Heatmap of the retention grid: rows are signup cohorts, columns are months
# since signup, and each annotated cell is a retention percentage.
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(
    cohort_pivot,
    annot=True,
    fmt='.1f',
    cmap='YlGnBu',
    linewidths=0.5,
    ax=ax
)
ax.set_title('Cohort Retention Rate (%)')
ax.set_xlabel('Months Since Signup')
ax.set_ylabel('Cohort Month')
fig.tight_layout()

# savefig does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails here with FileNotFoundError.
heatmap_path = Path('../data/processed/cohort_retention.png')
heatmap_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(heatmap_path, dpi=150)
plt.show()

In [None]:
# Repeat purchase rate: among users whose FIRST completed order fell in a
# given month, the percentage who went on to place more than one completed
# order overall.

# Earliest completed order per user; its month defines the buyer cohort.
first_orders = (
    completed
    .groupby('user_id', as_index=False)
    .agg(first_order_date=('created_at', 'min'))
)
first_orders['cohort_month'] = first_orders['first_order_date'].dt.to_period('M')

# Number of completed orders per user (counts non-null order ids).
order_counts = (
    completed
    .groupby('user_id', as_index=False)
    .agg(total_orders=('id', 'count'))
)

# One row per buyer, flagged as a repeat buyer when they have 2+ orders.
buyer_data = first_orders.merge(order_counts, on='user_id')
buyer_data['is_repeat'] = buyer_data['total_orders'] > 1

# Summing the boolean flag counts the repeat buyers in each cohort.
repeat_summary = buyer_data.groupby('cohort_month').agg(
    total_buyers=('user_id', 'count'),
    repeat_buyers=('is_repeat', 'sum')
).reset_index()

repeat_summary['repeat_rate_pct'] = (repeat_summary['repeat_buyers'] / repeat_summary['total_buyers'] * 100).round(2)
# String labels for the x axis; matplotlib then treats the months as
# categories, so a month with no first-time buyers simply does not appear.
repeat_summary['cohort_month'] = repeat_summary['cohort_month'].astype(str)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(repeat_summary['cohort_month'], repeat_summary['repeat_rate_pct'], marker='o', color='purple')
ax.set_title('Repeat Purchase Rate by Cohort Month')
ax.set_ylabel('Repeat Purchase Rate (%)')
ax.set_xlabel('First Purchase Month')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()

# savefig does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails here with FileNotFoundError.
repeat_plot_path = Path('../data/processed/repeat_purchase_rate.png')
repeat_plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(repeat_plot_path, dpi=150)
plt.show()

repeat_summary