In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style once, up front, so the figures drawn below share it.
sns.set_theme(style="whitegrid")

In [None]:
# Demo events export; the relative path is resolved against the current working directory.
path = Path('data/events.csv')

# Fail early with an actionable hint when the demo data has not been generated yet.
if path.exists():
    events = pd.read_csv(path)
else:
    raise FileNotFoundError(f"Missing {path}. Run: python scripts/generate_demo_datasets.py (from repo root)")

events.head()

In [None]:
# Parse each date column, then snap it to the start of its weekly ('W') period.
for prefix in ('signup', 'event'):
    parsed_dates = pd.to_datetime(events[f'{prefix}_date'])
    events[f'{prefix}_date'] = parsed_dates
    events[f'{prefix}_week'] = parsed_dates.dt.to_period('W').dt.start_time

# Whole weeks between the signup week and the event week (0 = the signup week itself).
weeks_elapsed = (events['event_week'] - events['signup_week']).dt.days // 7
events['week_index'] = weeks_elapsed.astype(int)

preview_columns = ['user_id','segment','signup_week','event_week','week_index','event']
events[preview_columns].head()

## Cohort table
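For each signup-week cohort and each week index (whole weeks elapsed since the signup week, with 0 being the signup week itself), we count the distinct active users and divide by the cohort size — the number of distinct users in that signup week — to get a retention rate.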

In [None]:
# Cohort size: distinct users seen for each signup week.
cohort_sizes = events.groupby('signup_week')['user_id'].nunique()

# Distinct users with at least one event in each (signup week, week index) cell.
active = (
    events
    .groupby(['signup_week', 'week_index'])['user_id']
    .nunique()
    .reset_index(name='active_users')
)

# Look up each row's cohort size by its signup week, then normalise to a rate.
active['cohort_size'] = active['signup_week'].map(cohort_sizes)
active['retention'] = active['active_users'].div(active['cohort_size'])
active.head()

In [None]:
# Reshape the long cohort table into a grid: one row per signup week,
# one column per week index, retention rate in each cell.
retention_matrix = (
    active
    .set_index(['signup_week', 'week_index'])['retention']
    .unstack('week_index')
    .sort_index()
)

# Preview: first 8 cohorts, first 10 week columns.
retention_matrix.head(8).iloc[:, :10]

In [None]:
# Number of weeks since signup to display (week indexes 0 .. N_WEEKS - 1).
N_WEEKS = 12

# Select the week columns by label rather than by position: a positional slice
# only equals "weeks 0-11" when the columns are exactly 0, 1, 2, ... with no
# negative or missing week indexes. reindex() returns a new frame, so
# retention_matrix itself is left untouched; absent weeks show up as blank cells.
heatmap_data = retention_matrix.reindex(columns=range(N_WEEKS))

# Show cohorts as plain dates on the y axis instead of full timestamps.
heatmap_data.index = heatmap_data.index.strftime('%Y-%m-%d')

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(heatmap_data, cmap='Blues', vmin=0, vmax=1, ax=ax)
ax.set(
    title=f'Weekly cohort retention (first {N_WEEKS} weeks)',
    xlabel='Week since signup',
    ylabel='Signup cohort (week)',
)
plt.show()

## Segment comparison
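Optional: compare average retention curves by user segment. Each curve is the unweighted mean of the per-cohort retention rates within a segment, so small and large cohorts count equally.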

In [None]:
# Number of weeks since signup to plot (week indexes 0 .. N_WEEKS - 1),
# i.e. the same 12 week columns the cohort heatmap shows.
N_WEEKS = 12
cohort_keys = ['segment', 'signup_week']

# Distinct active users per segment, signup cohort and week since signup.
seg_active = (
    events
    .groupby(cohort_keys + ['week_index'])['user_id']
    .nunique()
    .reset_index(name='active_users')
)

# Cohort size within a segment: distinct users per (segment, signup week).
seg_sizes = events.groupby(cohort_keys)['user_id'].nunique().reset_index(name='cohort_size')
seg_active = seg_active.merge(seg_sizes, on=cohort_keys, how='left')
seg_active['retention'] = seg_active['active_users'] / seg_active['cohort_size']

# Unweighted mean of the per-cohort retention rates for each segment and week.
# NOTE(review): a cohort-week with zero active users has no row in seg_active,
# so it is left out of this mean rather than counted as 0 - confirm that is
# the intended definition of "average retention".
seg_curve = (
    seg_active.groupby(['segment', 'week_index'])['retention'].mean().reset_index()
)

# Keep weeks 0 .. N_WEEKS - 1. The previous `<= 12` filter plotted 13 weeks
# (0..12), contradicting the "first 12 weeks" title, and let any negative
# week index through.
seg_curve_window = seg_curve[seg_curve['week_index'].between(0, N_WEEKS - 1)]

fig, ax = plt.subplots(figsize=(7, 4))
sns.lineplot(data=seg_curve_window, x='week_index', y='retention', hue='segment', ax=ax)
ax.set(
    title=f'Average retention by segment (first {N_WEEKS} weeks)',
    xlabel='Week since signup',
    ylabel='Retention',
    ylim=(0, 1),
)
plt.show()