<a href="https://colab.research.google.com/github/nihemelandu/ecommerce_analytics-churn_prediction/blob/main/Pass_1_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PASS 1: BASELINE CHURN PREDICTION MODEL



STAGE 1: DATA LOADING & PREPARATION

In [None]:
!pip install gcsfs --quiet
from google.colab import auth
auth.authenticate_user()

In [None]:
# ============================================================================
# CELL 1: ENVIRONMENT SETUP
# ============================================================================

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)
import warnings
warnings.filterwarnings('ignore')

# Configuration — tunable values shared by the rest of the notebook
RANDOM_SEED = 42
SAMPLE_RATE = 0.05  # fraction of data rows kept from each monthly file
# Two-digit percentage used in the cached sample's file name. Derived from
# SAMPLE_RATE so the two cannot drift apart (0.05 -> "05"). Rates below 1%
# would collapse to "00"/"01" and need a finer-grained suffix.
SAMPLE_RATE_str = f"{round(SAMPLE_RATE * 100):02d}"
MIN_SAMPLE_SIZE = 100000  # a combined sample smaller than this triggers a warning
# NOTE(review): the next three constants are not referenced in the cells
# reviewed here — confirm they are used further down or remove them.
MAX_REASONABLE_PRICE = 10000
EXPECTED_DATE_START = '2019-10-01'
EXPECTED_DATE_END = '2020-04-30'
# Cache location of the combined sample (checked before re-sampling from GCS)
SAMPLE_PATH = f'/content/combined_sample{SAMPLE_RATE_str}.csv'
np.random.seed(RANDOM_SEED)  # seed NumPy's global RNG

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

print("=" * 70)
print("PASS 1: BASELINE CHURN PREDICTION MODEL")
print("=" * 70)
print("\n✓ Environment setup complete")
print(f"✓ Random seed: {RANDOM_SEED}")
print(f"✓ Sample rate: {SAMPLE_RATE * 100}%")
print(f"✓ Libraries imported successfully\n")

PASS 1: BASELINE CHURN PREDICTION MODEL

✓ Environment setup complete
✓ Random seed: 42
✓ Sample rate: 5.0%
✓ Libraries imported successfully



In [None]:
# ============================================================================
# CELL 2: DATA LOADING
# ============================================================================
# Loads the cached sample from SAMPLE_PATH when it exists. Otherwise draws a
# random SAMPLE_RATE subset of each monthly CSV, combines the subsets into
# `sample_df`, and writes the result back to SAMPLE_PATH so the next run hits
# the cache. Raises ValueError when no file could be loaded.

print("STAGE 1: DATA LOADING & PREPARATION")
print("-" * 70)
print("\nLoading data from CSV files...\n")

if os.path.exists(SAMPLE_PATH):
    print(f"Found existing sample at '{SAMPLE_PATH}' — loading...")
    sample_df = pd.read_csv(SAMPLE_PATH)
    print(f"✓ Loaded {len(sample_df):,} rows, {sample_df.shape[1]} columns from existing sample")
else:
    # Re-seed so the row sampling below is reproducible regardless of any
    # earlier use of NumPy's global RNG
    np.random.seed(RANDOM_SEED)

    # One CSV per month in the source bucket
    bucket = 'gs://kaggle-rees46-ecommerce-datasets'
    months = ['2019-Oct', '2019-Nov', '2019-Dec', '2020-Jan',
              '2020-Feb', '2020-Mar', '2020-Apr']
    files = [f'{bucket}/{month}.csv' for month in months]
    samples = []

    for file in files:
        print(f"\nLoading {file}...")
        try:
            # Row 0 is the header and is always kept; every other row is
            # skipped with probability 1 - SAMPLE_RATE
            sample = pd.read_csv(file, skiprows=lambda i: i > 0 and np.random.random() > SAMPLE_RATE)
            print(f"✓ Loaded {len(sample):,} rows from {file}")
            samples.append(sample)
        except FileNotFoundError:
            print(f"✗ {file} not found")
        except Exception as e:
            # Best effort: report the failure and continue with the other files
            print(f"✗ Error loading {file}: {e}")

    if not samples:
        raise ValueError("No data could be loaded!")

    # Combine samples
    sample_df = pd.concat(samples, ignore_index=True)
    print(f"\n✓ Combined sample: {len(sample_df):,} rows, {sample_df.shape[1]} columns")

    if len(sample_df) < MIN_SAMPLE_SIZE:
        print(f"⚠️  Warning: Sample size below minimum {MIN_SAMPLE_SIZE:,}")

    if len(samples) == len(files):
        # Write to the same path the cache check above reads from; the
        # original wrote to a relative path that only matched SAMPLE_PATH
        # when the working directory happened to be /content.
        output_path = SAMPLE_PATH
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
        sample_df.to_csv(output_path, index=False)
        print(f"💾 Saved combined sample to '{output_path}'")
    else:
        # Do not cache an incomplete sample: later runs would load it
        # silently and never retry the files that failed.
        print(f"⚠️  Warning: only {len(samples)} of {len(files)} files loaded — sample not cached")

STAGE 1: DATA LOADING & PREPARATION
----------------------------------------------------------------------

Loading data from CSV files...


Loading gs://kaggle-rees46-ecommerce-datasets/2019-Oct.csv...


KeyboardInterrupt: 

In [None]:
# ============================================================================
# CELL 3: INITIAL DATA PROFILE
# ============================================================================
# Read-only profile of `sample_df`: shape, dtypes, missing values, the span of
# event_time, the event_type mix and the first three rows. The frame itself is
# not modified here.

print("=" * 70)
print("INITIAL DATA PROFILE")
print("=" * 70)

# Dataset shape
print(f"\n📊 Dataset Shape: {sample_df.shape[0]:,} rows × {sample_df.shape[1]} columns\n")

# Column information
print("-" * 70)
print("Column Overview:")
print("-" * 70)
print(sample_df.dtypes)

# Missing data summary
print("\n" + "-" * 70)
print("Missing Data Summary:")
print("-" * 70)
missing_counts = sample_df.isnull().sum()  # computed once, reused for the percentage
missing_summary = pd.DataFrame({
    'Column': sample_df.columns,
    'Missing_Count': missing_counts,
    'Missing_Percent': (missing_counts / len(sample_df) * 100).round(2)
})
columns_with_missing = missing_summary[missing_summary['Missing_Count'] > 0]
if columns_with_missing.empty:
    # Avoid printing an "Empty DataFrame" stub ahead of the all-clear message
    print("✓ No missing values detected")
else:
    print(columns_with_missing.to_string(index=False))

# Date range
print("\n" + "-" * 70)
print("Temporal Coverage:")
print("-" * 70)
if 'event_time' in sample_df.columns:
    try:
        # Probe a small slice first so an unparseable column fails fast,
        # then parse the full column exactly once (sample_df is left as-is).
        sample_dates = pd.to_datetime(sample_df['event_time'].head(1000))
        event_times_parsed = pd.to_datetime(sample_df['event_time'])
        date_min = event_times_parsed.min()
        date_max = event_times_parsed.max()
        date_range_days = (date_max - date_min).days

        print(f"Start Date: {date_min}")
        print(f"End Date: {date_max}")
        print(f"Coverage: {date_range_days} days")
    except (ValueError, TypeError) as e:
        # Only parsing failures are expected here; anything else should surface
        print(f"⚠️  Could not parse event_time column: {e}")
else:
    print("⚠️  No 'event_time' column found")

# Event type distribution
print("\n" + "-" * 70)
print("Event Type Distribution:")
print("-" * 70)
if 'event_type' in sample_df.columns:
    event_counts = sample_df['event_type'].value_counts()
    # Same shares as value_counts(normalize=True), without a second pass
    event_pct = (event_counts / event_counts.sum() * 100).round(2)

    event_summary = pd.DataFrame({
        'Event_Type': event_counts.index,
        'Count': event_counts.values,
        'Percentage': event_pct.values
    })
    print(event_summary.to_string(index=False))
else:
    print("⚠️  No 'event_type' column found")

# Quick peek at data
print("\n" + "-" * 70)
print("Sample Records (first 3 rows):")
print("-" * 70)
print(sample_df.head(3).to_string())

print("\n" + "=" * 70)
print("✓ Data profiling complete")
print("=" * 70 + "\n")

INITIAL DATA PROFILE

📊 Dataset Shape: 20,581,330 rows × 9 columns

----------------------------------------------------------------------
Column Overview:
----------------------------------------------------------------------
event_time        object
event_type        object
product_id         int64
category_id        int64
category_code     object
brand             object
price            float64
user_id            int64
user_session      object
dtype: object

----------------------------------------------------------------------
Missing Data Summary:
----------------------------------------------------------------------
       Column  Missing_Count  Missing_Percent
category_code        3257979            15.83
        brand        2782529            13.52
 user_session             16             0.00

----------------------------------------------------------------------
Temporal Coverage:
----------------------------------------------------------------------
Start Date: 2019-10-01 

In [None]:
# ============================================================================
# CELL 4: MINIMAL DATA CLEANING
# ============================================================================
# Rebinds `sample_df` to a cleaned frame: event_time parsed as UTC datetimes,
# rows with null critical fields removed, non-positive prices removed, exact
# duplicates removed, and nulls in brand/category_code replaced by 'Unknown'.

print("=" * 70)
print("MINIMAL DATA CLEANING")
print("=" * 70)

print(f"\nStarting rows: {len(sample_df):,}")

# Store original count for comparison
original_count = len(sample_df)

# Step 1: Convert event_time to datetime
print("\n[1/5] Converting event_time to datetime...")
sample_df['event_time'] = pd.to_datetime(sample_df['event_time'], utc=True)
print("✓ event_time converted to datetime")

# Step 2: Drop rows with null user_id, event_time, product_id, or event_type
print("\n[2/5] Dropping rows with null critical fields...")
critical_fields = ['user_id', 'event_time', 'product_id', 'event_type']
before_drop = len(sample_df)
sample_df = sample_df.dropna(subset=critical_fields)
dropped_critical = before_drop - len(sample_df)
print(f"✓ Dropped {dropped_critical:,} rows with null critical fields")

# Step 3: Remove negative or zero prices
# (rows whose price is NaN also fail the `> 0` test and are counted here)
print("\n[3/5] Removing invalid prices...")
before_price = len(sample_df)
sample_df = sample_df[sample_df['price'] > 0]
dropped_price = before_price - len(sample_df)
print(f"✓ Dropped {dropped_price:,} rows with price ≤ 0")

# Step 4: Remove exact duplicates
print("\n[4/5] Removing exact duplicates...")
before_dupes = len(sample_df)
sample_df = sample_df.drop_duplicates()
dropped_dupes = before_dupes - len(sample_df)
print(f"✓ Dropped {dropped_dupes:,} exact duplicate rows")

# Step 5: Fill nulls in brand and category_code with 'Unknown'
# .assign() builds a new frame instead of writing columns into the filtered
# slice produced above, which avoids the chained-assignment hazard that the
# notebook-wide warnings filter would otherwise hide.
print("\n[5/5] Filling non-critical nulls...")
sample_df = sample_df.assign(
    brand=sample_df['brand'].fillna('Unknown'),
    category_code=sample_df['category_code'].fillna('Unknown'),
)
print("✓ Filled nulls in 'brand' and 'category_code' with 'Unknown'")

# Summary
rows_removed = original_count - len(sample_df)
# Guard the ratio so an empty input frame reports 0% instead of raising
retention_pct = len(sample_df) / original_count * 100 if original_count else 0.0
print("\n" + "=" * 70)
print("CLEANING SUMMARY")
print("=" * 70)
print(f"Starting rows:        {original_count:>12,}")
print(f"Rows removed:         {rows_removed:>12,}")
print(f"Final rows:           {len(sample_df):>12,}")
print(f"Retention rate:       {retention_pct:>11.2f}%")
print("=" * 70)

# Verify no nulls remain in critical fields
print("\n✓ Verification: Critical fields")
critical_nulls = sample_df[critical_fields].isnull().sum()
if critical_nulls.sum() == 0:
    print("  → No nulls in critical fields ✓")
else:
    print("  ⚠️  WARNING: Nulls still present:")
    print(critical_nulls[critical_nulls > 0])

# Verify data types
print("\n✓ Verification: Data types")
print(f"  → event_time: {sample_df['event_time'].dtype}")
print(f"  → price range: ${sample_df['price'].min():.2f} - ${sample_df['price'].max():.2f}")

print("\n" + "=" * 70)
print("✓ Data cleaning complete")
print("=" * 70 + "\n")

MINIMAL DATA CLEANING

Starting rows: 2,057,818

[1/5] Converting event_time to datetime...
✓ event_time converted to datetime

[2/5] Dropping rows with null critical fields...
✓ Dropped 0 rows with null critical fields

[3/5] Removing invalid prices...
✓ Dropped 3,571 rows with price ≤ 0

[4/5] Removing exact duplicates...
✓ Dropped 72 exact duplicate rows

[5/5] Filling non-critical nulls...
✓ Filled nulls in 'brand' and 'category_code' with 'Unknown'

CLEANING SUMMARY
Starting rows:           2,057,818
Rows removed:                3,643
Final rows:              2,054,175
Retention rate:             99.82%

✓ Verification: Critical fields
  → No nulls in critical fields ✓

✓ Verification: Data types
  → event_time: datetime64[ns, UTC]
  → price range: $0.14 - $2574.07

✓ Data cleaning complete



STAGE 2: EXPLORATORY DATA ANALYSIS

In [None]:
# ============================================================================
# CELL 5: PURCHASE OVERVIEW
# ============================================================================
# Isolates purchase events into `purchases` and reports how many users ever
# bought something relative to all users seen in the sample.

print("\n" + "=" * 70)
print("STAGE 2: EXPLORATORY DATA ANALYSIS")
print("=" * 70)

print("\n" + "-" * 70)
print("PURCHASE OVERVIEW")
print("-" * 70)

# Independent copy of the purchase rows, so later edits never touch sample_df
is_purchase = sample_df['event_type'].eq('purchase')
purchases = sample_df.loc[is_purchase].copy()

# Headline counts
total_purchases = purchases.shape[0]
unique_purchasers = purchases['user_id'].nunique()
total_users = sample_df['user_id'].nunique()

# Share of all users with at least one purchase, in percent
purchase_conversion_rate = (unique_purchasers / total_users) * 100

print("\n".join([
    f"\nTotal Events:           {len(sample_df):>12,}",
    f"Total Purchases:        {total_purchases:>12,}",
    f"Unique Purchasers:      {unique_purchasers:>12,}",
    f"Total Unique Users:     {total_users:>12,}",
    f"Purchase Conversion:    {purchase_conversion_rate:>11.2f}%",
]))

print("\n" + "-" * 70)
print("✓ Purchase overview complete")
print("-" * 70 + "\n")


STAGE 2: EXPLORATORY DATA ANALYSIS

----------------------------------------------------------------------
PURCHASE OVERVIEW
----------------------------------------------------------------------

Total Events:              2,054,175
Total Purchases:              34,146
Unique Purchasers:            32,675
Total Unique Users:        1,395,886
Purchase Conversion:           2.34%

----------------------------------------------------------------------
✓ Purchase overview complete
----------------------------------------------------------------------



In [None]:
# Do we have repeat purchasers?
# Keep only purchase rows of users with more than one purchase, then display
# each such user's purchase count, largest first.
purchase_counts_by_user = purchases['user_id'].value_counts()
multi_purchase_user_ids = purchase_counts_by_user[purchase_counts_by_user > 1].index
purchases_df = purchases.loc[purchases['user_id'].isin(multi_purchase_user_ids)]
purchases_df.groupby('user_id')['event_type'].count().sort_values(ascending=False)

Unnamed: 0_level_0,event_type
user_id,Unnamed: 1_level_1
568782581,11
549109608,7
561242462,7
572994775,7
513320236,7
...,...
542092092,2
542065645,2
542035803,2
541814406,2


In [None]:
# ============================================================================
# CELL 6: USER SEGMENTATION
# ============================================================================
# Describes how many events each user generates and how the user base splits
# into purchasers and non-purchasers.

print("-" * 70)
print("USER SEGMENTATION")
print("-" * 70)

# Number of events recorded for each user
events_per_user = sample_df.groupby('user_id').size()
user_q25, user_q50, user_q75 = events_per_user.quantile([0.25, 0.50, 0.75])

print(f"\nTotal Unique Users:     {total_users:>12,}")
print("\n".join([
    f"\nEvents per User (Quartiles):",
    f"  25th percentile:      {user_q25:>12.0f}",
    f"  50th percentile:      {user_q50:>12.0f}",
    f"  75th percentile:      {user_q75:>12.0f}",
    f"  Mean:                 {events_per_user.mean():>12.1f}",
    f"  Max:                  {events_per_user.max():>12.0f}",
]))

# Everyone who is not a purchaser is a non-purchaser; shares are in percent
non_purchasers = total_users - unique_purchasers
purchaser_pct = (unique_purchasers / total_users) * 100
non_purchaser_pct = (non_purchasers / total_users) * 100

print("\n".join([
    f"\nUser Activity Split:",
    f"  Purchasers:           {unique_purchasers:>12,} ({purchaser_pct:.2f}%)",
    f"  Non-purchasers:       {non_purchasers:>12,} ({non_purchaser_pct:.2f}%)",
]))

print("\n" + "-" * 70)
print("✓ User segmentation complete")
print("-" * 70 + "\n")

----------------------------------------------------------------------
USER SEGMENTATION
----------------------------------------------------------------------

Total Unique Users:        1,395,886

Events per User (Quartiles):
  25th percentile:                 1
  50th percentile:                 1
  75th percentile:                 2
  Mean:                          1.5
  Max:                           996

User Activity Split:
  Purchasers:                 32,675 (2.34%)
  Non-purchasers:          1,363,211 (97.66%)

----------------------------------------------------------------------
✓ User segmentation complete
----------------------------------------------------------------------



In [None]:
# ============================================================================
# CELL 6b: USER SEGMENTATION FOR PURCHASERS
# ============================================================================
# Same breakdown as the user segmentation, restricted to purchase events:
# purchases per purchaser, the repeat vs one-time split, and the purchase-count
# distribution among repeat purchasers.

print("-" * 70)
print("USER SEGMENTATION FOR PURCHASERS")
print("-" * 70)

# Purchase events recorded for each purchaser
events_per_purchaser = purchases.groupby('user_id').size()
purchaser_q25, purchaser_q50, purchaser_q75 = events_per_purchaser.quantile([0.25, 0.50, 0.75])

print(f"\nTotal Unique Users:     {total_users:>12,}")
print("\n".join([
    f"\nEvents per Purchaser (Quartiles):",
    f"  25th percentile:      {purchaser_q25:>12.0f}",
    f"  50th percentile:      {purchaser_q50:>12.0f}",
    f"  75th percentile:      {purchaser_q75:>12.0f}",
    f"  Mean:                 {events_per_purchaser.mean():>12.1f}",
    f"  Max:                  {events_per_purchaser.max():>12.0f}",
]))

# Split purchasers by purchase count. Both results are Series of counts
# indexed by user_id (more than one purchase vs exactly one).
purchase_counts = purchases.user_id.value_counts()
repeat_purchasers = purchase_counts.loc[purchase_counts > 1]
one_time_purchasers = purchase_counts.loc[purchase_counts == 1]

purchaser_total = purchases.user_id.nunique()
repeat_purchasers_pct = (len(repeat_purchasers) / purchaser_total) * 100
one_time_purchasers_pct = (len(one_time_purchasers) / purchaser_total) * 100

print("\n".join([
    f"\nPurchaser Activity Split:",
    f"  Repeat Purchasers:           {len(repeat_purchasers):>12,} ({repeat_purchasers_pct:.2f}%)",
    f"  One-time Purchasers:       {len(one_time_purchasers):>12,} ({one_time_purchasers_pct:.2f}%)",
]))

# Purchase rows of repeat purchasers, and each one's purchase count (largest first)
is_repeat_row = purchases.user_id.isin(repeat_purchasers.index)
repeat_purchasers_df = purchases.loc[is_repeat_row]
events_per_repeat_purchaser = (
    repeat_purchasers_df
    .groupby('user_id')['event_type']
    .count()
    .sort_values(ascending=False)
)
repeat_q25, repeat_q50, repeat_q75 = events_per_repeat_purchaser.quantile([0.25, 0.50, 0.75])

print(f"\nEvents per Repeat Purchaser (Distribution):")
print(events_per_repeat_purchaser.value_counts())
print("\n".join([
    "\nEvents per Repeat Purchaser (Quartiles):",
    f"  25th percentile:      {repeat_q25:>12.0f}",
    f"  50th percentile:      {repeat_q50:>12.0f}",
    f"  75th percentile:      {repeat_q75:>12.0f}",
    f"  Mean:                 {events_per_repeat_purchaser.mean():>12.1f}",
    f"  Max:                  {events_per_repeat_purchaser.max():>12.0f}",
]))

print("\n" + "-" * 70)
print("✓ User segmentation for purchasers complete")
print("-" * 70 + "\n")

----------------------------------------------------------------------
USER SEGMENTATION FOR PURCHASERS
----------------------------------------------------------------------

Total Unique Users:        1,395,886

Events per Purchaser (Quartiles):
  25th percentile:                 1
  50th percentile:                 1
  75th percentile:                 1
  Mean:                          1.0
  Max:                            11

Purchaser Activity Split:
  Repeat Purchasers:                  1,224 (3.75%)
  One-time Purchasers:             31,451 (96.25%)

Events per Repeat Purchaser (Distribution):
event_type
2     1055
3      126
4       25
5       10
7        4
6        3
11       1
Name: count, dtype: int64

Events per Repeat Purchaser (Quartiles):
  25th percentile:                 2
  50th percentile:                 2
  75th percentile:                 2
  Mean:                          2.2
  Max:                            11

--------------------------------------------------

In [None]:
# ============================================================================
# CELL 7: TEMPORAL COVERAGE
# ============================================================================
# Reports purchase volume per calendar month and the overall date span of the
# sample. Adds a `year_month` period column to sample_df.

print("-" * 70)
print("TEMPORAL COVERAGE")
print("-" * 70)

# Tag every event with its calendar month
all_event_times = sample_df['event_time']
sample_df['year_month'] = all_event_times.dt.to_period('M')

# Count purchase events per calendar month
purchase_month = purchases['event_time'].dt.to_period('M')
purchases_by_month = purchases.groupby(purchase_month).size()

print(f"\nPurchases by Month:")
for period, count in purchases_by_month.items():
    print(f"  {period}:  {count:>8,} purchases")

# First and last event in the sample, and the whole days between them
date_min = all_event_times.min()
date_max = all_event_times.max()
days_coverage = (date_max - date_min).days

print("\n".join([
    f"\nData Coverage:",
    f"  Start Date:           {date_min.strftime('%Y-%m-%d')}",
    f"  End Date:             {date_max.strftime('%Y-%m-%d')}",
    f"  Total Days:           {days_coverage} days",
]))

print("\n" + "-" * 70)
print("✓ Temporal coverage complete")
print("-" * 70 + "\n")

----------------------------------------------------------------------
TEMPORAL COVERAGE
----------------------------------------------------------------------

Purchases by Month:
  2019-10:     3,734 purchases
  2019-11:     4,540 purchases
  2019-12:     5,909 purchases
  2020-01:     4,053 purchases
  2020-02:     5,923 purchases
  2020-03:     5,196 purchases
  2020-04:     4,791 purchases

Data Coverage:
  Start Date:           2019-10-01
  End Date:             2020-04-30
  Total Days:           212 days

----------------------------------------------------------------------
✓ Temporal coverage complete
----------------------------------------------------------------------



In [None]:
# ============================================================================
# CELL 7b: TEMPORAL COVERAGE FOR REPEAT PURCHASERS
# ============================================================================
# Monthly purchase volume and date span restricted to repeat purchasers.
# Note: date_min, date_max and days_coverage are rebound here to the
# repeat-purchaser span.

print("-" * 70)
print("TEMPORAL COVERAGE FOR REPEAT PURCHASERS")
print("-" * 70)

# Tag every event with its calendar month
sample_df['year_month'] = sample_df['event_time'].dt.to_period('M')

# Count repeat-purchaser purchases per calendar month
repeater_event_times = repeat_purchasers_df['event_time']
repeater_purchase_month = repeater_event_times.dt.to_period('M')
purchases_per_month_by_repeaters = repeat_purchasers_df.groupby(repeater_purchase_month).size()

print(f"\nPurchases by Month by Repeaters:")
for period, count in purchases_per_month_by_repeaters.items():
    print(f"  {period}:  {count:>8,} purchases")

# First and last repeat-purchaser purchase, and the whole days between them
date_min = repeater_event_times.min()
date_max = repeater_event_times.max()
days_coverage = (date_max - date_min).days

print("\n".join([
    f"\nData Coverage:",
    f"  Start Date:           {date_min.strftime('%Y-%m-%d')}",
    f"  End Date:             {date_max.strftime('%Y-%m-%d')}",
    f"  Total Days:           {days_coverage} days",
]))

print("\n" + "-" * 70)
print("✓ Temporal coverage for repeaters complete")
print("-" * 70 + "\n")

----------------------------------------------------------------------
TEMPORAL COVERAGE FOR REPEAT PURCHASERS
----------------------------------------------------------------------

Purchases by Month by Repeaters:
  2019-10:       245 purchases
  2019-11:       292 purchases
  2019-12:       522 purchases
  2020-01:       412 purchases
  2020-02:       625 purchases
  2020-03:       416 purchases
  2020-04:       183 purchases

Data Coverage:
  Start Date:           2019-10-01
  End Date:             2020-04-30
  Total Days:           212 days

----------------------------------------------------------------------
✓ Temporal coverage for repeaters complete
----------------------------------------------------------------------



In [None]:
# ============================================================================
# CELL 8: REPEAT PURCHASE ANALYSIS
# ============================================================================
# Buckets purchasers by purchase count and selects the users with two or more
# purchases as the population for churn modeling.

print("-" * 70)
print("REPEAT PURCHASE ANALYSIS")
print("-" * 70)

# Purchase count per user, and how many users share each count
purchases_per_user = purchases.groupby('user_id').size()
purchase_distribution = purchases_per_user.value_counts().sort_index()
three_plus_user_count = int((purchases_per_user >= 3).sum())

print("\n".join([
    f"\nPurchases per User Distribution:",
    f"  1 purchase:           {purchase_distribution.get(1, 0):>12,} users",
    f"  2 purchases:          {purchase_distribution.get(2, 0):>12,} users",
    f"  3+ purchases:         {three_plus_user_count:>12,} users",
]))

# Users eligible for churn analysis (2+ purchases). From here on
# `repeat_purchasers` is an Index of user ids.
repeat_purchasers = purchases_per_user.loc[lambda counts: counts >= 2].index
repeat_purchaser_count = len(repeat_purchasers)
repeat_purchaser_pct = (repeat_purchaser_count / unique_purchasers) * 100

print(f"\n" + "=" * 70)
print("CHURN MODELING ELIGIBILITY")
print("=" * 70)
print(f"Total Purchasers:       {unique_purchasers:>12,}")
print(f"Repeat Purchasers:      {repeat_purchaser_count:>12,} ({repeat_purchaser_pct:.2f}%)")
print(f"\n✓ Users available for churn modeling: {repeat_purchaser_count:,}")
print("=" * 70)

# Sample-size verdict: 5000 or more is adequate, 1000-4999 is modest, fewer is a warning
if repeat_purchaser_count >= 5000:
    print(f"\n✓ Sample size adequate for baseline modeling ({repeat_purchaser_count:,} users)")
elif repeat_purchaser_count >= 1000:
    print(f"\n⚠️  Note: Modest sample size ({repeat_purchaser_count:,}). Results may vary.")
else:
    print("\n⚠️  WARNING: Very few repeat purchasers. Model may be unreliable.")

print("\n" + "-" * 70)
print("✓ Repeat purchase analysis complete")
print("-" * 70 + "\n")

----------------------------------------------------------------------
REPEAT PURCHASE ANALYSIS
----------------------------------------------------------------------

Purchases per User Distribution:
  1 purchase:                 31,451 users
  2 purchases:                 1,055 users
  3+ purchases:                  169 users

CHURN MODELING ELIGIBILITY
Total Purchasers:             32,675
Repeat Purchasers:             1,224 (3.75%)

✓ Users available for churn modeling: 1,224

⚠️  Note: Modest sample size (1,224). Results may vary.

----------------------------------------------------------------------
✓ Repeat purchase analysis complete
----------------------------------------------------------------------



STAGE 3: CHURN DEFINITION & LABELING

In [None]:
# ============================================================================
# CELL 9: INTER-PURCHASE INTERVAL CALCULATION
# ============================================================================
# Builds `repeat_purchaser_data` (purchase rows of repeat purchasers, sorted by
# user and time) and `intervals_series`, the whole-day gaps between each user's
# consecutive purchases. `intervals` holds the same values as a plain list.

print("\n" + "=" * 70)
print("STAGE 3: CHURN DEFINITION & LABELING")
print("=" * 70)

print("\n" + "-" * 70)
print("INTER-PURCHASE INTERVAL CALCULATION")
print("-" * 70)

# `repeat_purchasers` is bound twice in this notebook: as an Index of user ids
# (repeat purchase analysis) and as a Series of counts indexed by user id
# (purchaser segmentation). Accept either so out-of-order execution cannot
# filter on the counts by mistake.
repeat_user_ids = (
    repeat_purchasers.index
    if isinstance(repeat_purchasers, pd.Series)
    else repeat_purchasers
)

# Filter to repeat purchasers only, ordered by user and then purchase time
repeat_purchaser_data = (
    purchases[purchases['user_id'].isin(repeat_user_ids)]
    .sort_values(['user_id', 'event_time'])
)

print(f"\nAnalyzing {len(repeat_purchasers):,} repeat purchasers...")

# Gap to the previous purchase of the same user. A grouped diff replaces the
# original per-user boolean filter, which rescanned the whole frame once per
# user. Each user's first purchase has no predecessor (NaT) and is dropped.
purchase_gaps = repeat_purchaser_data.groupby('user_id')['event_time'].diff().dropna()

# Whole days per gap, as an unnamed Series with a fresh 0..n-1 index
intervals_series = pd.Series(purchase_gaps.dt.days.to_numpy())
intervals = intervals_series.tolist()

print(f"✓ Calculated {len(intervals):,} inter-purchase intervals\n")

# Distribution statistics
print("Inter-Purchase Interval Distribution (days):")
print(f"  Mean:                 {intervals_series.mean():>12.1f}")
print(f"  Median:               {intervals_series.median():>12.1f}")
print(f"  Std Dev:              {intervals_series.std():>12.1f}")
print(f"  Min:                  {intervals_series.min():>12.0f}")
print(f"  Max:                  {intervals_series.max():>12.0f}")
print(f"\nPercentiles:")
print(f"  25th:                 {intervals_series.quantile(0.25):>12.1f}")
print(f"  50th (Median):        {intervals_series.quantile(0.50):>12.1f}")
print(f"  75th:                 {intervals_series.quantile(0.75):>12.1f}")
print(f"  90th:                 {intervals_series.quantile(0.90):>12.1f}")

print("\n" + "-" * 70)
print("✓ Inter-purchase interval calculation complete")
print("-" * 70 + "\n")


STAGE 3: CHURN DEFINITION & LABELING

----------------------------------------------------------------------
INTER-PURCHASE INTERVAL CALCULATION
----------------------------------------------------------------------

Analyzing 1,224 repeat purchasers...
✓ Calculated 1,471 inter-purchase intervals

Inter-Purchase Interval Distribution (days):
  Mean:                         27.9
  Median:                       14.0
  Std Dev:                      33.6
  Min:                             0
  Max:                           204

Percentiles:
  25th:                          3.0
  50th (Median):                14.0
  75th:                         41.0
  90th:                         73.0

----------------------------------------------------------------------
✓ Inter-purchase interval calculation complete
----------------------------------------------------------------------



In [None]:
# ============================================================================
# CELL 10: DEFINE CHURN THRESHOLD
# ============================================================================
# Sets `churn_threshold_days` to twice the median inter-purchase interval
# (truncated to whole days) and prints a rough reading of the cycle length.

print("-" * 70)
print("CHURN THRESHOLD DEFINITION")
print("-" * 70)

# Typical gap between consecutive purchases
median_interval = intervals_series.median()

# 2× median rule (from Stage 0 document); int() truncates toward zero
churn_threshold_days = int(2 * median_interval)

print(f"\nMedian Inter-Purchase Interval:  {median_interval:.1f} days")
print(f"Churn Threshold (2× Median):      {churn_threshold_days} days")

print("\n".join([
    f"\n📌 CHURN DEFINITION:",
    f"   A customer is considered CHURNED if they have not made",
    f"   a purchase within {churn_threshold_days} days of their last purchase.",
]))

# Business interpretation: pick the two-line reading matching the cycle length
print(f"\n💡 Business Interpretation:")
if churn_threshold_days >= 60:
    interpretation_lines = (
        f"   Long repurchase cycle ({churn_threshold_days} days)",
        f"   → Likely infrequent purchase categories (durables?)",
    )
elif churn_threshold_days >= 30:
    interpretation_lines = (
        f"   Moderate repurchase cycle ({churn_threshold_days} days)",
        f"   → Typical for mixed product categories",
    )
else:
    interpretation_lines = (
        f"   Very short repurchase cycle ({churn_threshold_days} days)",
        f"   → Likely high-frequency purchase category (consumables?)",
    )
print("\n".join(interpretation_lines))

print("\n" + "-" * 70)
print("✓ Churn threshold defined")
print("-" * 70 + "\n")

----------------------------------------------------------------------
CHURN THRESHOLD DEFINITION
----------------------------------------------------------------------

Median Inter-Purchase Interval:  14.0 days
Churn Threshold (2× Median):      28 days

📌 CHURN DEFINITION:
   A customer is considered CHURNED if they have not made
   a purchase within 28 days of their last purchase.

💡 Business Interpretation:
   Very short repurchase cycle (28 days)
   → Likely high-frequency purchase category (consumables?)

----------------------------------------------------------------------
✓ Churn threshold defined
----------------------------------------------------------------------



In [None]:
# ============================================================================
# CELL 11: CREATE CHURN LABELS
# ============================================================================
# Snapshot labeling. The observation date sits churn_threshold_days before the
# end of the data. For each repeat purchaser with at least one purchase on or
# before that date, the anchor is their last purchase as of the observation
# date; the user is labeled churned (1) when no purchase follows within
# churn_threshold_days of the anchor, otherwise active (0).
#
# Fix: the anchor used to be the user's last purchase over the WHOLE dataset.
# Nothing can follow an overall last purchase, so the "future purchases"
# filter was always empty and every eligible user was labeled churned.
#
# Produces `labels_df` with columns user_id, last_purchase_date, churned.
# Raises ValueError when no user can be labeled.

print("-" * 70)
print("CHURN LABEL CREATION")
print("-" * 70)

churn_window = timedelta(days=churn_threshold_days)

# Define observation cutoff date
# We need enough future data to observe churn
data_end_date = sample_df['event_time'].max()
observation_date = data_end_date - churn_window

print(f"\nData End Date:         {data_end_date.strftime('%Y-%m-%d')}")
print(f"Observation Date:      {observation_date.strftime('%Y-%m-%d')}")
print(f"Churn Threshold:       {churn_threshold_days} days")

# Temporal leakage validation. This holds by construction (observation_date is
# data_end_date minus the threshold); kept as a guard against later edits.
days_in_future = (data_end_date - observation_date).days
print(f"\n🔍 Temporal Validation:")
print(f"   Days available to observe churn: {days_in_future}")
print(f"   Days needed (threshold):         {churn_threshold_days}")

assert days_in_future >= churn_threshold_days, \
    "❌ TEMPORAL LEAK: Insufficient future data to observe churn!"
print(f"   ✓ Sufficient future data available")

# Overall last purchase per repeat purchaser (unchanged meaning; not used for labels)
last_purchase_per_user = repeat_purchaser_data.groupby('user_id')['event_time'].max()

# Anchor: last purchase per user as of the observation date. Since the anchor
# is <= observation_date, anchor + churn_window never runs past the data end,
# so every follow-up window is fully observed.
purchases_to_date = repeat_purchaser_data[repeat_purchaser_data['event_time'] <= observation_date]
eligible_users = purchases_to_date.groupby('user_id')['event_time'].max()

# NOTE(review): repeat-purchaser status is computed over the full dataset, so
# cohort membership itself can depend on purchases made after the observation
# date — confirm this is acceptable for the baseline.

print(f"\n📊 Label Creation:")
print(f"   Total repeat purchasers:         {len(repeat_purchasers):,}")
print(f"   Eligible for labeling:           {len(eligible_users):,}")
print(f"   Excluded (no purchase by cutoff): {len(repeat_purchasers) - len(eligible_users):,}")

if eligible_users.empty:
    raise ValueError("No repeat purchaser has a purchase on or before the observation date; cannot create churn labels.")

# One row per eligible user: user_id, last_purchase_date
labels_df = eligible_users.rename('last_purchase_date').reset_index()

# Attach each user's anchor to all of their purchases, then keep the purchases
# that fall strictly after the anchor and within the churn window.
follow_ups = repeat_purchaser_data[['user_id', 'event_time']].merge(
    labels_df, on='user_id', how='inner'
)
in_churn_window = (
    (follow_ups['event_time'] > follow_ups['last_purchase_date']) &
    (follow_ups['event_time'] <= follow_ups['last_purchase_date'] + churn_window)
)
returning_user_ids = follow_ups.loc[in_churn_window, 'user_id'].unique()

# Label: 1 = Churned, 0 = Active
labels_df['churned'] = (~labels_df['user_id'].isin(returning_user_ids)).astype(int)

# Calculate churn rate
churn_rate = labels_df['churned'].mean() * 100
churned_count = labels_df['churned'].sum()
active_count = len(labels_df) - churned_count

print(f"\n" + "=" * 70)
print("CHURN LABELS SUMMARY")
print("=" * 70)
print(f"Total Users Labeled:    {len(labels_df):>12,}")
print(f"Churned Users:          {churned_count:>12,} ({churn_rate:.2f}%)")
print(f"Active Users:           {active_count:>12,} ({100-churn_rate:.2f}%)")
print("=" * 70)

# Validate churn rate is reasonable (15-35% expected)
if churn_rate < 15:
    print(f"\n⚠️  Note: Churn rate ({churn_rate:.1f}%) is below typical range (15-35%)")
    print(f"    This may indicate threshold is too short or data skew.")
elif churn_rate > 35:
    print(f"\n⚠️  Note: Churn rate ({churn_rate:.1f}%) is above typical range (15-35%)")
    print(f"    This may indicate threshold is too long.")
else:
    print(f"\n✓ Churn rate ({churn_rate:.1f}%) is within expected range (15-35%)")

print("\n" + "-" * 70)
print("✓ Churn labels created successfully")
print("-" * 70 + "\n")