<a href="https://colab.research.google.com/github/nandu26m/Customer-Analytics-Churn-Cohort-Project/blob/main/src/Customer_Analytics_Churn_Cohort_Dataset_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faker pandas

Collecting faker
  Downloading faker-37.4.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.4.0


In [4]:
# Install dependencies
!pip install faker pandas --quiet

import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
import zipfile
from IPython.display import display, FileLink

# One seed value shared by Faker and the stdlib random module.
SEED = 42

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

# Number of rows to generate for each table.
NUM_USERS = 10_000
NUM_SUBS = 15_000
NUM_LOGINS = 50_000
NUM_PAYMENTS = 15_000
NUM_TICKETS = 5_000
NUM_PROMOS = 50

# Identifiers of the available subscription plans.
PLAN_IDS = [1, 2, 3]

# USERS
def _make_user(uid):
    """Build one synthetic user record for the given user id."""
    # The signup date is drawn before the e-mail address, as the remaining
    # fields are drawn in the order they appear in the record.
    joined_on = fake.date_between(start_date='-2y', end_date='today')
    return {
        "user_id": uid,
        "email": fake.unique.email(),
        "signup_date": joined_on,
        "country": fake.country(),
        "language": random.choice(['en', 'de', 'fr', 'es']),
        "device_type": random.choice(['iOS', 'Android', 'Web']),
        "acquisition_channel": random.choice(['organic', 'ads', 'referral']),
    }


# Records are stored in ascending user_id order, starting at 1.
users = [_make_user(uid) for uid in range(1, NUM_USERS + 1)]
df_users = pd.DataFrame(users)
df_users.to_csv("users.csv", index=False)

# SUBSCRIPTION PLANS
# The plan catalogue is fixed: each tuple below is one plan, with values in
# the order given by `_plan_fields`.
_plan_fields = ("plan_id", "plan_name", "plan_type", "price", "duration_months", "is_active")
_plan_rows = [
    (1, "Basic", "monthly", 9.99, 1, True),
    (2, "Pro", "monthly", 19.99, 1, True),
    (3, "Pro Annual", "annual", 199.99, 12, True),
]

# `plans` is a list of dicts ordered by plan_id, so plans[plan_id - 1] is the
# plan with that id.
plans = [dict(zip(_plan_fields, row)) for row in _plan_rows]
df_plans = pd.DataFrame(plans)
df_plans.to_csv("subscription_plans.csv", index=False)

# SUBSCRIPTIONS
# Each row assigns a random plan to a random user. The start date is drawn
# between that user's signup date and today, so a subscription never predates
# the account it belongs to.
subscriptions = []
for sub_id in range(1, NUM_SUBS + 1):
    user_id = random.randint(1, NUM_USERS)
    plan_id = random.choice(PLAN_IDS)
    plan = plans[plan_id - 1]  # `plans` is ordered by plan_id, starting at 1
    signup_date = users[user_id - 1]["signup_date"]  # `users` is ordered by user_id, starting at 1
    start_date = fake.date_between(start_date=signup_date, end_date='today')
    # Plan length is approximated as 30 days per month of duration.
    expiry_date = start_date + timedelta(days=plan["duration_months"] * 30)
    subscriptions.append({
        "subscription_id": sub_id,
        "user_id": user_id,
        "plan_id": plan_id,
        "subscription_date": start_date,
        "expiry_date": expiry_date,
        "amount": plan["price"],
        "is_renewal": random.choice([True, False]),
        "status": random.choice(['active', 'canceled', 'expired'])
    })
df_subs = pd.DataFrame(subscriptions)
df_subs.to_csv("subscriptions.csv", index=False)

# PAYMENTS
def _generate_payments(count):
    """Yield `count` payment records, each tied to a randomly picked subscription."""
    for pid in range(1, count + 1):
        billed = random.choice(subscriptions)
        yield {
            "payment_id": pid,
            "user_id": billed["user_id"],
            "subscription_id": billed["subscription_id"],
            # The payment is dated on the day the subscription started.
            "payment_date": billed["subscription_date"],
            "amount": billed["amount"],
            "payment_method": random.choice(['credit_card', 'paypal']),
            "status": random.choice(['success', 'failed', 'refunded']),
        }


payments = list(_generate_payments(NUM_PAYMENTS))
df_payments = pd.DataFrame(payments)
df_payments.to_csv("payments.csv", index=False)

# CANCELLATIONS
# One cancellation record per subscription whose status is "canceled",
# numbered from 1 in subscription order.
_canceled_subs = (s for s in subscriptions if s["status"] == "canceled")
cancellations = [
    {
        "cancel_id": seq,
        "user_id": s["user_id"],
        "subscription_id": s["subscription_id"],
        # The cancellation is dated on the subscription's expiry date.
        "cancel_date": s["expiry_date"],
        "cancel_reason": fake.sentence(),
        "churn_type": random.choice(['voluntary', 'involuntary']),
    }
    for seq, s in enumerate(_canceled_subs, start=1)
]
df_cancellations = pd.DataFrame(cancellations)
df_cancellations.to_csv("cancellations.csv", index=False)

# USER LOGINS
# Each login belongs to a random user and is timestamped between that user's
# signup date and now, so no login happens before the account exists.
logins = []
for login_id in range(1, NUM_LOGINS + 1):
    user_id = random.randint(1, NUM_USERS)
    signup_date = users[user_id - 1]["signup_date"]  # `users` is ordered by user_id, starting at 1
    logins.append({
        "login_id": login_id,
        "user_id": user_id,
        "login_timestamp": fake.date_time_between(start_date=signup_date, end_date='now'),
        "device_type": random.choice(['iOS', 'Android', 'Web']),
        "ip_address": fake.ipv4()
    })
df_logins = pd.DataFrame(logins)
df_logins.to_csv("user_logins.csv", index=False)

# SUPPORT TICKETS
# Each ticket belongs to a random user and is created between that user's
# signup date and now. Only closed tickets get a resolution time; open
# tickets leave `resolved_at` empty.
tickets = []
for ticket_id in range(1, NUM_TICKETS + 1):
    user_id = random.randint(1, NUM_USERS)
    signup_date = users[user_id - 1]["signup_date"]  # `users` is ordered by user_id, starting at 1
    created_at = fake.date_time_between(start_date=signup_date, end_date='now')
    status = random.choice(['open', 'closed'])
    if status == 'closed':
        # Closed tickets are resolved 1 to 10 days after creation.
        resolved_at = created_at + timedelta(days=random.randint(1, 10))
    else:
        resolved_at = None
    tickets.append({
        "ticket_id": ticket_id,
        "user_id": user_id,
        "created_at": created_at,
        "resolved_at": resolved_at,
        "status": status,
        "category": random.choice(['billing', 'technical']),
        "feedback": fake.sentence()
    })
df_tickets = pd.DataFrame(tickets)
df_tickets.to_csv("support_tickets.csv", index=False)

# PROMOTIONS
# Each promotion starts within the last year, runs for 30 to 180 days, and
# applies to one or two distinct plans.
promos = []
for pid in range(1, NUM_PROMOS + 1):
    start_date = fake.date_between(start_date='-1y', end_date='today')
    end_date = start_date + timedelta(days=random.randint(30, 180))
    promos.append({
        "promo_id": pid,
        "promo_code": fake.unique.bothify(text='PROMO-####'),
        "discount_percent": random.choice([10, 15, 20, 25, 30]),
        "start_date": start_date,
        "end_date": end_date,
        # random.sample draws without replacement, so the same plan id cannot
        # be listed twice for one promotion.
        "applicable_plan_ids": sorted(random.sample(PLAN_IDS, k=random.randint(1, 2)))
    })
df_promos = pd.DataFrame(promos)
df_promos.to_csv("promotions.csv", index=False)

# PROMO APPLICATIONS
# Generates NUM_PROMOS * 20 rows, each linking a random promotion to a random
# subscription. The user is taken from the chosen subscription so that
# `user_id` and `subscription_id` always refer to the same customer.
promo_apps = []
for i in range(1, NUM_PROMOS * 20 + 1):  # ids run 1..NUM_PROMOS * 20 inclusive
    sub = random.choice(subscriptions)
    promo_id = random.randint(1, NUM_PROMOS)
    promo_apps.append({
        "id": i,
        "user_id": sub["user_id"],
        "subscription_id": sub["subscription_id"],
        "promo_id": promo_id,
        # The promotion is recorded as applied on the subscription start date.
        "applied_date": sub["subscription_date"]
    })
df_subpromo = pd.DataFrame(promo_apps)
df_subpromo.to_csv("subscription_promotions.csv", index=False)

# USER LTV
# One row per user with a randomly drawn total revenue and lifetime; the
# monthly average is derived from the already-rounded total.
ltv_data = []
for uid in (u["user_id"] for u in users):
    lifetime_revenue = round(random.uniform(10, 500), 2)
    active_months = random.randint(1, 24)
    ltv_data.append(dict(
        user_id=uid,
        total_revenue=lifetime_revenue,
        avg_monthly_revenue=round(lifetime_revenue / active_months, 2),
        lifetime_months=active_months,
    ))
df_ltv = pd.DataFrame(ltv_data)
df_ltv.to_csv("user_ltv.csv", index=False)

# RETENTION SNAPSHOTS
def _shift_month(month_start, months):
    """Return the first day of the month `months` months after `month_start`."""
    # Work in whole months so that month lengths (28-31 days) cannot cause a
    # month to be repeated or skipped.
    total = month_start.year * 12 + (month_start.month - 1) + months
    return month_start.replace(year=total // 12, month=total % 12 + 1, day=1)


snapshots = []
start = datetime.now() - timedelta(days=720)
first_cohort = start.date().replace(day=1)
for i in range(24):  # 24 consecutive monthly cohorts (2 years)
    cohort = _shift_month(first_cohort, i)
    # A cohort's size is drawn once and reused for all of its snapshots.
    cohort_size = random.randint(100, 500)
    for j in range(6):  # cohort month plus the 5 following months
        snap = _shift_month(cohort, j)
        retained = int(cohort_size * random.uniform(0.2, 0.8))
        snapshots.append({
            "cohort_month": cohort,
            "snapshot_month": snap,
            "cohort_size": cohort_size,
            "retained_users": retained
        })
df_snapshots = pd.DataFrame(snapshots)
df_snapshots.to_csv("retention_snapshots.csv", index=False)

print("✅ All CSVs generated successfully!")

# List all generated CSV files
csv_files = [
    "users.csv", "subscription_plans.csv", "subscriptions.csv", "payments.csv",
    "cancellations.csv", "user_logins.csv", "support_tickets.csv", "promotions.csv",
    "subscription_promotions.csv", "user_ltv.csv", "retention_snapshots.csv"
]

print("\nGenerated CSV files:")
for f in csv_files:
    print(f"- {f}")

# Zip all files into one archive. ZipFile stores members uncompressed by
# default, so DEFLATE compression is requested explicitly.
zip_filename = "churn_project_data.zip"
with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for f in csv_files:
        zipf.write(f)

print(f"\n📦 All CSVs zipped into '{zip_filename}'")

# Provide download link
display(FileLink(zip_filename))


✅ All CSVs generated successfully!

Generated CSV files:
- users.csv
- subscription_plans.csv
- subscriptions.csv
- payments.csv
- cancellations.csv
- user_logins.csv
- support_tickets.csv
- promotions.csv
- subscription_promotions.csv
- user_ltv.csv
- retention_snapshots.csv

📦 All CSVs zipped into 'churn_project_data.zip'


In [5]:
# Trigger a browser download of the archive when running on Google Colab.
# The `google.colab` package is only importable on Colab, so elsewhere this
# cell reports where the archive is instead of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"Not running on Google Colab; '{zip_filename}' is in the current working directory.")
else:
    files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>