In [61]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Reproducibility: the simulation draws from both the stdlib `random` module
# and NumPy's legacy global generator, so both are seeded here. Re-running this
# cell resets both streams, which keeps the cell idempotent.
# NOTE: `end_date` below still tracks the current day, so the generated data
# will differ when the notebook is run on a different date.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Total subscribers to simulate
num_subscribers = 30000

# Initial tier distribution: a pool that each user's starting tier is drawn
# from uniformly, so the counts act as weights (20000 : 7000 : 3000).
initial_tiers = ["Basic"] * 20000 + ["Standard"] * 7000 + ["Premium"] * 3000

# Tier pricing (monthly price per tier)
price_mapping = {"Basic": 9.99, "Standard": 14.99, "Premium": 19.99}

# Simulation period: first subscriptions start somewhere between
# start_date_range and end_date; no period starts on or after end_date.
end_date = datetime.today()
start_date_range = datetime(2018, 1, 1)


# Churn rates per tier, increasing over time (stepped up every 6 months, capped per tier)
def get_churn_rate(tier, month):
    """Return the per-period churn probability for a tier at a given month.

    The rate starts at a tier-specific base, grows by a fixed increment for
    every full 6-month step contained in ``month`` (``month // 6``), and is
    clamped to a tier-specific ceiling.

    Args:
        tier: Tier name; "Basic", "Standard" and "Premium" have schedules.
        month: Number of elapsed months used to pick the 6-month step.

    Returns:
        The churn probability as a float. Any tier name without a schedule
        gets a flat 0.05.
    """
    # (tier name, base rate, increment per 6-month step, ceiling)
    schedules = (
        ("Basic", 0.12, 0.02, 0.2),
        ("Standard", 0.08, 0.015, 0.15),
        ("Premium", 0.06, 0.01, 0.1),
    )
    for name, base, increment, ceiling in schedules:
        if tier == name:
            return min(base + (month // 6) * increment, ceiling)
    # Fallback for unrecognised tiers
    return 0.05

def generate_subscriber_data():
    """Simulate monthly subscription periods for every user.

    For each user id in ``1..num_subscribers`` a starting tier is drawn from
    ``initial_tiers`` and a first start date is drawn uniformly between
    ``start_date_range`` and ``end_date``. The user then accumulates 30-day
    periods until either a period would start on/after ``end_date`` or the
    user churns permanently.

    After each period:
      * with probability ``get_churn_rate(tier, elapsed_months)`` the user
        churns; half of churned users never return, the rest come back after
        a gap of 14-120 days;
      * otherwise the next period starts exactly when the current one ends;
      * independently, with probability 0.10 the tier is re-drawn from the
        three tiers (the draw may land on the same tier again).

    Every period that was actually started is recorded, including the last
    one before a permanent churn, so a user who leaves for good during their
    first period still contributes one row.

    Returns:
        list[dict]: one dict per period with keys ``user_id``, ``sub_id``,
        ``start``, ``end`` (both ``YYYY-MM-DD`` strings), ``tier``, ``price``
        and ``duration`` (always 1).
    """
    data = []
    # Loop-invariant: width of the window first subscriptions may start in.
    signup_window_days = (end_date - start_date_range).days

    for user_id in range(1, num_subscribers + 1):
        sub_id_counter = 1
        tier = random.choice(initial_tiers)
        current_start = start_date_range + timedelta(days=random.randint(0, signup_window_days))

        while current_start < end_date:
            # "Month" is the number of whole 30-day blocks since start_date_range.
            churn_rate = get_churn_rate(tier, (current_start - start_date_range).days // 30)
            subscription_duration = 1  # Monthly only
            subscription_end_date = current_start + timedelta(days=subscription_duration * 30)

            # Record the period before deciding on churn: the user was
            # subscribed for it regardless of what happens afterwards.
            sub_id = f"{tier.upper()}-{user_id:04d}-{sub_id_counter}"
            sub_id_counter += 1

            data.append({
                'user_id': user_id,
                'sub_id': sub_id,
                'start': current_start.strftime('%Y-%m-%d'),
                'end': subscription_end_date.strftime('%Y-%m-%d'),
                'tier': tier,
                'price': price_mapping[tier],
                'duration': subscription_duration
            })

            # Churn decision
            churned = np.random.rand() < churn_rate
            if churned and np.random.rand() < 0.50:  # 50% of churned users never return
                break
            if churned:
                churn_days = random.randint(14, 120)  # If returning, then 14 days minimum churn period
                next_start = subscription_end_date + timedelta(days=churn_days)
            else:
                next_start = subscription_end_date  # Immediate renewal if not churned

            # Tier change logic
            if np.random.rand() < 0.10:  # 10% chance of re-drawing the tier
                tier = random.choice(["Basic", "Standard", "Premium"])

            current_start = next_start  # Ensure no overlapping subscriptions

    return data

# Run the simulation; `data` is a list of per-period record dicts.
data = generate_subscriber_data()

In [63]:

# Assemble the simulated records into a table (one row per record)
df = pd.DataFrame(data)

# Persist to disk, omitting the positional index column
output_csv = "subscription_raw_data_with_gaps.csv"
df.to_csv(output_csv, index=False)

# Preview the first 20 rows
df.head(20)




Unnamed: 0,user_id,sub_id,start,end,tier,price,duration
0,1,BASIC-0001-1,2018-03-07,2018-04-06,Basic,9.99,1
1,1,BASIC-0001-2,2018-04-06,2018-05-06,Basic,9.99,1
2,2,BASIC-0002-1,2020-08-14,2020-09-13,Basic,9.99,1
3,2,BASIC-0002-2,2020-09-13,2020-10-13,Basic,9.99,1
4,3,BASIC-0003-1,2020-03-09,2020-04-08,Basic,9.99,1
5,3,BASIC-0003-2,2020-04-08,2020-05-08,Basic,9.99,1
6,3,BASIC-0003-3,2020-05-08,2020-06-07,Basic,9.99,1
7,3,BASIC-0003-4,2020-06-07,2020-07-07,Basic,9.99,1
8,3,BASIC-0003-5,2020-09-22,2020-10-22,Basic,9.99,1
9,3,BASIC-0003-6,2020-10-22,2020-11-21,Basic,9.99,1
