In [3]:
pip install pandas numpy faker jupyter

Collecting faker
  Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)
Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook->jupyter)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook->jupyter)
  Downloading isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook->jupyter)
  Downloading uri_template-1.3.0-py3-none-any.whl.metadata (8.8 kB)
Collecting webcolors>=1.11 (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook->jupyter)
  Downloading webcolors-24.11.1-py3-none-any.whl.metadata (2.2 kB)
Downloading faker-37.8.0-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------

In [1]:
# --- 1. Import Necessary Libraries ---
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime

# --- 2. Initialize and Configure ---
# Fixed seed so that the random draws below are repeatable from run to run.
SEED = 42

# Set the number of subscribers you want to generate
num_users = 10000

# Categorical values sampled uniformly for every subscriber
COUNTRIES = ['Singapore', 'Malaysia', 'Indonesia', 'Thailand', 'Philippines', 'Vietnam']
PLAN_TYPES = ['Basic', 'Standard', 'Premium']

# Weekly watch hours are drawn from Normal(mean, std); a negative draw is
# replaced by MIN_WATCH_HOURS.
WATCH_HOURS_MEAN = 10
WATCH_HOURS_STD = 5
MIN_WATCH_HOURS = 0.5

# Destination of the generated dataset (relative to the working directory)
OUTPUT_PATH = 'disney_plus_sea_subscribers.csv'

# Initialize the Faker library to generate fake data
fake = Faker()

# Three separate random sources are used below (stdlib `random`, NumPy's
# global generator, and Faker), so each one is seeded explicitly.
# NOTE: subscription dates are drawn relative to "today", so the dates and the
# tenure derived from them still depend on the day the notebook is run.
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)


def _months_between(start, end):
    """Return the calendar-month difference between two dates, floored at 1.

    Only the year and month components are compared (the day of the month is
    ignored). A result of zero or less is reported as 1 so that brand-new
    sign-ups still have a tenure of one month.
    """
    months = (end.year - start.year) * 12 + (end.month - start.month)
    return max(months, 1)


def _churn_probability(tenure_months, watch_hours_per_week, plan_type):
    """Return the churn probability for one subscriber from the rules below.

    Starts from a base of 0.1 and adds a penalty for short tenure, for low
    weekly engagement, and for being on the 'Basic' plan.
    """
    # Base churn probability for a loyal user
    probability = 0.1

    # Rule 1: Tenure is the biggest factor
    if tenure_months < 3:
        probability += 0.4   # High risk for new users
    elif tenure_months < 12:
        probability += 0.15  # Medium risk for users under a year

    # Rule 2: Low engagement indicates churn risk
    if watch_hours_per_week < 5:
        probability += 0.25

    # Rule 3: Basic plan users may be slightly less committed
    if plan_type == 'Basic':
        probability += 0.05

    return probability


print(f"Starting data generation for {num_users} subscribers...")

# Take "now" once so every subscriber's tenure is measured against the same
# reference point (and the clock is not read repeatedly inside the loop).
reference_date = datetime.now()

# Create an empty list to store each user's data
data = []

# --- 3. The Main Generation Loop ---
# This loop runs once for each subscriber we want to create
for _ in range(num_users):

    # --- A. Basic Subscriber Attributes ---
    country = random.choice(COUNTRIES)
    plan_type = random.choice(PLAN_TYPES)
    age = random.randint(18, 65)
    subscription_date = fake.date_between(start_date='-2y', end_date='today')

    # --- B. Feature Engineering: Calculate Tenure ---
    tenure_months = _months_between(subscription_date, reference_date)

    # --- C. Simulating User Engagement ---
    # The sign check is done on the raw draw, before rounding: a small negative
    # value would otherwise round to -0.0, slip past a "< 0" test and end up in
    # the CSV as "-0.0".
    raw_watch_hours = np.random.normal(loc=WATCH_HOURS_MEAN, scale=WATCH_HOURS_STD)
    if raw_watch_hours < 0:
        watch_hours_per_week = MIN_WATCH_HOURS
    else:
        watch_hours_per_week = round(raw_watch_hours, 1)

    # --- D. Simulating Churn with Business Logic ---
    churn_probability = _churn_probability(tenure_months, watch_hours_per_week, plan_type)

    # Decide if the user churns based on the final probability
    # 1 = Churned, 0 = Did Not Churn
    churn = 1 if random.random() < churn_probability else 0

    # --- E. Assemble the Data for One User ---
    # Add the generated user's data as a dictionary to our list
    data.append({
        'user_id': fake.uuid4(),
        'age': age,
        'country': country,
        'plan_type': plan_type,
        'subscription_date': subscription_date,
        'tenure_months': tenure_months,
        'watch_hours_per_week': watch_hours_per_week,
        'churn': churn
    })

# --- 4. Final Conversion and Export ---
# Convert the list of user data into a pandas DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# index=False prevents pandas from saving the row numbers as a column
df.to_csv(OUTPUT_PATH, index=False)

print("\nDataset generation complete!")
print(f"File saved as '{OUTPUT_PATH}'")
print("\n--- Data Preview ---")
print(df.head())

Starting data generation for 10000 subscribers...

Dataset generation complete!
File saved as 'disney_plus_sea_subscribers.csv'

--- Data Preview ---
                                user_id  age    country plan_type  \
0  4fd7c071-b749-42b5-b624-322dc97c457f   27  Singapore     Basic   
1  b3e5b494-b852-4164-b870-58a874d91eb2   53   Thailand   Premium   
2  e215f0a5-ab4b-4353-84fc-8a801f7b12ca   29  Singapore  Standard   
3  fdac7b34-3532-416a-8cb4-3bab2b96c88a   27  Singapore   Premium   
4  31430913-97d1-4fe2-b8af-86675a458c50   38   Malaysia     Basic   

  subscription_date  tenure_months  watch_hours_per_week  churn  
0        2024-04-16             17                   6.5      0  
1        2023-11-02             22                  12.0      0  
2        2025-06-06              3                  15.6      0  
3        2024-07-11             14                   8.1      0  
4        2024-12-06              9                  11.4      1  
