<a href="https://colab.research.google.com/github/mariamcs/Customer_Churn/blob/main/Create_data_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 📌 Netflix Churn Prediction with XGBoost + SHAP
# There’s no publicly available Netflix user-level churn dataset with detailed behavioral,
# billing, and engagement features due to privacy and business confidentiality.

# Create the dataset
import pandas as pd
import numpy as np

np.random.seed(42)

n = 5000  # Number of users

# Simulate subscription and billing data
data = {
    "user_id": np.arange(1, n+1),
    "tenure_months": np.random.randint(1, 48, n),
    "plan_type": np.random.choice(["Basic", "Standard", "Premium"], n, p=[0.3, 0.5, 0.2]),
    "billing_failures_last_90d": np.random.poisson(0.2, n),
    "upgrades_last_6mo": np.random.poisson(0.5, n),
    "price_per_month": np.random.choice([8.99, 13.99, 17.99], n, p=[0.3, 0.5, 0.2]),
}

# Engagement features
data.update({
    "daily_watch_minutes": np.random.normal(90, 30, n).clip(0),
    "avg_session_length": np.random.normal(35, 10, n).clip(5),
    "last_login_days_ago": np.random.exponential(10, n).astype(int),
    "binge_sessions_last_30d": np.random.poisson(4, n),
    "completion_rate": np.random.beta(2, 5, n),  # 0 to 1
})

# Device and usage
data.update({
    "has_kids_profile": np.random.choice([0, 1], n, p=[0.7, 0.3]),
    "uses_download_feature": np.random.choice([0, 1], n, p=[0.6, 0.4]),
    "primary_device_type": np.random.choice(["SmartTV", "Mobile", "Browser", "Tablet"], n),
    "geo_consistency_score": np.random.uniform(0.5, 1.0, n),
})

# Support + Satisfaction
data.update({
    "support_tickets_last_6mo": np.random.poisson(0.3, n),
    "issue_resolution_time_avg": np.random.normal(1.5, 0.5, n).clip(0.5, 5),
    "csat_score": np.random.normal(3.8, 0.7, n).clip(1, 5),  # customer satisfaction
})

# Target: churn label (simulate based on known churn patterns)
df = pd.DataFrame(data)

# Build churn probabilities
df["churn_score"] = (
    0.3 * (df["last_login_days_ago"] > 14).astype(int) +
    0.2 * (df["billing_failures_last_90d"] > 0).astype(int) +
    0.2 * (df["completion_rate"] < 0.3).astype(int) +
    0.2 * (df["support_tickets_last_6mo"] > 1).astype(int) +
    0.1 * (df["tenure_months"] < 6).astype(int)
)

df["churned"] = df["churn_score"].apply(lambda x: np.random.binomial(1, min(x, 0.8)))
df.drop(columns="churn_score", inplace=True)

df.head()



Unnamed: 0,user_id,tenure_months,plan_type,billing_failures_last_90d,upgrades_last_6mo,price_per_month,daily_watch_minutes,avg_session_length,last_login_days_ago,binge_sessions_last_30d,completion_rate,has_kids_profile,uses_download_feature,primary_device_type,geo_consistency_score,support_tickets_last_6mo,issue_resolution_time_avg,csat_score,churned
0,1,39,Standard,0,0,13.99,137.882679,57.057827,15,3,0.231643,0,0,Tablet,0.817104,0,2.118714,3.398605,1
1,2,29,Premium,0,1,13.99,73.664156,40.66445,6,11,0.091527,0,0,SmartTV,0.57713,0,1.062175,4.149573,0
2,3,15,Standard,0,0,17.99,60.035542,35.249235,3,7,0.332622,1,0,Tablet,0.900623,0,1.946138,4.767912,0
3,4,43,Standard,2,2,8.99,52.735399,43.199008,0,3,0.449748,0,1,Mobile,0.937715,0,0.665964,3.015475,0
4,5,8,Standard,0,0,13.99,137.793187,30.193196,7,1,0.352056,0,1,Tablet,0.760536,0,0.740912,3.884723,0
