In [1]:
import pandas as pd
import numpy as np

# Build an enriched copy of the training data: a few columns are derived from
# real input columns, the rest are *simulated* with a seeded random generator
# so that the saved CSV is identical from run to run.

MAX_ROWS = 10_000  # Upper bound on the number of rows kept from the input
RANDOM_SEED = 42   # Shared by the row sample and the simulated columns


def add_engineered_features(df, rng):
    """Return a copy of *df* with derived and simulated feature columns added.

    Args:
        df: DataFrame containing at least the columns ``ViewingHoursPerWeek``,
            ``ContentDownloadsPerMonth``, ``SupportTicketsPerMonth``,
            ``SubscriptionType`` and ``PaymentMethod``.
        rng: A ``numpy.random.Generator`` used for every simulated column.
            Passing generators built from the same seed yields identical
            output.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: If one of the required input columns is missing.
    """
    out = df.copy()
    n = len(out)

    # --- Features derived from real columns -------------------------------
    # NOTE: despite the column name this is weekly hours / 7, i.e. a per-day
    # figure. The name is kept so downstream consumers keep working.
    out["Avg_Weekly_Viewing_Hours"] = out["ViewingHoursPerWeek"] / 7
    # Missing values compare False and therefore map to 0.
    out["High_Content_Downloads"] = (out["ContentDownloadsPerMonth"] > 10).astype(int)
    # Monthly tickets / 4, i.e. an approximate per-week figure.
    out["Avg_Support_Tickets"] = out["SupportTicketsPerMonth"] / 4
    # Case-sensitive substring test; astype(str) + na=False keep missing or
    # non-string subscription types from raising (they map to 0).
    out["Prime_Membership"] = (
        out["SubscriptionType"]
        .astype(str)
        .str.contains("Prime", regex=False, na=False)
        .astype(int)
    )

    # --- Simulated customer-behaviour columns -----------------------------
    # Only credit-card payers can have late payments (0-2); everyone else 0.
    out["Late_Payments"] = np.where(
        out["PaymentMethod"] == "Credit Card", rng.integers(0, 3, size=n), 0
    )
    out["Subscription_Cancelled"] = rng.choice([0, 1], size=n, p=[0.85, 0.15])  # 15% churn simulation
    out["Negative_Feedback"] = rng.integers(0, 10, size=n)  # Fake feedback counts
    out["Support_Interactions"] = out["SupportTicketsPerMonth"] + rng.integers(0, 3, size=n)
    out["Avg_Resolution_Time"] = rng.integers(1, 24, size=n)  # Resolution time in hours
    out["Email_CTR"] = rng.uniform(0.01, 0.2, size=n)  # Click-through rate

    out["Search_Frequency"] = rng.integers(0, 15, size=n)  # Number of searches per week
    out["Exit_Without_Watching"] = rng.integers(0, 10, size=n)  # Number of exits without watching
    out["Recommendation_Engagement"] = rng.choice([0, 1], size=n, p=[0.6, 0.4])  # 40% engage

    # --- Binge-watching pattern -------------------------------------------
    # The previous formula divided weekly hours by (weekly hours / 7), which
    # is always 7 and flagged every active user. Assumed intent: flag users
    # who watch more than 1.5x the dataset-wide average.
    weekly_hours = out["ViewingHoursPerWeek"]
    out["Binge_Watcher"] = (weekly_hours > 1.5 * weekly_hours.mean()).astype(int)

    # --- Skipped content & abandoned shows --------------------------------
    out["Abandoned_Shows"] = rng.integers(0, 5, size=n)  # Shows started but not completed
    out["Skipped_Content_Percentage"] = rng.uniform(0, 50, size=n)  # % of content skipped

    # --- Ad interaction (for ad-supported plans) --------------------------
    out["Ad_Skipped"] = rng.integers(0, 5, size=n)  # Number of ads skipped
    out["Ad_Engagement"] = rng.choice([0, 1], size=n, p=[0.7, 0.3])  # 30% interact with ads

    # --- Financial & pricing factors --------------------------------------
    out["Refund_Requests"] = rng.integers(0, 3, size=n)  # Number of refund requests
    out["Downgraded_Plan"] = rng.choice([0, 1], size=n, p=[0.85, 0.15])  # 15% downgraded
    out["Auto_Renewal_Off"] = rng.choice([0, 1], size=n, p=[0.8, 0.2])  # 20% auto-renewal off

    # --- Competitor influence & external factors --------------------------
    out["Uses_Competitor_Platforms"] = rng.choice([0, 1], size=n, p=[0.7, 0.3])  # 30% use competitors
    out["Internet_Speed_MBPS"] = rng.integers(5, 200, size=n)  # Internet speed in Mbps
    out["Primary_Device"] = rng.choice(
        ["Mobile", "Laptop", "Smart TV", "Tablet"], size=n
    )  # Primary streaming device

    return out


# Notebook cells and scripts both run with __name__ == "__main__"; the guard
# only keeps the file I/O from firing when the helper above is imported.
if __name__ == "__main__":
    # Load the original dataset
    df = pd.read_csv("../data/train.csv")  # Change the filename accordingly

    # Cap the dataset at MAX_ROWS rows (smaller datasets are kept whole)
    if len(df) > MAX_ROWS:
        df = df.sample(n=MAX_ROWS, random_state=RANDOM_SEED)

    # Feature engineering
    df = add_engineered_features(df, np.random.default_rng(RANDOM_SEED))

    # Drop unnecessary columns
    df = df.drop(columns=["CustomerID"], errors="ignore")

    # Save the merged dataset
    df.to_csv("../data/merged_train.csv", index=False)

    print(f"Merged dataset saved successfully as 'merged_train.csv' with {len(df)} rows.")

Merged dataset saved successfully as 'merged_train.csv' with 10000 rows.
