<a href="https://colab.research.google.com/github/mariamcs/Customer_Churn/blob/main/Customer_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Creation Step**

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, when, round as spark_round

# Create Spark session
spark = SparkSession.builder.appName("SimulatedNetflixChurn").getOrCreate()

# Number of rows
n = 100000

# Generate DataFrame
df = (
    spark.range(0, n)
    .withColumn("daily_watch_minutes", spark_round(rand() * 300, 1))
    .withColumn("avg_session_length", spark_round(rand() * 90 + 10, 1))
    .withColumn("last_login_days_ago", (rand() * 60).cast("int"))
    .withColumn("binge_sessions_last_30d", (rand() * 10).cast("int"))
    .withColumn("completion_rate", spark_round(rand(), 2))

    .withColumn("plan_type", when(rand() < 0.6, "Standard").when(rand() < 0.85, "Premium").otherwise("Basic"))
    .withColumn("tenure_months", (rand() * 48).cast("int"))
    .withColumn("price_per_hour_watched", spark_round(rand() * 0.5 + 0.2, 2))
    .withColumn("billing_failures_last_90d", (rand() * 3).cast("int"))
    .withColumn("upgrades_last_6mo", (rand() * 2).cast("int"))

    .withColumn("has_kids_profile", when(rand() < 0.3, 1).otherwise(0))
    .withColumn("uses_download_feature", when(rand() < 0.5, 1).otherwise(0))
    .withColumn("simultaneous_streams_used", (rand() * 4 + 1).cast("int"))
    .withColumn("primary_device_type", when(rand() < 0.4, "Smart TV").when(rand() < 0.7, "Mobile").otherwise("Laptop"))
    .withColumn("geo_consistency_score", spark_round(rand(), 2))

    .withColumn("support_tickets_last_6mo", (rand() * 5).cast("int"))
    .withColumn("cancel_reason_code", when(rand() < 0.1, "Pricing").when(rand() < 0.2, "Content").when(rand() < 0.3, "Tech Issues").otherwise("None"))
    .withColumn("issue_resolution_time_avg", spark_round(rand() * 48, 1))

    .withColumn("churned", when(rand() < 0.2, 1).otherwise(0))
)

df.show(5)


+---+-------------------+------------------+-------------------+-----------------------+---------------+---------+-------------+----------------------+-------------------------+-----------------+----------------+---------------------+-------------------------+-------------------+---------------------+------------------------+------------------+-------------------------+-------+
| id|daily_watch_minutes|avg_session_length|last_login_days_ago|binge_sessions_last_30d|completion_rate|plan_type|tenure_months|price_per_hour_watched|billing_failures_last_90d|upgrades_last_6mo|has_kids_profile|uses_download_feature|simultaneous_streams_used|primary_device_type|geo_consistency_score|support_tickets_last_6mo|cancel_reason_code|issue_resolution_time_avg|churned|
+---+-------------------+------------------+-------------------+-----------------------+---------------+---------+-------------+----------------------+-------------------------+-----------------+----------------+---------------------+----

# **Data Cleaning Step**

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, rand
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
# Data Cleaning
df_clean = df.na.drop()

# Categorical feature encoding
categorical_cols = ["plan_type", "primary_device_type", "cancel_reason_code"]
indexers = [StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid="keep") for c in categorical_cols]
encoders = [OneHotEncoder(inputCol=c+"_idx", outputCol=c+"_vec") for c in categorical_cols]

# Assemble features
numeric_cols = [col for col in df.columns if col not in categorical_cols + ["churned"]]
all_features = numeric_cols + [c + "_vec" for c in categorical_cols]
assembler = VectorAssembler(inputCols=all_features, outputCol="features")

# Pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler])
model = pipeline.fit(df_clean)
df_ready = model.transform(df_clean)

# Final output
df_ready.select("features", "churned").show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------+-------+
|features                                                                                                                |churned|
+------------------------------------------------------------------------------------------------------------------------+-------+
|[0.0,9.8,81.6,56.0,6.0,0.14,38.0,0.56,2.0,1.0,1.0,0.0,4.0,0.25,1.0,34.2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0]        |0      |
|(26,[0,1,2,3,4,5,7,8,9,12,13,14,15,16,21,24],[1.0,141.9,67.1,56.0,2.0,0.24,0.31,1.0,1.0,4.0,0.15,2.0,39.3,1.0,1.0,1.0]) |0      |
|(26,[0,1,2,3,4,5,6,7,8,12,13,14,15,17,20,22],[2.0,58.8,76.9,54.0,8.0,0.45,12.0,0.32,2.0,3.0,0.77,2.0,41.0,1.0,1.0,1.0]) |0      |
|(26,[0,1,2,3,4,5,6,7,8,11,12,13,15,18,20,22],[3.0,189.8,64.3,30.0,4.0,0.45,45.0,0.67,2.0,1.0,4.0,0.94,14.5,1.0,1.0,1.0])|1      |
|[4.0,107.1,48.2,52.0,5.0,0.35,5.0,0.41,1.0,1.0,0.0,1.0,3.0,0.5,3.0,42.5,0.0,1.0,0.