In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 2.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=53b2e8f64b8d239296355a9ee42f7281f289dab72c665707af5b68ab634c3132
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [60]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.session import SparkSession


In [52]:
# Start (or reuse) the Spark session that the rest of the notebook works through
spark = (
    SparkSession.builder
    .appName("TelecomChurnPrediction")
    .getOrCreate()
)

In [61]:
# Read the telecom CSV into a DataFrame: the first row supplies the column
# names and the column types are inferred from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("telecom_dataset.csv")
)

In [None]:
# Convert string columns to numerical categories.
# handleInvalid="keep" puts null/unseen categories into an extra bucket instead
# of raising, which is acceptable for feature columns.
# (Churn_index is still produced so existing references to it keep working.)
categorical_columns = ['Gender', 'Contract', 'Churn']
indexers = [
    StringIndexer(inputCol=column_name, outputCol=column_name + "_index", handleInvalid="keep")
    for column_name in categorical_columns
]

# The prediction target is the churn flag. CustomerID is an identifier
# (presumably unique per customer), so indexing it as the label would turn the
# task into predicting customer IDs rather than churn.
# handleInvalid="error": a missing churn value should fail loudly rather than
# silently become an extra "unknown" target class.
indexers.append(StringIndexer(inputCol="Churn", outputCol="label", handleInvalid="error"))

# Fit all indexers on the raw frame and append their output columns to it
indexer_pipeline = Pipeline(stages=indexers)
indexed_data = indexer_pipeline.fit(df).transform(df)

# Preview the indexed frame (not the raw df) so the new columns can be checked
indexed_data.show(5)

In [None]:
# Select features and label
feature_columns = ['Gender_index', 'Age', 'Contract_index', 'MonthlyCharges', 'TotalCharges']
label_column = "label"
SEED = 42  # fixed seed so the train/test split is repeatable between runs

# Vectorize features: pack the feature columns into a single "features" vector column
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
v_data = vectorAssembler.transform(indexed_data)

# Split the data for training (80%) and testing (20%)
splits = v_data.randomSplit([0.8, 0.2], seed=SEED)
train_data, test_data = splits

# Create the logistic regression model
lr = LogisticRegression(labelCol=label_column, featuresCol="features")

# Train the model
model = lr.fit(train_data)

# Test the model: adds prediction columns to the held-out rows
predictions = model.transform(test_data)

# Evaluate the model: fraction of held-out rows whose prediction equals the label.
# The multiclass evaluator is used because its accuracy metric is valid for
# both two-class and many-class labels.
evaluator = MulticlassClassificationEvaluator(
    labelCol=label_column,
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test accuracy: {accuracy:.3f}")

# Show the per-customer predictions
predictions.select("CustomerID", "prediction").show()