In [0]:
#read data
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("churn").getOrCreate()
df = spark.read.options(inferSchema=True,header=True).csv("/FileStore/tables/customer_churn.csv")
df.show()

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

In [0]:
#get important columns
my_cols = df.select(['Age',
 'Total_Purchase', 'Years',
 'Num_Sites','Account_Manager'])

In [0]:
#vector assembling 
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=my_cols.columns, outputCol='features')
df2 = assembler.transform(dataset=df)
df2 = df2.select('features','Churn')

In [0]:
#Write Logistic Regression Model
from pyspark.ml.classification import LogisticRegression
train_data,test_data = df2.randomSplit([0.7,0.3])
log_reg = LogisticRegression(labelCol='Churn')
model = log_reg.fit(train_data)

In [0]:
#Binary Classification Evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions_labels = model.evaluate(test_data)
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')
auc = eval.evaluate(predictions_labels.predictions)
auc

Out[95]: 0.7837695230054875

In [0]:
#Predict on new data
new_customers = spark.read.options(inferSchema=True,header=True).csv("/FileStore/tables/new_customers.csv")
test_new_customers = assembler.transform(new_customers)
final_model = log_reg.fit(df2)
results = final_model.transform(test_new_customers)
results.select('Company','Prediction').show()

+----------------+----------+
|         Company|Prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

