In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('consultancy_logreg').getOrCreate()

In [15]:
input_file_path="file:///C:/Users/ckp43_000/Documents/customer_churn.csv"

In [16]:
data=spark.read.csv(input_file_path,inferSchema=True,header=True)

In [17]:
data.count()

900

In [18]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [19]:
data.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|       Onboard_date|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|               null|                null|                null|0.16666666666666666|
| stddev| 

In [20]:
data.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

In [22]:
data.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [53]:
from pyspark.ml.feature import VectorAssembler

In [54]:
assembler=VectorAssembler(inputCols=['Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',],outputCol='features')

In [25]:
output=assembler.transform(data)

In [27]:
output.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|            features|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|[42.0,11066.8,0.0...|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|[41.0,11916.22,0....|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|[38.0,12884.75,0....|
|   Phillip White|42.0|       8010.76|  

In [66]:
final_data=output.select('features','Churn')

In [31]:
final_data.show(5)

+--------------------+-----+
|            features|Churn|
+--------------------+-----+
|[42.0,11066.8,0.0...|    1|
|[41.0,11916.22,0....|    1|
|[38.0,12884.75,0....|    1|
|[42.0,8010.76,0.0...|    1|
|[37.0,9191.58,0.0...|    1|
+--------------------+-----+
only showing top 5 rows



In [33]:
train_churn,test_churn=final_data.randomSplit([0.7,0.3])

In [55]:
from pyspark.ml.classification import LogisticRegression

In [67]:
lr_churn=LogisticRegression(labelCol='Churn')

In [36]:
fitted_churn_model=lr_churn.fit(train_churn)

In [37]:
training_summary=fitted_churn_model.summary

In [38]:
training_summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              Churn|         prediction|
+-------+-------------------+-------------------+
|  count|                636|                636|
|   mean|0.16823899371069181|0.12421383647798742|
| stddev| 0.3743728144635767|0.33008494950086814|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [56]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [40]:
pred_and_labels=fitted_churn_model.evaluate(test_churn)

In [41]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8939.61,0.0...|    0|[6.43487911990347...|[0.99839796796964...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.25593432030239...|[0.77832542975536...|       0.0|
|[28.0,11128.95,1....|    0|[4.18081069802871...|[0.98494403698586...|       0.0|
|[28.0,11204.23,0....|    0|[1.80448776355767...|[0.85869435004261...|       0.0|
|[28.0,11245.38,0....|    0|[3.73326863878241...|[0.97664400785413...|       0.0|
|[29.0,10203.18,1....|    0|[3.69242165810051...|[0.97569390226667...|       0.0|
|[29.0,11274.46,1....|    0|[4.55114481965237...|[0.98955513299674...|       0.0|
|[29.0,13255.05,1....|    0|[4.27875305021662...|[0.98632953774668...|       0.0|
|[30.0,6744.87,0.0...|    0|[3.30553902496131...|[0.96461834379656...|       0.0|
|[30.0,8677.28,1

In [42]:
churn_eval=BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Churn')

In [43]:
auc=churn_eval.evaluate(pred_and_labels.predictions)

In [44]:
auc

0.7845417236662107

## predict on new data

In [48]:
new_data=spark.read.csv("file:///C:/Users/ckp43_000/Documents/new_customer_churn.csv",inferSchema=True,header=True)

In [50]:
new_data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [68]:
final_lr_model=lr_churn.fit(final_data)

In [57]:
test_new_data=assembler.transform(new_data)

In [58]:
test_new_data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)
 |-- features: vector (nullable = true)



In [69]:
final_result=final_lr_model.transform(test_new_data)

In [70]:
final_result.show(6)

+-----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|            Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|            features|       rawPrediction|         probability|prediction|
+-----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|Cameron Carpenter|43.0|      11226.88|              0| 8.08|     12.0|2006-10-22 04:42:38|Unit 4948 Box 481...|Morgan, Phillips ...|    1|[43.0,11226.88,0....|[-2.7600916420616...|[0.05951923594954...|       1.0|
|   Lindsay Martin|53.0|       5515.09|              0| 6.85|      8.0|2015-10-07 00:27:10|69203 Crosby Divi...|      Villanueva LLC|    1|[53.0

In [72]:
final_result.select('company','prediction',).show(5)

+--------------------+----------+
|             company|prediction|
+--------------------+----------+
|Morgan, Phillips ...|       1.0|
|      Villanueva LLC|       0.0|
|Berry, Orr and Ca...|       0.0|
|       Parks-Bradley|       1.0|
|           Olsen LLC|       1.0|
+--------------------+----------+
only showing top 5 rows

