In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession


In [2]:
spark = SparkSession.builder.appName('cust').getOrCreate()

In [3]:
customer = spark.read.csv('file:///home/manohar_l/Manohar_Personal_Projects/customer_churn.csv', inferSchema=True,header=True)

In [6]:
customer.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [5]:
customer.head(1)

[Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date=datetime.datetime(2013, 8, 30, 7, 0, 40), Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)]

In [10]:
my_cols = customer.select(['Age', 'Total_Purchase', 'Years', 'Num_Sites','Churn'])
my_cols.describe().show()

+-------+-----------------+-----------------+-----------------+------------------+-------------------+
|summary|              Age|   Total_Purchase|            Years|         Num_Sites|              Churn|
+-------+-----------------+-----------------+-----------------+------------------+-------------------+
|  count|              900|              900|              900|               900|                900|
|   mean|41.81666666666667|10062.82403333334| 5.27315555555555| 8.587777777777777|0.16666666666666666|
| stddev|6.127560416916251|2408.644531858096|1.274449013194616|1.7648355920350969| 0.3728852122772358|
|    min|             22.0|            100.0|              1.0|               3.0|                  0|
|    max|             65.0|         18026.01|             9.15|              14.0|                  1|
+-------+-----------------+-----------------+-----------------+------------------+-------------------+



In [11]:
my_final_data = my_cols.na.drop()
my_final_data.describe().show()

+-------+-----------------+-----------------+-----------------+------------------+-------------------+
|summary|              Age|   Total_Purchase|            Years|         Num_Sites|              Churn|
+-------+-----------------+-----------------+-----------------+------------------+-------------------+
|  count|              900|              900|              900|               900|                900|
|   mean|41.81666666666667|10062.82403333334| 5.27315555555555| 8.587777777777777|0.16666666666666666|
| stddev|6.127560416916251|2408.644531858096|1.274449013194616|1.7648355920350969| 0.3728852122772358|
|    min|             22.0|            100.0|              1.0|               3.0|                  0|
|    max|             65.0|         18026.01|             9.15|              14.0|                  1|
+-------+-----------------+-----------------+-----------------+------------------+-------------------+



In [17]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline

In [15]:
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Years', 'Num_Sites'], outputCol='features')

In [16]:
lr_customer = LogisticRegression(featuresCol='features', labelCol='Churn')

In [18]:
pipeline = Pipeline(stages=[assembler,lr_customer])

In [19]:
train_data, test_data = my_final_data.randomSplit([0.7,0.3])

In [20]:
train_model = pipeline.fit(train_data)

In [21]:
test_model = train_model.transform(test_data)

In [22]:
test_model.show()

+----+--------------+-----+---------+-----+--------------------+--------------------+--------------------+----------+
| Age|Total_Purchase|Years|Num_Sites|Churn|            features|       rawPrediction|         probability|prediction|
+----+--------------+-----+---------+-----+--------------------+--------------------+--------------------+----------+
|25.0|       9672.03| 5.49|      8.0|    0|[25.0,9672.03,5.4...|[4.21368923892380...|[0.98542390770012...|       0.0|
|26.0|       8787.39| 5.42|     11.0|    1|[26.0,8787.39,5.4...|[0.46840268087399...|[0.61500562276468...|       0.0|
|28.0|       8670.98| 3.99|      6.0|    0|[28.0,8670.98,3.9...|[7.43066078731126...|[0.99940755552791...|       0.0|
|28.0|      11204.23| 3.67|     11.0|    0|[28.0,11204.23,3....|[1.50048895074551...|[0.81764739013917...|       0.0|
|28.0|      11245.38| 6.72|      8.0|    0|[28.0,11245.38,6....|[3.38816136282151...|[0.96733249302902...|       0.0|
|29.0|       8688.17|  5.7|      9.0|    1|[29.0,8688.17

In [23]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [24]:
customer_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Churn')

In [26]:
customer_eval.evaluate(test_model)

0.7378327207929614

In [37]:
customer_eval1 = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Churn', metricName='accuracy')

In [38]:
customer_eval1.evaluate(test_model)

0.8923076923076924