In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Obtain the SparkSession used by every cell below; getOrCreate() reuses an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('logregcunsulting')
    .getOrCreate()
)

In [2]:
# Load the customer churn CSV. The first row is treated as the header and
# column types are inferred from the data rather than read as strings.
df = (
    spark.read
    .format('csv')
    .options(header=True, inferschema=True)
    .load('/FileStore/tables/Machine_learning/customer_churn.csv')
)

In [3]:
# List the column names of the loaded frame so the feature columns can be
# picked for the assembler (df.printSchema() would additionally show types).
df.columns

In [4]:
# Pack the selected input columns into a single vector column named
# 'features', which is the input format the classifier consumes.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Age',
        'Total_Purchase',
        'Account_Manager',
        'Years',
        'Num_Sites',
    ],
)

# Apply the assembler to the raw frame; the result keeps the original columns
# and gains the 'features' column.
output = assembler.transform(dataset=df)

In [5]:
# Keep only what the model needs: the assembled feature vector and the label.
final_data = output.select(['features', 'churn'])

In [6]:
# Split the labelled data roughly 70/30 into training and hold-out sets.
# A fixed seed is passed so that Restart & Run All reproduces the same split
# (and therefore the same fitted model and metrics); without it every run
# draws a different random partition.
train_set, test_set = final_data.randomSplit([0.7, 0.3], seed=42)

In [7]:
# Configure (but do not yet fit) a logistic-regression estimator that reads
# the 'features' vector column and predicts the 'churn' label column.
lr_churn = LogisticRegression(
    featuresCol='features',
    labelCol='churn',
)

In [8]:
# Fit the estimator on the training split only; the hold-out split stays
# unseen for the evaluation below.
fitted_churn_model = lr_churn.fit(dataset=train_set)

In [9]:
# Grab the summary object that the fitted model exposes for its training run.
training_sum = fitted_churn_model.summary

# Print descriptive statistics for the summary's predictions DataFrame.
(
    training_sum
    .predictions
    .describe()
    .show()
)

In [10]:
# Run the fitted model against the hold-out split; the returned summary
# carries the per-row predictions alongside the true labels.
pred_and_labels = fitted_churn_model.evaluate(dataset=test_set)

In [11]:
# Print the first rows of the hold-out predictions for a visual sanity check.
pred_and_labels.predictions.show()

In [12]:
# Evaluator for the area under the ROC curve on the 'churn' label.
# It reads the model's continuous 'rawPrediction' output rather than the
# thresholded 0/1 'prediction' column: scoring on hard labels collapses the
# ROC curve to a single operating point and misreports the model's ranking
# quality.
churn_eval = BinaryClassificationEvaluator(
    labelCol='churn',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
)

In [13]:
# Compute the evaluator's metric over the hold-out predictions, then leave the
# bare name as the cell's last expression so the notebook displays the value.
auc = churn_eval.evaluate(dataset=pred_and_labels.predictions)
auc

In [14]:
# Refit the same estimator on the complete labelled dataset (training and
# hold-out rows together) to produce the model used for scoring new customers.
final_lr_model = lr_churn.fit(dataset=final_data)

In [15]:
# Load the new-customers CSV with the same reader settings as the training
# data: header row present, column types inferred.
new_customers = (
    spark.read
    .format('csv')
    .options(header=True, inferschema=True)
    .load('/FileStore/tables/Machine_learning/new_customers.csv')
)

In [16]:
# Reuse the assembler configured for the training data so the new customers
# receive a 'features' vector built from the same input columns.
test_new_customers = assembler.transform(dataset=new_customers)

In [17]:
# Print the schema to confirm the assembled 'features' column is present
# before handing the frame to the model.
test_new_customers.printSchema()

In [18]:
# Score the new customers with the model fitted on the full labelled dataset;
# the output frame gains the model's prediction columns.
final_results = final_lr_model.transform(dataset=test_new_customers)

In [19]:
# Show each company next to its predicted label.
(
    final_results
    .select(['Company', 'prediction'])
    .show()
)