## Customer Churn Model Scoring
#### The objectives of this lab are:
- score **new** customer data against a pre-built model
- schedule the notebook to run via the Notebook scheduler

### Step 1: Download new customer data

In [1]:
import wget
url_customer='https://raw.githubusercontent.com/yfphoon/dsx_demo/master/data/new_customer_churn_data.csv'

#remove existing files before downloading
!rm -f new_customer_churn_data.csv

customerFilename=wget.download(url_customer)

!ls -l new_customer_churn_data.csv

-rw------- 1 s3b2-c7634938ff52ab-a5f39cf201a0 users 27597 Nov 15 17:53 new_customer_churn_data.csv


### Step 2: Read data into a Spark DataFrame
**Note**: the new dataset does not contain the label column

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Read through the session created above instead of the `sqlContext` global:
# this notebook never defines `sqlContext`, so relying on it only works when
# the hosting environment happens to inject it.
# "csv" is the public short name of Spark's built-in CSV data source.
newData = (
    spark.read
    .format("csv")
    .option("header", "true")        # first line of the file holds the column names
    .option("inferSchema", "true")   # derive column types from the data
    .load(customerFilename)
)

In [3]:
# Remove the spaces from the two column names that contain them
# (presumably to match the names the saved model expects - confirm).
newData = (
    newData
    .withColumnRenamed("Est Income", "EstIncome")
    .withColumnRenamed("Car Owner", "CarOwner")
)

# Preview the first five rows. Limiting in Spark first means only those rows
# are collected to the driver, rather than the whole dataset.
newData.limit(5).toPandas()

Unnamed: 0,ID,Gender,Status,Children,EstIncome,CarOwner,Age,LongDistance,International,Local,Dropped,Paymethod,LocalBilltype,LongDistanceBilltype,Usage,RatePlan
0,2048,F,S,1,13576.5,N,39.426667,14.83,0,25.66,0,CC,Budget,Standard,40.49,1
1,2054,F,M,2,84166.1,N,54.013333,3.28,0,11.74,1,CC,Budget,Standard,15.02,2
2,2075,F,S,0,68427.4,N,42.393333,23.76,0,50.05,0,Auto,FreeLocal,Standard,73.81,3
3,2095,F,M,2,77551.1,Y,33.6,20.53,0,41.89,1,CC,Budget,Intnl_discount,62.42,2
4,2108,F,S,1,13109.1,N,62.606667,22.38,0,40.48,0,Auto,Budget,Standard,62.87,1


### Step 3: Load Saved Model

In [4]:
from pyspark.ml import PipelineModel

# Location of the previously saved churn pipeline.
saved_model_path = "PredictChurn.churnModel"
model1_loaded = PipelineModel.load(saved_model_path)

### Step 4: Score the new data
Note: The scored output contains the predicted values and confidence scores

In [5]:
# Apply the loaded pipeline to the new customers; the columns selected from
# `result` further down (predictedLabel, prediction, probability) come from this step.
result = model1_loaded.transform(dataset=newData)

### Step 5: Export Score into a csv file

In [6]:
# Keep the customer ID together with the model's outputs.
r1 = result.select("ID", "predictedLabel", "prediction", "probability")

# Preview five rows. Limiting in Spark first avoids collecting every scored
# row to the driver just to display a handful of them.
r1.limit(5).toPandas()

Unnamed: 0,ID,predictedLabel,prediction,probability
0,2048,T,1,"[0.0173913043478, 0.982608695652]"
1,2054,T,1,"[0.393257002801, 0.606742997199]"
2,2075,F,0,"[0.942270779314, 0.0577292206856]"
3,2095,F,0,"[0.976372888549, 0.0236271114506]"
4,2108,T,1,"[0.075, 0.925]"


#### Decompose the probability column
The probability column contains a vector for each record; its elements must be extracted into separate numeric columns (`probability_0` and `probability_1`).

In [7]:
from pyspark.sql import Row
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors

def _vector_element_udf(index):
    """Build a Spark UDF that returns element `index` of a vector column as a double.

    `index` is bound when the factory is called, so each returned UDF reads its
    own fixed position of the vector.
    """
    return udf(lambda vector: float(vector[index]), DoubleType())


# One extractor per position of the probability vector.
udf_0 = _vector_element_udf(0)
udf_1 = _vector_element_udf(1)

# Replace the vector column with one double column per element.
r2 = (
    r1.select("ID", "prediction", "probability")
    .withColumn('probability_0', udf_0("probability"))
    .withColumn('probability_1', udf_1("probability"))
    .drop("probability")
)

# Preview ten rows. Limiting in Spark first means only those rows are
# collected to the driver.
r2.limit(10).toPandas()

Unnamed: 0,ID,prediction,probability_0,probability_1
0,2048,1,0.017391,0.982609
1,2054,1,0.393257,0.606743
2,2075,0,0.942271,0.057729
3,2095,0,0.976373,0.023627
4,2108,1,0.075,0.925
5,2124,0,0.991906,0.008094
6,2154,1,0.169812,0.830188
7,2218,0,0.914364,0.085636
8,2267,0,0.980574,0.019426
9,2284,1,0.088492,0.911508
