In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import re
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, Normalizer, StandardScaler, MinMaxScaler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import to_date, datediff
from pyspark.sql.functions import concat, lit, avg, split, isnan, when, count, col, sum, mean, stddev, min, max, round
from pyspark.sql import Window
from pyspark.ml.classification import LogisticRegression, GBTClassifier, NaiveBayes, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import Bucketizer



In [2]:
# Creating a spark session
spark = SparkSession.builder \
    .master("local") \
    .appName("ModelBuilding") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/15 08:14:29 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/12/15 08:14:29 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/12/15 08:14:29 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/12/15 08:14:29 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


## 1.) Data Loading

In [3]:
#reading in the dataframe from GCS bucket
df = spark.read.format("csv").options(header="false", inferschema="true").load('gs://bda-project-6893/Processed_data.csv/part-00000-8c39344c-28d0-4c2e-935d-9f0d1e5c496e-c000.csv')

                                                                                

In [4]:
df = df.withColumnRenamed("_c0", "userId")\
       .withColumnRenamed("_c1", "gender")\
       .withColumnRenamed("_c2", "churn")\
       .withColumnRenamed("_c3", "last_level")\
       .withColumnRenamed("_c4", "days_active")\
       .withColumnRenamed("_c5", "last_state")\
       .withColumnRenamed("_c6", "avg_songs")\
       .withColumnRenamed("_c7", "avg_events")\
       .withColumnRenamed("_c8", "thumbs_up")\
       .withColumnRenamed("_c9", "thumbs_down")\
       .withColumnRenamed("_c10", "addfriend")

In [5]:
df.show(10)

+------+------+-----+----------+-----------+----------+---------+----------+---------+-----------+---------+
|userId|gender|churn|last_level|days_active|last_state|avg_songs|avg_events|thumbs_up|thumbs_down|addfriend|
+------+------+-----+----------+-----------+----------+---------+----------+---------+-----------+---------+
|    10|     M|    0|      paid|         42|        MS|    84.13|     99.38|       37|          4|       12|
|   100|     M|    0|      paid|         59|        TX|    81.27|     97.39|      148|         27|       49|
|100001|     F|    1|      free|          1|        FL|     66.5|      93.5|        8|          2|        2|
|100002|     F|    0|      paid|         56|        CA|     39.0|      43.6|        5|          0|        1|
|100003|     F|    1|      free|          2|        FL|     25.5|      39.0|        3|          0|        0|
|100004|     F|    0|      paid|         57|        NY|    49.58|     65.53|       35|         11|       19|
|100005|     M|    

## 2.) Data Preparation

In [6]:
# Split data into train, validation and test sets
df_ml = df.withColumnRenamed("churn", "label")
train, test, valid = df_ml.randomSplit([0.6, 0.2, 0.2])

In [7]:
# index categorical features gender, last_level and last_state

stringIndexerGender = StringIndexer(inputCol="gender", outputCol="genderIndex", handleInvalid = 'skip')
stringIndexerLevel = StringIndexer(inputCol="last_level", outputCol="levelIndex", handleInvalid = 'skip')
stringIndexerState = StringIndexer(inputCol="last_state", outputCol="stateIndex", handleInvalid = 'skip')

In [8]:
# one hot encoding of categorical features
encoder = OneHotEncoder(inputCols=["genderIndex", "levelIndex", "stateIndex"],
                                       outputCols=["genderVec", "levelVec", "stateVec"],
                                handleInvalid = 'keep')

In [9]:
# create vector for features
features = ['genderVec', 'levelVec', 'stateVec', 'days_active', 'avg_songs', 'avg_events', 'thumbs_up', 'thumbs_down', 'addfriend']
assembler = VectorAssembler(inputCols=features, outputCol="features")

## 3.) Experimenting with models

#### i.) Random Forest

In [10]:
# initialize random forest classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

# assemble pipeline
pipeline = Pipeline(stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, rf])

In [11]:
model = pipeline.fit(train)
pred_train = model.transform(train)
pred_test = model.transform(test)

In [12]:
predictionAndLabels = pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

rf_train = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on train data is = 0.8162528356043888


In [13]:
predictionAndLabels = pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

rf_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.6985446985446986


#### ii.) Logistic Regression Classifier

In [14]:
# initialize logistic regression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# assemble pipeline
pipeline = Pipeline(stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, lr])

In [15]:
model = pipeline.fit(train)
pred_train = model.transform(train)
pred_test = model.transform(test)

22/12/15 08:14:52 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/15 08:14:52 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [16]:
predictionAndLabels = pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

lr_train = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on train data is = 0.6650781202871447


In [17]:
predictionAndLabels = pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

lr_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.7260992335619201


#### iii.) Gradient Boosted Tree Classifier

In [18]:
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=3)

# assemble pipeline
pipeline = Pipeline(stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, gbt])

In [19]:
model = pipeline.fit(train)
pred_train = model.transform(train)
pred_test = model.transform(test)

In [20]:
predictionAndLabels = pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

gbt_train = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on train data is = 1.0


In [21]:
predictionAndLabels = pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

gbt_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.7718182578863693


#### iv.) Naive Bayes

In [22]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# assemble pipeline
pipeline = Pipeline(stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, nb])

In [23]:
model = pipeline.fit(train)
pred_train = model.transform(train)
pred_test = model.transform(test)

In [24]:
predictionAndLabels = pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

nb_train = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on train data is = 0.7928531322691907


In [25]:
predictionAndLabels = pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

nb_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.7283072546230441


#### v.) Comparing the Results

In [26]:
res = [{'Classifier': 'Random Forest', 'Train': rf_train, 'Test': rf_test},
       {'Classifier': 'Logistic Regression', 'Train': lr_train, 'Test': lr_test},
       {'Classifier': 'Gradient-boosted Tree', 'Train': gbt_train, 'Test': gbt_test},
       {'Classifier': 'Naive Bayes', 'Train': nb_train, 'Test': nb_test}]
results = pd.DataFrame(res)
results

Unnamed: 0,Classifier,Train,Test
0,Random Forest,0.816253,0.698545
1,Logistic Regression,0.665078,0.726099
2,Gradient-boosted Tree,1.0,0.771818
3,Naive Bayes,0.792853,0.728307


## 4.) Experimenting with further refinement of the data to improve results

#### Replacing any state that is not in the top 10 frequent states by value "Other" as the frequency of lower ranking states is extremely skewed introduing bias in the model

In [27]:
# find top states
top_states = df_ml.select('last_state').groupBy(df_ml.last_state).count().sort(col("count").desc()).limit(11).toPandas()
top_states_list = top_states['last_state'][1:].values.tolist()
top_states_list

['PA', 'TX', 'FL', 'WI', 'IL', 'NC', 'SC', 'AZ', 'CT', 'NH']

In [28]:
# change names of rare states to 'OTHER'
df_refined = df_ml.withColumn('last_state',when(df_ml.last_state.isin(top_states_list), df_ml.last_state).otherwise('OTHER'))

In [29]:
train, test, valid = df_refined.randomSplit([0.6, 0.2, 0.2])

stringIndexerGender = StringIndexer(inputCol="gender", outputCol="genderIndex", handleInvalid = 'skip')
stringIndexerLevel = StringIndexer(inputCol="last_level", outputCol="levelIndex", handleInvalid = 'skip')
stringIndexerState = StringIndexer(inputCol="last_state", outputCol="stateIndex", handleInvalid = 'skip')

In [30]:
encoder = OneHotEncoder(inputCols=["genderIndex", "levelIndex", "stateIndex"],
                                       outputCols=["genderVec", "levelVec", "stateVec"],
                                handleInvalid = 'keep')

In [31]:
features = ['genderVec', 'levelVec', 'stateVec', 'days_active', 'avg_songs', 'avg_events', 'thumbs_up', 'thumbs_down', 'addfriend']
assembler = VectorAssembler(inputCols=features, outputCol="rawFeatures")

#### Normalizing the numerical features to balance data scales

In [32]:
normalizer = Normalizer(inputCol="rawFeatures", outputCol="features", p=1.0)

#### i.) Random Forest

In [33]:
# initialize random forest classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
# assemble pipeline
pipeline = Pipeline(stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, normalizer, rf])

In [34]:
model = pipeline.fit(train)
pred_train = model.transform(train)
pred_test = model.transform(test)

In [35]:
predictionAndLabels = pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
rf_train = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())
                                                                            



Weighted F1 score on train data is = 0.9550523848910946


In [36]:
predictionAndLabels = pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
rf_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.627906976744186


#### ii.) Logistic Regression

In [37]:
# initialize logistic regression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# assemble pipeline
pipeline = Pipeline(stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, normalizer, lr])

In [38]:
model = pipeline.fit(train)
pred_train = model.transform(train)
pred_test = model.transform(test)

In [39]:
predictionAndLabels = pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
lr_train = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())
                                                                            



Weighted F1 score on train data is = 0.6603797074385309


In [40]:
predictionAndLabels = pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
lr_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.6040226272784412


#### iii.) Gradient Boosted Trees

In [41]:
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=6)

# assemble pipeline
pipeline = Pipeline(stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, normalizer, gbt])

In [42]:
model = pipeline.fit(train)
pred_train = model.transform(train)
pred_test = model.transform(test)

In [43]:
predictionAndLabels = pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
gbt_train = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on train data is = 0.9925516959333385


In [44]:
predictionAndLabels = pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
gbt_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.7270125223613595


#### iv.) Naive Bayes

In [45]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# assemble pipeline
pipeline = Pipeline(stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, normalizer, nb])

In [46]:
model = pipeline.fit(train)
pred_train = model.transform(train)
pred_test = model.transform(test)

In [47]:
predictionAndLabels = pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
nb_train = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on train data is = 0.6603797074385309


In [48]:
predictionAndLabels = pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
nb_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.6040226272784412


In [49]:
res = [{'Classifier': 'Random Forest', 'Train': rf_train, 'Test': rf_test},
       {'Classifier': 'Logistic Regression', 'Train': lr_train, 'Test': lr_test},
       {'Classifier': 'Gradient-boosted Tree', 'Train': gbt_train, 'Test': gbt_test},
       {'Classifier': 'Naive Bayes', 'Train': nb_train, 'Test': nb_test}]
results = pd.DataFrame(res)
results

Unnamed: 0,Classifier,Train,Test
0,Random Forest,0.955052,0.627907
1,Logistic Regression,0.66038,0.604023
2,Gradient-boosted Tree,0.992552,0.727013
3,Naive Bayes,0.66038,0.604023


### At this stage Random Forest seems to give the best generalized results across our experiments so we will try to further tune it's hyperparameters

In [50]:
rf = RandomForestClassifier(labelCol="label", featuresCol="rawFeatures")
stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, rf]
pipeline = Pipeline(stages = stages)

In [51]:
# initialize the parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 50, 100]) \
    .addGrid(rf.maxDepth, [2, 3, 5]) \
    .addGrid(rf.impurity, ['entropy', 'gini'])\
    .addGrid(rf.featureSubsetStrategy, ['auto','sqrt', 'log2'])\
    .build()

In [52]:
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=3)

In [53]:
# perform cross validation
cvModel = crossval.fit(train)
bestModel = cvModel.bestModel

In [54]:
# get the results of cross validation

# get parameters
params = [{p.name: v for p, v in m.items()} for m in cvModel.getEstimatorParamMaps()]

# convert validation results to pandas dataframe
validation_results = pd.DataFrame.from_dict([
    {cvModel.getEvaluator().getMetricName(): metric, **ps} 
    for ps, metric in zip(params, cvModel.avgMetrics)
])

In [55]:
validation_results

Unnamed: 0,f1,numTrees,maxDepth,impurity,featureSubsetStrategy
0,0.707044,10,2,entropy,auto
1,0.707044,10,2,entropy,sqrt
2,0.707044,10,2,entropy,log2
3,0.765105,10,2,gini,auto
4,0.765105,10,2,gini,sqrt
5,0.765105,10,2,gini,log2
6,0.801666,10,3,entropy,auto
7,0.801666,10,3,entropy,sqrt
8,0.801666,10,3,entropy,log2
9,0.761805,10,3,gini,auto


In [56]:
# initialize RF with best parameters
rf_tuned = RandomForestClassifier(labelCol="label", featuresCol="rawFeatures", numTrees=10, maxDepth = 5,\
                                 impurity = 'entropy', featureSubsetStrategy = 'sqrt')

# assemble pipeline
stages = [stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder, assembler, rf_tuned]
pipeline_tuned = Pipeline(stages = stages)

# fit model
pipeline_tuned.fit(train)

# get predictions for train, test, and validation datasets
best_pred_train = bestModel.transform(train)
best_pred_test = bestModel.transform(test)
best_pred_valid = bestModel.transform(valid)

In [57]:
predictionAndLabels = best_pred_train.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
nb_test = metrics.weightedFMeasure()
print("Weighted F1 score on train data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on train data is = 0.9465653942158079


In [58]:
predictionAndLabels = best_pred_test.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
nb_test = metrics.weightedFMeasure()
print("Weighted F1 score on test data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on test data is = 0.7225913621262459


In [59]:
predictionAndLabels = best_pred_valid.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
nb_test = metrics.weightedFMeasure()
print("Weighted F1 score on validation data is = %s" % metrics.weightedFMeasure())



Weighted F1 score on validation data is = 0.8907442319777359
