In [1]:
# Starter code
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, udf, isnan, count, when, desc, sort_array, asc, avg, lag, floor
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, DateType
from pyspark.sql.functions import sum as Fsum
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler #used because won't distort binary vars
from pyspark.sql.types import DoubleType
import datetime

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
import numpy as np

# Create spark session
spark = SparkSession \
    .builder \
    .appName("Sparkify") \
    .getOrCreate()

# Read in full sparkify dataset
event_data = "s3n://dsnd-sparkify/sparkify_event_data.json"
data = spark.read.json(event_data)
data.head()

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1548207771803_0001,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
Row(artist=u'Popol Vuh', auth=u'Logged In', firstName=u'Shlok', gender=u'M', itemInSession=278, lastName=u'Johnson', length=524.32934, level=u'paid', location=u'Dallas-Fort Worth-Arlington, TX', method=u'PUT', page=u'NextSong', registration=1533734541000, sessionId=22683, song=u'Ich mache einen Spiegel - Dream Part 4', status=200, ts=1538352001000, userAgent=u'"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId=u'1749042')

In [2]:
#define necessary functions
def feature_engineering(df):
    '''
    Create necessary features to use machine learning algorithms.
    First loads data set from file
    
    Resulting DF Strucutre:
    
    root
     |-- userId: string
     |-- downgraded: long
     |-- cancelled: long
     |-- visited_cancel: long
     |-- visited_downgrade: long
     |-- dailyHelpVisits: double
     |-- dailyErrors: double
     |-- free: integer
     |-- paid: integer
     |-- avgThumbsUp: double
     |-- avgThumbsDOwn: double
     |-- numFriends: long
     |-- avgSongsTillHome: double
     |-- avgTimeSkipped: double
     |-- skipRate: double
    
    Inputs
        filepath (str) - path to json dataset on file
        
    Outputs
        data - engineered dataset
    '''
    df.persist() #maintain data in memory to speed up feature engineering process
    #dataframe of user ids to merge onto
    users = df.where((df.userId != "") | (df.sessionId != ""))\
        .select('userId').dropDuplicates()
    df = df.where((df.userId != "") | (df.sessionId != "")) #clean dataframe
    
    #define custom functions
    churn = udf(lambda x: int(x=="Cancellation Confirmation"), IntegerType())
    downgrade_churn = udf(lambda x: int(x=="Submit Downgrade"), IntegerType())
    visited_downgrade = udf(lambda x: int(x=='Downgrade'), IntegerType())
    visited_cancel = udf(lambda x: int(x=='Cancel'), IntegerType())
    song = udf(lambda x: int(x=='NextSong'), IntegerType())
    days = lambda i: i * 86400 
    get_day = udf(lambda x: datetime.datetime.fromtimestamp(x/1000), DateType())
    skipped = udf(lambda x: int(x!=0), IntegerType())
    free = udf(lambda x: int(x=='free'), IntegerType())
    paid = udf(lambda x: int(x=='paid'), IntegerType())
    home_visit=udf(lambda x : int(x=='Home'), IntegerType())
    
    #define windows
    windowval = Window.partitionBy("userId").orderBy(desc("ts")).rangeBetween(Window.unboundedPreceding, 0)
    session = Window.partitionBy("userId", "sessionId").orderBy(desc("ts"))
    daywindow = Window.partitionBy('userId', 'date').orderBy(desc('ts'))\
        .rangeBetween(Window.unboundedPreceding, 0)

    #count average daily occurences of thumbs up per user
    avgThumbsUp = df.filter(df.page=='Thumbs Up')\
        .select('userId', 'page', 'ts')\
        .withColumn('date', get_day(col('ts')))\
        .groupBy('userId', 'date').agg({'page':'count'}).groupBy('userId')\
        .mean().withColumnRenamed('avg(count(page))', 'avgThumbsUp')

    #count average daily occurences of thumbs up per user
    avgThumbsDown = df.filter(df.page=='Thumbs Down')\
        .select('userId', 'page', 'ts')\
        .withColumn('date', get_day(col('ts')))\
        .groupBy('userId', 'date').agg({'page':'count'})\
        .groupBy('userId').mean()\
        .withColumnRenamed('avg(count(page))', 'avgThumbsDown')

    #count the number of friends each user has
    numFriends = df.filter(df.page=='Add Friend')\
        .select('userId', 'page')\
        .groupBy('userId').count().withColumnRenamed('count', 'numFriends')
    
    '''
    Calculate average time of song skipped
    process for calculating skipping variables

    1. dont include thumbs up and down pages because that usually occurs 
        while playing and does not change song
    2. create variable for if action is song
    3. check if next action is song - this will check to see if someone is 
        skipping song or just leaving page
    4. get the difference in timestamp for next action song playing
    5. subtract the difference in timestamp from song length to see 
        how much of song was skipped
    6. get descriptive stats
    '''
    skipping = df.select('userId', 'page', 'ts', 'length', 'sessionId', 'itemInSession')\
        .where((df.page != 'Thumbs Up') & (df.page != 'Thumbs Down'))\
        .withColumn('song', song('page')).orderBy('userId', 'sessionId', 'itemInSession')\
        .withColumn('nextActSong', lag(col('song')).over(session))\
        .withColumn('tsDiff', (lag('ts').over(session)-col('ts'))/1000)\
        .withColumn('timeSkipped', (floor('length')-col('tsDiff')))\
        .withColumn('roundedLength', floor('length'))\
        .where((col('song')==1) & ((col('nextActSong')!=0)&(col('timeSkipped')>=0)))\
        .withColumn('skipped', skipped('timeSkipped'))\
        .select('userId', 'timeSkipped', 'skipped', 'length', 'ts', 'tsDiff')\
        .groupBy('userId').agg({'skipped':'avg', 'timeSkipped':'avg'})\
        .withColumnRenamed('avg(skipped)', 'skipRate')\
        .withColumnRenamed('avg(timeSkipped)', 'avgTimeSkipped')
    
    #avg daily visits to help site
    dailyHelpVisit = df.filter(df.page=='Help')\
        .select('userId', 'page', 'ts', 'length')\
        .withColumn('date', get_day(col('ts')))\
        .groupBy('userId', 'date').agg({'page':'count'})\
        .groupBy('userId').mean()\
         .withColumnRenamed('avg(count(page))', 'dailyHelpVisits')

    #count average daily errors occured
    dailyErrors = df.filter(df.page=='Error')\
        .select('userId', 'page', 'ts', 'length')\
        .withColumn('date', get_day(col('ts')))\
        .groupBy('userId', 'date').agg({'page':'count'})\
        .groupBy('userId').mean()\
        .withColumnRenamed('avg(count(page))', 'dailyErrors')
    
    #whether a user has downgraded
    churn = df.withColumn("downgraded", downgrade_churn("page"))\
        .withColumn("cancelled", churn("page"))\
        .withColumn('visited_cancel', visited_cancel('page'))\
        .withColumn('visited_downgrade', visited_downgrade('page'))\
        .select(['userId', 'downgraded', 'cancelled', 'visited_cancel', 'visited_downgrade'])\
        .groupBy('userId').sum()\
        .withColumnRenamed('sum(downgraded)', 'downgraded')\
        .withColumnRenamed('sum(cancelled)', 'cancelled')\
        .withColumnRenamed('sum(visited_cancel)', 'visited_cancel')\
        .withColumnRenamed('sum(visited_downgrade)', 'visited_downgrade')

    #assign the user a current level (free, paid) by dropping all duplicate values, and keeping most recent vals
    user_level = df.select('userId', 'level','ts')\
        .where((df.level=='free')|(df.level=='paid'))\
        .orderBy(desc('ts')).drop('ts').dropDuplicates()\
        .withColumn('free', free('level'))\
        .withColumn('paid', paid('level')).drop('level')

    #mark each song between home visit with a 1
    cusum = df.filter((df.page == 'NextSong') | (df.page == 'Home')) \
        .select('userID', 'page', 'ts') \
        .withColumn('homevisit', home_visit(col('page'))) \
        .withColumn('songPeriod', Fsum('homevisit').over(windowval))\
    
    #calculate average number of songs played between each home visit
    avgSongsTillHome = cusum.filter((cusum.page=='NextSong'))\
        .groupBy('userId', 'songPeriod')\
        .agg({'songPeriod':'count'}).drop('songPeriod')\
        .groupby('userId').mean()\
        .withColumnRenamed('avg(count(songPeriod))', 'avgSongsTillHome')
    
    #combine user id on 
    df = users.join(churn, on='userId')\
        .join(dailyHelpVisit, on='userId')\
        .join(dailyErrors, on='userId')\
        .join(user_level, on='userId')\
        .join(avgThumbsUp, on='userId')\
        .join(avgThumbsDown, on='userId')\
        .join(numFriends, on='userId')\
        .join(avgSongsTillHome, on='userId')\
        .join(skipping, on='userId')
    return df

def feature_scaling(df):
    '''
    Function takes care of scaling inputs into the model to between [0,1]
    Otherwise if the values weren't scaled then the feature with highest values would dominate the training.
    
    Input
        df (Spark DataFrame)
        
    Output
        scaled_df (Spark DataFrame)
    '''
    df.persist() #keep df in memory to speed up computation
    
    feature_cols = df.drop('userId', 'cancelled').columns
    assembler = VectorAssembler(inputCols=feature_cols,\
                                outputCol='feature_vec')
    
    #pyspark.ml expects target column to be names: 'labelCol', w/ type: Double
    df = df.withColumn("label", df["cancelled"].cast(DoubleType()))
    
    #pyspark default name for features vector column: 'featuresCol'
    minmaxscaler = MinMaxScaler(inputCol="feature_vec", outputCol="features")
    
    df = assembler.transform(df)
    minmaxscaler_model = minmaxscaler.fit(df)
    scaled_df = minmaxscaler_model.transform(df)
    return scaled_df

def custom_evaluation(pred, model_name):
    '''
    Perform custom evaluation of predictions
    
    1.inspect with PySpark.ML evaluator (will use for pipeline)
    2. use RDD-API; PySpark.MLLib to get metrics based on predictions 
    3. display confusion matrix
    
    Inspiration from: https://chih-ling-hsu.github.io/2018/09/17/spark-mllib
    https://spark.apache.org/docs/2.2.0/mllib-evaluation-metrics.html
    https://stackoverflow.com/questions/35572000/how-can-i-plot-a-confusion-matrix
    
    Inputs
        preds(PySpark.ml.DataFrame) - predictions from model
    '''
    #want to evaluate binary class, auc_pr is best for imbalanced classes
    tn_sum = pred.filter((pred.label == 0)&(pred.prediction==0)).count() #true negative
    fn_sum = pred.filter((pred.label == 1)&(pred.prediction==0)).count() #false negative
    fp_sum = pred.filter((pred.label == 0)&(pred.prediction==1)).count() #false positive
    tp_sum = pred.filter((pred.label == 1)&(pred.prediction==1)).count() #true positive

    print("{} \n | tn:{}| fn:{}| fp:{}| tp:{}".format(model_name, tn_sum, fn_sum, fp_sum, tp_sum))

VBox()

In [3]:
#prepare data for ML
df = feature_engineering(data)

VBox()

In [4]:
#scale features for ML algo
df_scaled = feature_scaling(df)

VBox()

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 50604)
----------------------------------------
Traceback (most recent call last):
  File "/usr/lib64/python2.7/SocketServer.py", line 290, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib64/python2.7/SocketServer.py", line 318, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib64/python2.7/SocketServer.py", line 331, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib64/python2.7/SocketServer.py", line 652, in __init__
    self.handle()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 266, in handle
    poll(authenticate_and_accum_updates)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 254, in authe

In [5]:
df_scaled.collect()[0]

VBox()

Row(userId=u'1000280', downgraded=1, cancelled=1, visited_cancel=1, visited_downgrade=3, dailyHelpVisits=1.6, dailyErrors=1.0, free=1, paid=0, avgThumbsUp=3.533333333333333, avgThumbsDown=2.0625, numFriends=14, avgSongsTillHome=24.333333333333332, avgTimeSkipped=0.0, skipRate=0.0, label=1.0, feature_vec=DenseVector([1.0, 1.0, 3.0, 1.6, 1.0, 1.0, 0.0, 3.5333, 2.0625, 14.0, 24.3333, 0.0, 0.0]), features=DenseVector([0.1429, 1.0, 0.0226, 0.12, 0.0, 1.0, 0.0, 0.1378, 0.1678, 0.0588, 0.2301, 0.0, 0.0]))

In [6]:
#split data for training
train, rest = df_scaled.randomSplit([0.85, 0.15], seed=42)
validation, test = rest.randomSplit([0.5,0.5], seed=42)

VBox()

In [7]:
train.persist() #reduce computational time

VBox()

DataFrame[userId: string, downgraded: bigint, cancelled: bigint, visited_cancel: bigint, visited_downgrade: bigint, dailyHelpVisits: double, dailyErrors: double, free: int, paid: int, avgThumbsUp: double, avgThumbsDown: double, numFriends: bigint, avgSongsTillHome: double, avgTimeSkipped: double, skipRate: double, label: double, feature_vec: vector, features: vector]

In [24]:
import time

VBox()

In [25]:
#random forest classifier model
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=10)
start=time.time()
rf_model = rf.fit(train)
rf_preds = rf_model.transform(validation)
end=time.time()
print("duration: ", end-start)
custom_evaluation(rf_preds, 'Random Forest')

VBox()

('duration: ', 7.361753940582275)
Random Forest 
 | tn:1043| fn:0| fp:0| tp:279

In [26]:
#gradient boosted trees (ie ada boost)
from pyspark.ml.classification import GBTClassifier
gbtrees = GBTClassifier(maxIter=10)
start=time.time()
gbtree_model = gbtrees.fit(train)
gbtree_preds = gbtree_model.transform(validation)
end=time.time()
print("duration: ", end-start)
custom_evaluation(gbtree_preds, 'Gradient Boosted Trees')

VBox()

Exception in thread cell_monitor-26:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/site-packages/awseditorssparkmonitoringwidget-1.0-py3.6.egg/awseditorssparkmonitoringwidget/cellmonitor.py", line 178, in cell_monitor
    job_binned_stages[job_id][stage_id] = all_stages[stage_id]
KeyError: 4538



('duration: ', 52.26982617378235)
Gradient Boosted Trees 
 | tn:1043| fn:0| fp:0| tp:279

In [27]:
#SVM: https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-support-vector-machine

from pyspark.ml.classification import LinearSVC
svm = LinearSVC(maxIter=10, regParam=0.1)
start=time.time()
svm_model=svm.fit(train)
svm_preds=svm_model.transform(validation)
end=time.time()
print("duration: ", end-start)
custom_evaluation(svm_preds, 'Support Vector Machine')

VBox()

('duration: ', 20.398164987564087)
Support Vector Machine 
 | tn:1043| fn:0| fp:0| tp:279

In [11]:
#logistic regression model
from pyspark.ml.classification import LogisticRegression
logReg = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = logReg.fit(train)
lr_preds = lrModel.transform(validation)
custom_evaluation(lr_preds, 'Logistic Regression')

VBox()

Logistic Regression 
 | tn:1043| fn:279| fp:0| tp:0

#### Compare Results to make sure they are accurate

In [12]:
#visual check for predictions
for x in [svm_preds, lr_preds, gbtree_preds, rf_preds]:
    x.select('features', 'rawPrediction', 'prediction', 'label').show(20)

VBox()

+--------------------+--------------------+----------+-----+
|            features|       rawPrediction|prediction|label|
+--------------------+--------------------+----------+-----+
|[0.0,0.0,0.0,0.06...|[1.29454586908108...|       0.0|  0.0|
|[0.0,0.0,0.0,0.1,...|[0.91650298266328...|       0.0|  0.0|
|[0.0,0.0,0.120300...|[1.26981895598087...|       0.0|  0.0|
|[0.0,0.0,0.022556...|[1.06079024054081...|       0.0|  0.0|
|[0.0,0.0,0.120300...|[1.30126183663906...|       0.0|  0.0|
|[0.14285714285714...|[1.66035588375266...|       0.0|  0.0|
|[0.14285714285714...|[1.00878880062720...|       0.0|  0.0|
|[0.0,0.0,0.037593...|[1.24246986138686...|       0.0|  0.0|
|[0.14285714285714...|[1.52749153383729...|       0.0|  0.0|
|[0.14285714285714...|[1.51957700662642...|       0.0|  0.0|
|[0.14285714285714...|[1.31041568085742...|       0.0|  0.0|
|[0.14285714285714...|[1.76545507949694...|       0.0|  0.0|
|[0.0,0.0,0.052631...|[1.05925139204592...|       0.0|  0.0|
|[0.28571428571428...|[1

In [13]:
#PCA
#https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.feature.PCA
from pyspark.ml.feature import PCA


#first create PCA with all the feeatures kept- 13 features
pca_full = PCA(k=13, inputCol="features", outputCol="pcaFeatures")
pca_model_full = pca_full.fit(df_scaled)
pca_model_full.explainedVariance

VBox()

DenseVector([0.6684, 0.2304, 0.0372, 0.018, 0.0149, 0.01, 0.008, 0.0075, 0.0032, 0.0019, 0.0006, 0.0, 0.0])

The explained variance vector shows that 97.69% of the variance in the dataset can be explained by the first 6 features. We really do not need more features than that from PCA.

In [14]:
pca_final = PCA(k=6, inputCol="features", outputCol="pcaFeatures")
pca_model = pca_final.fit(df_scaled)
df_scaled_pca = pca_model.transform(df_scaled)

VBox()

In [15]:
#split data for training
train_pca, rest_pca = df_scaled_pca.randomSplit([0.85, 0.15], seed=42)
validation_pca, test_pca = rest_pca.randomSplit([0.5,0.5], seed=42)

VBox()

In [28]:
rf_pca = RandomForestClassifier(featuresCol='pcaFeatures', numTrees=10)

VBox()

In [29]:
start=time.time()
rf_pca_model=rf_pca.fit(train_pca)
rf_preds_pca=rf_pca_model.transform(validation_pca)
end=time.time()
print("duration: ",end-start)
custom_evaluation(rf_preds_pca, 'Random Forest Classifier PCA(k=6)')

VBox()

('duration: ', 9.36722993850708)
Random Forest Classifier PCA(k=6) 
 | tn:1043| fn:0| fp:0| tp:279

In [42]:
#test model on testing set
rf_test_preds = rf_pca_model.transform(test_pca)
custom_evaluation(rf_test_preds, 'Random Forest Classifier Test')

VBox()

Random Forest Classifier Test 
 | tn:1144| fn:0| fp:0| tp:327

The test is fantastic!

Investigate parameters of final model

In [40]:
print(rf_pca_model.explainParams())

VBox()

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]. (default: auto)
featuresCol: features column name (default: features, current: pcaFeatures)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini (default: gini)
labelCol: label column name (default: label)
maxBins: Max number of bins for discretizing continuous features.  Must be >=2 and >= number of

In [41]:
import pprint
pp = pprint.PrettyPrinter(indent=0)
pp.pprint(rf_pca_model.extractParamMap())

VBox()

{Param(parent=u'RandomForestClassifier_690037145a18', name='featuresCol', doc='features column name'): 'pcaFeatures',
Param(parent=u'RandomForestClassifier_690037145a18', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto',
Param(parent=u'RandomForestClassifier_690037145a18', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,
Param(parent=u'RandomForestClassifier_690037145a18', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,
Param(parent=u'RandomForestClassifier_690037145a1