In [1]:
from pyspark.sql import SparkSession


In [3]:
spark = SparkSession.builder.master("local[*]").appName("sparkify").getOrCreate()
spark

In [6]:
data = spark.read.json("../data/mini_sparkify_event_data.json")
data.limit(10).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Martha Tilston,Logged In,Colin,M,50,Freeman,277.89016,paid,"Bakersfield, CA",PUT,NextSong,1538173362000,29,Rockpools,200,1538352117000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,30
1,Five Iron Frenzy,Logged In,Micah,M,79,Long,236.09424,free,"Boston-Cambridge-Newton, MA-NH",PUT,NextSong,1538331630000,8,Canada,200,1538352180000,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",9
2,Adam Lambert,Logged In,Colin,M,51,Freeman,282.8273,paid,"Bakersfield, CA",PUT,NextSong,1538173362000,29,Time For Miracles,200,1538352394000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,30
3,Enigma,Logged In,Micah,M,80,Long,262.71302,free,"Boston-Cambridge-Newton, MA-NH",PUT,NextSong,1538331630000,8,Knocking On Forbidden Doors,200,1538352416000,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",9
4,Daft Punk,Logged In,Colin,M,52,Freeman,223.60771,paid,"Bakersfield, CA",PUT,NextSong,1538173362000,29,Harder Better Faster Stronger,200,1538352676000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,30
5,The All-American Rejects,Logged In,Micah,M,81,Long,208.29995,free,"Boston-Cambridge-Newton, MA-NH",PUT,NextSong,1538331630000,8,Don't Leave Me,200,1538352678000,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",9
6,The Velvet Underground / Nico,Logged In,Micah,M,82,Long,260.46649,free,"Boston-Cambridge-Newton, MA-NH",PUT,NextSong,1538331630000,8,Run Run Run,200,1538352886000,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",9
7,Starflyer 59,Logged In,Colin,M,53,Freeman,185.44281,paid,"Bakersfield, CA",PUT,NextSong,1538173362000,29,Passengers (Old Album Version),200,1538352899000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,30
8,,Logged In,Colin,M,54,Freeman,,paid,"Bakersfield, CA",PUT,Add to Playlist,1538173362000,29,,200,1538352905000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,30
9,Frumpies,Logged In,Colin,M,55,Freeman,134.47791,paid,"Bakersfield, CA",PUT,NextSong,1538173362000,29,Fuck Kitty,200,1538353084000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,30


In [8]:
data.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [9]:
data.head()

Row(artist='Martha Tilston', auth='Logged In', firstName='Colin', gender='M', itemInSession=50, lastName='Freeman', length=277.89016, level='paid', location='Bakersfield, CA', method='PUT', page='NextSong', registration=1538173362000, sessionId=29, song='Rockpools', status=200, ts=1538352117000, userAgent='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0', userId='30')

In [10]:
# timestamp
data.describe('ts').show()

+-------+--------------------+
|summary|                  ts|
+-------+--------------------+
|  count|              286500|
|   mean|1.540956889810471...|
| stddev|1.5075439608187113E9|
|    min|       1538352117000|
|    max|       1543799476000|
+-------+--------------------+



In [11]:
data.select("page").dropDuplicates().show()


+--------------------+
|                page|
+--------------------+
|              Cancel|
|    Submit Downgrade|
|         Thumbs Down|
|                Home|
|           Downgrade|
|         Roll Advert|
|              Logout|
|       Save Settings|
|Cancellation Conf...|
|               About|
| Submit Registration|
|            Settings|
|               Login|
|            Register|
|     Add to Playlist|
|          Add Friend|
|            NextSong|
|           Thumbs Up|
|                Help|
|             Upgrade|
+--------------------+
only showing top 20 rows



### Churn

From the list above, `Cancellation Confirmation` and `Submit Downgrade` are probably Churn

In [23]:
from pyspark.sql.functions import UserDefinedFunction as udf
from pyspark.sql.types import IntegerType


In [24]:
# Define a flag function
flag_cancelation_event = udf(lambda x: 1 if x == "Cancellation Confirmation" else 0, IntegerType())
# apply to the dataframe
data = data.withColumn("churn", flag_cancelation_event("page"))

In [26]:
from pyspark.sql.window import Window


In [28]:
#Define window bounds
windowval = Window.partitionBy("userId") \
    .rangeBetween(Window.unboundedPreceding, 
                  Window.unboundedFollowing)

In [34]:
from pyspark.sql.functions import sum as Fsum


In [35]:
# Applying the window
data = data.withColumn("churn", Fsum("churn").over(windowval))
print("The total number of churners are:", data.agg(Fsum("churn")).collect()[0][0])

The total number of churners are: 44864


In [37]:
data.show(vertical=True, n=1)

-RECORD 0-----------------------------
 artist        | Sleeping With Sirens 
 auth          | Logged In            
 firstName     | Darianna             
 gender        | F                    
 itemInSession | 0                    
 lastName      | Carpenter            
 length        | 202.97098            
 level         | free                 
 location      | Bridgeport-Stamfo... 
 method        | PUT                  
 page          | NextSong             
 registration  | 1538016340000        
 sessionId     | 31                   
 song          | Captain Tyin Knot... 
 status        | 200                  
 ts            | 1539003534000        
 userAgent     | "Mozilla/5.0 (iPh... 
 userId        | 100010               
 churn         | 0                    
only showing top 1 row



In [39]:
stat_df = spark.createDataFrame(data.dropDuplicates(['userId']).collect())
stat_df1 = stat_df[['gender', 'churn']]
print('The avg churn rate of females is:', stat_df1.groupby(['gender']).mean().collect()[0][1]*100)
print('The avg churn rate of males is:', stat_df1.groupby(['gender']).mean().collect()[1][1]*100)

The avg churn rate of females is: 19.230769230769234
The avg churn rate of males is: 0.0


In [42]:
# churn for free and paid
stat_df1 = stat_df[['level', 'churn']]
print("proportion of users that chruned from free subscirption", stat_df1.groupBy(['level']).mean().collect()[0][1]*100)
print("proportion of users that chruned from paid subscirption",stat_df1.groupBy(['level']).mean().collect()[1][1]*100)

proportion of users that chruned from free subscirption 24.719101123595504
proportion of users that chruned from paid subscirption 16.666666666666664


In [46]:
from pyspark.sql.functions import col, split, mean, first, lit


In [47]:
stat_df1 = stat_df[['location', 'churn']]
stat_df1 = stat_df1.withColumn('state', split(stat_df['location'], ',').getItem(1))
stat_df1 = stat_df1.drop('location')
print("Viewing top 10 states to churn:\n")
stat_df1.groupBy(["state"]).sum().filter("sum(churn)> 0").orderBy("sum(churn)", ascending = False).show(10)

Viewing top 10 states to churn:

+---------+----------+
|    state|sum(churn)|
+---------+----------+
|       CA|         6|
| NY-NJ-PA|         5|
|       MI|         3|
|       FL|         3|
|       TX|         3|
|       MS|         2|
|       WA|         2|
|       AZ|         2|
|       OH|         2|
|       AL|         2|
+---------+----------+
only showing top 10 rows



In [48]:
feat_1 = data \
    .select('userId','registration','ts') \
    .withColumn('lifetime',(data.ts-data.registration)) \
    .groupBy('userId') \
    .agg({'lifetime':'max'}) \
    .withColumnRenamed('max(lifetime)','lifetime') \
    .select('userId', (col('lifetime')/1000/3600/24).alias('lifetime'))
feat_1.describe().show()

+-------+------------------+-------------------+
|summary|            userId|           lifetime|
+-------+------------------+-------------------+
|  count|               226|                225|
|   mean|65391.013333333336|   79.8456834876543|
| stddev|105396.47791907164|  37.66147001861254|
|    min|                  |0.31372685185185184|
|    max|                99|  256.3776736111111|
+-------+------------------+-------------------+



In [49]:
feat_1.show()

+------+------------------+
|userId|          lifetime|
+------+------------------+
|100010|  55.6436574074074|
|200002| 70.07462962962963|
|   125| 71.31688657407408|
|    51|19.455844907407407|
|   124|131.55591435185184|
|     7| 72.77818287037037|
|    54|110.75168981481481|
|    15|56.513576388888886|
|   155|23.556018518518517|
|   132|  66.8891087962963|
|   154|23.872037037037035|
|100014| 85.08340277777778|
|   101|53.965937499999995|
|    11|124.47825231481481|
|   138| 66.62668981481481|
|300017| 74.35851851851852|
|    29|60.104050925925925|
|    69| 71.42444444444445|
|100021| 64.73886574074074|
|    42| 67.11364583333334|
+------+------------------+
only showing top 20 rows



In [12]:
data.filter(data.page=="Cancel").limit(10).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Adriel,M,103,Mendoza,,paid,"Kansas City, MO-KS",PUT,Cancel,1535623466000,514,,307,1538943740000,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",18
1,,Logged In,Diego,M,55,Mckee,,paid,"Phoenix-Mesa-Scottsdale, AZ",PUT,Cancel,1537167593000,540,,307,1539033031000,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",32
2,,Logged In,Mason,M,9,Hart,,free,"Corpus Christi, TX",PUT,Cancel,1533157139000,174,,307,1539318918000,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",125
3,,Logged In,Alexander,M,331,Garcia,,paid,"Indianapolis-Carmel-Anderson, IN",PUT,Cancel,1536817381000,508,,307,1539375378000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,105
4,,Logged In,Kayla,F,272,Johnson,,paid,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,Cancel,1538333829000,797,,307,1539465451000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) G...,17
5,,Logged In,Molly,F,28,Harrison,,free,"Virginia Beach-Norfolk-Newport News, VA-NC",PUT,Cancel,1534255113000,843,,307,1539588790000,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",143
6,,Logged In,Alex,M,144,Hogan,,paid,"Denver-Aurora-Lakewood, CO",PUT,Cancel,1535066380000,842,,307,1539728810000,Mozilla/5.0 (Windows NT 6.2; WOW64; rv:31.0) G...,101
7,,Logged In,Davis,M,33,Wang,,paid,"Flint, MI",PUT,Cancel,1538289776000,802,,307,1539736069000,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",129
8,,Logged In,Nikolas,M,286,Olsen,,paid,"Oxnard-Thousand Oaks-Ventura, CA",PUT,Cancel,1528403713000,881,,307,1539759739000,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:31....,121
9,,Logged In,Ethan,M,175,Johnson,,paid,"Lexington-Fayette, KY",PUT,Cancel,1538080987000,934,,307,1539761830000,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",51


In [13]:
data.select("userAgent").dropDuplicates().show()


+--------------------+
|           userAgent|
+--------------------+
|"Mozilla/5.0 (Mac...|
|"Mozilla/5.0 (Win...|
|Mozilla/5.0 (X11;...|
|"Mozilla/5.0 (Mac...|
|"Mozilla/5.0 (Mac...|
|Mozilla/5.0 (Maci...|
|Mozilla/5.0 (Wind...|
|Mozilla/5.0 (Wind...|
|Mozilla/5.0 (comp...|
|"Mozilla/5.0 (Win...|
|Mozilla/5.0 (Maci...|
|"Mozilla/5.0 (Win...|
|                null|
|"Mozilla/5.0 (iPh...|
|"Mozilla/5.0 (Win...|
|Mozilla/5.0 (Wind...|
|Mozilla/5.0 (comp...|
|Mozilla/5.0 (comp...|
|"Mozilla/5.0 (Mac...|
|"Mozilla/5.0 (Mac...|
+--------------------+
only showing top 20 rows



In [15]:
from pyspark.sql.functions import isnan, count, when, col, desc, col, sort_array, asc, avg


In [17]:
data.select([count(when(isnan(c), c)).alias(c) for c in data.columns]).show(vertical = True)

-RECORD 0------------
 artist        | 0   
 auth          | 0   
 firstName     | 0   
 gender        | 0   
 itemInSession | 0   
 lastName      | 0   
 length        | 0   
 level         | 0   
 location      | 0   
 method        | 0   
 page          | 0   
 registration  | 0   
 sessionId     | 0   
 song          | 0   
 status        | 0   
 ts            | 0   
 userAgent     | 0   
 userId        | 0   

