In [90]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
import datetime

In [31]:
spark = SparkSession \
                    .builder \
                    .master("local") \
                    .appName("SparkPractice3") \
                    .getOrCreate()

In [32]:
spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.driver.host', 'host.docker.internal'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.app.id', 'local-1571642713077'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.port', '56394'),
 ('spark.submit.deployMode', 'client'),
 ('spark.eventLog.dir', 'file:/tmp/spark-events'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'SparkPractice3')]

In [33]:
spark

In [34]:
path = r".\songs_events.json"

In [35]:
df = spark.read.json(path)

In [36]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [37]:
df.describe()

DataFrame[summary: string, artist: string, auth: string, firstName: string, gender: string, itemInSession: string, lastName: string, length: string, level: string, location: string, method: string, page: string, registration: string, sessionId: string, song: string, status: string, ts: string, userAgent: string, userId: string]

In [38]:
df.show(n=1)

+------------+---------+---------+------+-------------+--------+--------+-----+--------------------+------+--------+-----------------+---------+---------+------+-------------+--------------------+------+
|      artist|     auth|firstName|gender|itemInSession|lastName|  length|level|            location|method|    page|     registration|sessionId|     song|status|           ts|           userAgent|userId|
+------------+---------+---------+------+-------------+--------+--------+-----+--------------------+------+--------+-----------------+---------+---------+------+-------------+--------------------+------+
|Miami Horror|Logged In|     Kate|     F|           88| Harrell|250.8273| paid|Lansing-East Lans...|   PUT|NextSong|1.540472624796E12|      293|Sometimes|   200|1541548876796|"Mozilla/5.0 (X11...|    97|
+------------+---------+---------+------+-------------+--------+--------+-----+--------------------+------+--------+-----------------+---------+---------+------+-------------+---------

In [39]:
df.take(1)

[Row(artist='Miami Horror', auth='Logged In', firstName='Kate', gender='F', itemInSession=88, lastName='Harrell', length=250.8273, level='paid', location='Lansing-East Lansing, MI', method='PUT', page='NextSong', registration=1540472624796.0, sessionId=293, song='Sometimes', status=200, ts=1541548876796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"', userId='97')]

In [40]:
df.describe("artist").show()

+-------+---------+
|summary|   artist|
+-------+---------+
|  count|      972|
|   mean|     null|
| stddev|     null|
|    min|      !!!|
|    max|zebrahead|
+-------+---------+



In [41]:
df.describe("sessionID").show()

+-------+------------------+
|summary|         sessionID|
+-------+------------------+
|  count|              1117|
|   mean|488.28558639212173|
| stddev|157.11506381542455|
|    min|                 8|
|    max|               674|
+-------+------------------+



In [42]:
df.count()

1117

In [43]:
df.select("page").dropDuplicates().sort("page").show()

+-------------+
|         page|
+-------------+
|        About|
|    Downgrade|
|         Help|
|         Home|
|        Login|
|       Logout|
|     NextSong|
|Save Settings|
|     Settings|
|      Upgrade|
+-------------+



In [75]:
df.select(["userId", "firstname", "page", "level", "song"]).where(df.userId == "80").collect()

[Row(userId='80', firstname='Tegan', page='Home', level='free', song=None),
 Row(userId='80', firstname='Tegan', page='NextSong', level='free', song="Baby I'm Yours"),
 Row(userId='80', firstname='Tegan', page='Home', level='paid', song=None),
 Row(userId='80', firstname='Tegan', page='NextSong', level='paid', song='Best Of Both Worlds (Remastered Album Version)'),
 Row(userId='80', firstname='Tegan', page='NextSong', level='paid', song='Call Me If You Need Me'),
 Row(userId='80', firstname='Tegan', page='NextSong', level='paid', song='Home'),
 Row(userId='80', firstname='Tegan', page='NextSong', level='paid', song='OMG'),
 Row(userId='80', firstname='Tegan', page='Home', level='paid', song=None),
 Row(userId='80', firstname='Tegan', page='NextSong', level='paid', song='Candle On The Water'),
 Row(userId='80', firstname='Tegan', page='NextSong', level='paid', song='Our Song'),
 Row(userId='80', firstname='Tegan', page='NextSong', level='paid', song='Baby Boy [feat. Beyonce]'),
 Row(use

In [76]:
get_hour = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000.0).hour)

In [77]:
df = df.withColumn("hour", get_hour(df.ts))

In [78]:
df.head()

Row(artist='Miami Horror', auth='Logged In', firstName='Kate', gender='F', itemInSession=88, lastName='Harrell', length=250.8273, level='paid', location='Lansing-East Lansing, MI', method='PUT', page='NextSong', registration=1540472624796.0, sessionId=293, song='Sometimes', status=200, ts=1541548876796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"', userId='97', hour='3')

In [79]:
songs_in_hour = df.filter(df.page == "NextSong") \
                    .groupby(df.hour) \
                    .count() \
                    .orderBy(df.hour.cast("float"))

In [80]:
songs_in_hour.show()

+----+-----+
|hour|count|
+----+-----+
|   0|   43|
|   1|   34|
|   2|   29|
|   3|   27|
|   4|    5|
|   5|    2|
|   6|    6|
|   8|   10|
|   9|   15|
|  10|   13|
|  11|   12|
|  12|   30|
|  13|   55|
|  14|   63|
|  15|   44|
|  16|   50|
|  17|   70|
|  18|   61|
|  19|   86|
|  20|   78|
+----+-----+
only showing top 20 rows



In [81]:
song_in_hour_pd = songs_in_hour.toPandas()

In [82]:
song_in_hour_pd.head()

Unnamed: 0,hour,count
0,0,43
1,1,34
2,2,29
3,3,27
4,4,5


In [83]:
df_valid = df.dropna(how = "any", subset = ["userId", "sessionId"])

In [84]:
df_valid.count()

1117

In [85]:
df_valid = df_valid.filter(df_valid["userId"] != "")

In [86]:
df_valid.count()

1089

In [None]:
# beofre and after specific event


In [91]:
flag_downgrade_event = udf(lambda x: 1 if x == "Downgrade" else 0, IntegerType())

In [92]:
df_valid = df_valid.withColumn("downgraded", flag_downgrade_event("page"))

In [94]:
df_valid.head()

Row(artist='Miami Horror', auth='Logged In', firstName='Kate', gender='F', itemInSession=88, lastName='Harrell', length=250.8273, level='paid', location='Lansing-East Lansing, MI', method='PUT', page='NextSong', registration=1540472624796.0, sessionId=293, song='Sometimes', status=200, ts=1541548876796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"', userId='97', hour='3', downgraded=0)

In [95]:
spark.stop()