In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg
from pyspark.sql import Window
from pyspark import SparkConf, SparkContext

import datetime

import numpy as np
import pandas as pd
import configparser

import pyspark

%matplotlib inline
import matplotlib.pyplot as plt

import findspark
findspark.init()

In [19]:
spark = SparkSession \
    .builder \
    .appName("Wrangling Data") \
    .getOrCreate()

In [20]:
# Create SPark dataframe

path = "D:/OneDrive/Data Engineering/Udacity/Data Engineering/DataEngineering_Repo/Spark/sparkify_event_data.json"
user_log=spark.read.json(path)

In [21]:
# Create temporary view

user_log.createOrReplaceTempView("user_log_table")

In [22]:
spark.sql('''
        SELECT *
        FROM user_log_table
        LIMIT 2
        '''
        ).show()

+-----------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+-------------+------+-------------+--------------------+------+
|           artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|         song|status|           ts|           userAgent|userId|
+-----------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+-------------+------+-------------+--------------------+------+
|    Martin Orford|Logged In|   Joseph|     M|           20| Morales|597.55057| free|  Corpus Christi, TX|   PUT|NextSong|1532063507000|      292|Grand Designs|   200|1538352011000|"Mozilla/5.0 (Mac...|   293|
|John Brown's Body|Logged In|   Sawyer|     M|           74|  Larson|380.21179| free|Houston-The Woodl...|   PUT|NextSong|1538069638000|       97|        Bulls|

In [23]:
spark.sql('''
        SELECT COUNT(*)
        FROM user_log_table
        '''
        ).show()

+--------+
|count(1)|
+--------+
|  543705|
+--------+



In [24]:
spark.sql('''
        SELECT DISTINCT page
        FROM user_log_table
        ORDER BY page ASC  
        ''').show()

+--------------------+
|                page|
+--------------------+
|               About|
|          Add Friend|
|     Add to Playlist|
|              Cancel|
|Cancellation Conf...|
|           Downgrade|
|               Error|
|                Help|
|                Home|
|               Login|
|              Logout|
|            NextSong|
|            Register|
|         Roll Advert|
|       Save Settings|
|            Settings|
|    Submit Downgrade|
| Submit Registration|
|      Submit Upgrade|
|         Thumbs Down|
+--------------------+
only showing top 20 rows



In [25]:
# Lambda function converting timestamp to datetime

spark.udf.register("get_hour", lambda x: int(datetime.datetime.fromtimestamp(x / 1000.0).hour))

<function __main__.<lambda>(x)>

In [33]:
spark.sql('''
        SELECT *, get_hour(ts) AS hour
        FROM user_log_table
        LIMIT 1
        '''
        ).collect()

[Row(artist='Martin Orford', auth='Logged In', firstName='Joseph', gender='M', itemInSession=20, lastName='Morales', length=597.55057, level='free', location='Corpus Christi, TX', method='PUT', page='NextSong', registration=1532063507000, sessionId=292, song='Grand Designs', status=200, ts=1538352011000, userAgent='"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='293', hour='2')]

In [34]:
songs_in_hour = spark.sql('''
        SELECT get_hour(ts) AS hour, COUNT(*) as plays_per_hour
        FROM user_log_table
        WHERE page = "NextSong"
        GROUP BY hour
        ORDER BY cast(hour as int) ASC
        '''
        )

In [35]:
songs_in_hour.show()

+----+--------------+
|hour|plays_per_hour|
+----+--------------+
|   0|         18758|
|   1|         17657|
|   2|         17425|
|   3|         16569|
|   4|         15862|
|   5|         15420|
|   6|         15386|
|   7|         15130|
|   8|         15116|
|   9|         15016|
|  10|         15258|
|  11|         15427|
|  12|         16021|
|  13|         16751|
|  14|         17773|
|  15|         19025|
|  16|         21088|
|  17|         22255|
|  18|         22204|
|  19|         22564|
+----+--------------+
only showing top 20 rows



In [37]:
songs_in_hour_pd = songs_in_hour.toPandas()
print(songs_in_hour_pd)

   hour  plays_per_hour
0     0           18758
1     1           17657
2     2           17425
3     3           16569
4     4           15862
5     5           15420
6     6           15386
7     7           15130
8     8           15116
9     9           15016
10   10           15258
11   11           15427
12   12           16021
13   13           16751
14   14           17773
15   15           19025
16   16           21088
17   17           22255
18   18           22204
19   19           22564
20   20           21809
21   21           20572
22   22           20392
23   23           19399
