In [1]:
# Load the application's settings (AWS credentials and S3 data location) from dl.cfg
import configparser

CONFIG_FILE = 'dl.cfg'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail fast here so a missing config file is not
# reported later as a confusing NoSectionError on the first lookup.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(
        f"Configuration file '{CONFIG_FILE}' was not found or could not be read"
    )

AWS_KEY = config.get("AWS", "KEY")
AWS_SECRET = config.get("AWS", "SECRET")
# Later cells append the table name directly to this value
# (e.g. f"{DATA_PATH}songplays"), so it presumably ends with a "/" -- verify in dl.cfg.
DATA_PATH = config.get("S3", "PROCESSED_DATA_PATH")

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [3]:
# Configure and initialize the PySpark SparkSession
spark_cfg = SparkConf()

# Pull in the hadoop-aws connector, which provides the S3A filesystem classes
# referenced below.
# NOTE(review): the hadoop-aws version presumably has to match the Hadoop
# version bundled with the local Spark build -- confirm.
spark_cfg.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")

# Hadoop (fs.s3a.*) properties are passed with the "spark.hadoop." prefix:
# Spark documents that "spark.hadoop.abc.def=xyz" adds the Hadoop property
# "abc.def=xyz", so the settings reach the underlying Hadoop configuration.

# AWS credentials, giving Spark access to S3
spark_cfg.set("spark.hadoop.fs.s3a.access.key", AWS_KEY)
spark_cfg.set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET)

# S3A filesystem settings, letting Spark read from / write to S3
spark_cfg.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark_cfg.set(
    "spark.hadoop.fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
)
spark_cfg.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "true")

spark = (
    SparkSession
    .builder
    .config(conf=spark_cfg)
    .appName('Test Queries')
    .getOrCreate()
)

In [4]:
# Smoke test: expose the "songplays" Parquet data as the SongPlays temp view,
# then preview its first 5 rows as a pandas DataFrame.
songplays_view_ddl = f'CREATE OR REPLACE TEMPORARY VIEW SongPlays USING PARQUET OPTIONS (path \"{DATA_PATH}songplays\")'
spark.sql(songplays_view_ddl)

songplays_preview = spark.sql("SELECT * FROM SongPlays LIMIT 5")
songplays_preview.toPandas()

Unnamed: 0,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent,songplay_id,year,month
0,2018-11-03 21:14:28,49,free,SOFVOQL12A6D4F7456,ARPN0Y61187B9ABAA0,195,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,1,2018,11
1,2018-11-05 17:49:42,73,paid,SOHDWWH12A6D4F7F6A,ARC0IOF1187FB3F6E6,255,"Tampa-St. Petersburg-Clearwater, FL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",2,2018,11
2,2018-11-08 15:01:57,29,paid,SOFVOQL12A6D4F7456,ARPN0Y61187B9ABAA0,372,"Atlanta-Sandy Springs-Roswell, GA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",3,2018,11
3,2018-11-09 17:55:00,80,paid,SOAOJYY12A58A7B2F9,ARFVYJI1187B9B8E13,416,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",4,2018,11
4,2018-11-09 19:57:57,36,paid,SODWXQV12A6310F10D,AR6892W1187B9AC71B,392,"Janesville-Beloit, WI","""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537....",5,2018,11


In [5]:
# Smoke test: expose the "songs" Parquet data as the Songs temp view,
# then preview its first 5 rows as a pandas DataFrame.
songs_view_ddl = f'CREATE OR REPLACE TEMPORARY VIEW Songs USING PARQUET OPTIONS (path \"{DATA_PATH}songs\")'
spark.sql(songs_view_ddl)

songs_preview = spark.sql("SELECT * FROM Songs LIMIT 5")
songs_preview.toPandas()

Unnamed: 0,song_id,title,duration,year,artist_id
0,SOBTCUI12A8AE48B70,Faust: Ballet Music (1959 Digital Remaster): V...,94.56281,0,ARSUVLW12454A4C8B8
1,SOVNKJI12A8C13CB0D,Take It To Da House (Featuring The Slip N' Sli...,227.10812,2001,ARWUNH81187FB4A3E0
2,SOYVBGZ12A6D4F92A8,Piano Sonata No. 21 in C 'Waldstein' Op. 53: I...,221.70077,0,ARLRWBW1242077EB29
3,SODBHKO12A58A77F36,Fingers Of Love (Live At Sydney Opera House) (...,335.93424,0,ARKGS2Z1187FB494B5
4,SOGXFIF12A58A78CC4,Hanging On (Medium Key Performance Track Witho...,204.06812,0,AR5LZJD1187FB4C5E5


In [6]:
# Smoke test: expose the "artists" Parquet data as the Artists temp view,
# then preview its first 5 rows as a pandas DataFrame.
artists_view_ddl = f'CREATE OR REPLACE TEMPORARY VIEW Artists USING PARQUET OPTIONS (path \"{DATA_PATH}artists\")'
spark.sql(artists_view_ddl)

artists_preview = spark.sql("SELECT * FROM Artists LIMIT 5")
artists_preview.toPandas()

Unnamed: 0,artist_id,name,location,latitude,longitude
0,ARPQ4Z01187FB3A736,Butthole Surfers,"San Antonio, TX",29.42449,-98.49462
1,ARPN0Y61187B9ABAA0,The Smiths,"Manchester, England",53.4796,-2.24881
2,ARDQKMM1187FB3F4F0,Roky Erickson,"Dallas, TX",32.77815,-96.7954
3,ARJWKMA1187B9A9D11,Lisa Germano,"Mishawaka, IN",41.66017,-86.17258
4,AR1RHCO1187B9AF0BF,Vince Guaraldi / Bola Sete,"San Francisco, CA",37.77916,-122.42005


In [7]:
# Smoke test: expose the "users" Parquet data as the Users temp view,
# then preview its first 5 rows as a pandas DataFrame.
users_view_ddl = f'CREATE OR REPLACE TEMPORARY VIEW Users USING PARQUET OPTIONS (path \"{DATA_PATH}users\")'
spark.sql(users_view_ddl)

users_preview = spark.sql("SELECT * FROM Users LIMIT 5")
users_preview.toPandas()

Unnamed: 0,user_id,first_name,last_name,gender,level
0,22,Sean,Wilson,F,free
1,28,Brantley,West,M,free
2,3,Isaac,Valdez,M,free
3,30,Avery,Watkins,F,paid
4,34,Evelin,Ayala,F,free


In [8]:
# Smoke test: expose the "time" Parquet data as the Time temp view,
# then preview its first 5 rows as a pandas DataFrame.
time_view_ddl = f'CREATE OR REPLACE TEMPORARY VIEW Time USING PARQUET OPTIONS (path \"{DATA_PATH}time\")'
spark.sql(time_view_ddl)

time_preview = spark.sql("SELECT * FROM Time LIMIT 5")
time_preview.toPandas()

Unnamed: 0,start_time,hour,day,week,weekday,year,month
0,2018-11-30 15:28:30,15,30,48,6,2018,11
1,2018-11-30 10:59:23,10,30,48,6,2018,11
2,2018-11-13 18:19:01,18,13,46,3,2018,11
3,2018-11-30 14:47:35,14,30,48,6,2018,11
4,2018-11-05 16:37:59,16,5,45,2,2018,11


**Top 10 most played songs:**

In [9]:
# Count plays per song (SongPlays joined to Songs on song_id) and keep the
# 10 songs with the highest play counts.
top_songs_query = """
SELECT  sp.song_id,
        s.title,
        COUNT(*) AS played
FROM SongPlays sp
JOIN Songs s
    ON sp.song_id = s.song_id
GROUP BY sp.song_id, s.title
ORDER BY played DESC
LIMIT 10
"""

top_songs = spark.sql(top_songs_query)
top_songs.toPandas()

Unnamed: 0,song_id,title,played
0,SOCHRXB12A8AE48069,Let's Get It Started,3
1,SOFVOQL12A6D4F7456,The Boy With The Thorn In His Side,2
2,SOXQYSC12A6310E908,Bitter Sweet Symphony,2
3,SONQBUB12A6D4F8ED0,Angie (1993 Digital Remaster),2
4,SOLRYQR12A670215BF,Panama (Remastered Album Version),1
5,SODOLVO12B0B80B2F4,Hell,1
6,SODWXQV12A6310F10D,English Summer Rain,1
7,SOHDWWH12A6D4F7F6A,I Want A New Drug,1
8,SOZCTXZ12AB0182364,Setanta matins,1
9,SOAOJYY12A58A7B2F9,Not For You,1


**Top 10 most played artists in November 2018:**

In [10]:
# Count plays per artist for November 2018: SongPlays is joined to Artists on
# artist_id and to Time on start_time, filtered on the Time view's year/month,
# and the 10 artists with the highest play counts are kept.
top_artists_query = """
SELECT  sp.artist_id,
        a.name,
        COUNT(*) AS played
FROM SongPlays sp
JOIN Artists a
    ON sp.artist_id = a.artist_id
JOIN Time t
    ON sp.start_time = t.start_time
WHERE t.year = 2018 AND t.month = 11
GROUP BY sp.artist_id, a.name
ORDER BY played DESC
LIMIT 10
"""

top_artists = spark.sql(top_artists_query)
top_artists.toPandas()

Unnamed: 0,artist_id,name,played
0,ARTDQRC1187FB4EFD4,Black Eyed Peas,3
1,ARFCUN31187B9AD578,The Rolling Stones,2
2,ARPN0Y61187B9ABAA0,The Smiths,2
3,AR0L04E1187B9AE90C,The Verve,2
4,AR5KOSW1187FB35FF4,Elena,1
5,AR3ZL6A1187B995B37,De-Phazz,1
6,ARFVYJI1187B9B8E13,Pearl Jam,1
7,ARNLO5S1187B9B80CC,Van Halen,1
8,AR6892W1187B9AC71B,Placebo,1
9,AR6XPWV1187B9ADAEB,Foo Fighters,1


**The artist with the most songs:**

In [11]:
# Count songs per artist (Artists joined to Songs on artist_id) and keep only
# the single artist with the highest song count.
most_songs_query = """
SELECT  s.artist_id,
        a.name,
        COUNT(*) AS songs_wrote
FROM Artists a
JOIN Songs s
    ON s.artist_id = a.artist_id
GROUP BY s.artist_id, a.name
ORDER BY songs_wrote DESC
LIMIT 1
"""

most_songs_artist = spark.sql(most_songs_query)
most_songs_artist.toPandas()

Unnamed: 0,artist_id,name,songs_wrote
0,ARYPTWE1187FB49D64,Polygon Window,4
