In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
import logging
# Log format: timestamp, source file and line, level, message.
logging.basicConfig(
    format='[ %(asctime)s ] %(filename)s(%(lineno)d) %(levelname)s - %(message)s',
    level=logging.INFO
)

# AWS credentials are expected in the [AWS] section of dl.cfg, which sits next
# to the notebook.
config = configparser.ConfigParser()
# ConfigParser.read() does not raise on a missing file; it just returns the list
# of files it managed to parse. Keep that list so the warning below can say
# what (if anything) was loaded.
loaded_files = config.read('dl.cfg')

if config.has_section('AWS'):
    # Export the credentials through the environment for the AWS/S3 tooling.
    os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
    os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']
else:
    # Previously this surfaced as an opaque KeyError('AWS'). The notebook can
    # still run against local data, so warn instead of aborting.
    logging.warning(
        "No [AWS] section found (config files read: %s); "
        "AWS credentials were not exported.",
        loaded_files or "none",
    )


In [2]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
def create_spark_session(master="local[4]", app_name="sparkify-ETL"):
    """Build (or reuse) the SparkSession used by this notebook.

    Args:
        master: Spark master URL. Defaults to ``"local[4]"``, the value that
            was previously hard-coded.
        app_name: Application name for the session. Defaults to
            ``"sparkify-ETL"``.

    Returns:
        The SparkSession returned by ``getOrCreate()``. If a session already
        exists it is returned as-is, so ``master`` and ``app_name`` are only
        expected to matter for the first call in a kernel.
    """
    builder = (
        SparkSession.builder
        .master(master)
        .appName(app_name)
        # hadoop-aws supplies the S3A connector; presumably needed for the
        # s3a:// locations this project can point at.
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
        # -1 switches off automatic broadcast joins.
        .config("spark.sql.autoBroadcastJoinThreshold", -1)
    )
    return builder.getOrCreate()


In [4]:
# Start (or reuse) the Spark session configured by create_spark_session().
spark = create_spark_session()
# Alternative S3 locations, currently disabled:
# input_data = "s3a://udacity-dend/"
# output_data = "s3a://udacity-lesson3-project-bucket/"
# Base location of the parquet tables loaded further down (a local directory here).
output_data = "./output_data/"

In [61]:
def _register_view(parquet_path, view_name):
    """Read a parquet dataset and expose it to Spark SQL under ``view_name``.

    Args:
        parquet_path: Location of the parquet dataset to read.
        view_name: Name of the temporary view to create or replace.

    Returns:
        The DataFrame that was read, for direct use in later cells.
    """
    frame = spark.read.parquet(parquet_path)
    frame.createOrReplaceTempView(view_name)
    return frame


# Fact table first, then the four dimension tables; each one is also
# registered as a temp view so the SQL cells can query it by name.
songplays_df = _register_view(f"{output_data}songplays_table.parquet", "songplays")
songs_df = _register_view(f"{output_data}songs_table.parquet", "songs")
artists_df = _register_view(f"{output_data}artists_table.parquet", "artists")
users_df = _register_view(f"{output_data}users_table.parquet", "users")
time_df = _register_view(f"{output_data}time_table.parquet", "time")

In [43]:
# Preview only a handful of rows: toPandas() collects its whole input onto the
# driver, so cap the row count on the Spark side before converting.
songplays_df.limit(5).toPandas()

Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
0,0,,15,paid,SOZCTXZ12AB0182364,AR5KOSW1187FB35FF4,818,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5..."
1,1,,10,free,SOGDBUF12A8C140FAA,AR558FS1187FB45658,484,"Washington-Arlington-Alexandria, DC-VA-MD-WV","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
2,2,,24,paid,SOGDBUF12A8C140FAA,AR558FS1187FB45658,672,"Lake Havasu City-Kingman, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK..."
3,3,,80,paid,SOGDBUF12A8C140FAA,AR558FS1187FB45658,992,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."


### Get the total number of users

In [8]:
# Count the rows of the `users` view (assumes one row per user - TODO confirm).
user_count_sql = """
    SELECT COUNT(*)
    FROM users
"""
spark.sql(user_count_sql).toPandas()

Unnamed: 0,count(1)
0,96


### Get the top songs

In [12]:
# Ten most-played songs. song_id is a secondary sort key so that songs with
# equal play counts come back in a stable order and the LIMIT cut-off is
# reproducible between runs.
spark.sql("""
SELECT s.song_id, s.title, COUNT(*) as num_listens
FROM songplays AS sp
JOIN songs AS s ON s.song_id = sp.song_id
GROUP BY s.song_id, s.title
ORDER BY num_listens DESC, s.song_id
LIMIT 10
""").toPandas()

Unnamed: 0,song_id,title,num_listens
0,SOGDBUF12A8C140FAA,Intro,3
1,SOZCTXZ12AB0182364,Setanta matins,1


### Get the top artists

In [16]:
# Ten most-played artists. artist_id is a secondary sort key so that artists
# with equal play counts come back in a stable order and the LIMIT cut-off is
# reproducible between runs.
spark.sql("""
SELECT a.artist_id, a.name, COUNT(*) as num_listens
FROM songplays AS sp
JOIN artists AS a ON a.artist_id = sp.artist_id
GROUP BY a.artist_id, a.name
ORDER BY num_listens DESC, a.artist_id
LIMIT 10
""").toPandas()

Unnamed: 0,artist_id,name,num_listens
0,AR558FS1187FB45658,40 Grit,3
1,AR5KOSW1187FB35FF4,Elena,1


### Number of song listens per month in 2018

In [62]:
# Plays per month of 2018, busiest month first (month number breaks ties).
# There is deliberately no LIMIT: a year has up to 12 months, so the earlier
# LIMIT 10 could silently drop the two quietest ones.
# NOTE(review): assumes `time` holds one row per start_time; duplicate rows
# there would multiply the play counts through the join - confirm.
spark.sql("""
SELECT t.year, t.month, COUNT(*) as num_listens
FROM songplays AS sp
JOIN time AS t ON t.start_time = sp.start_time
WHERE t.year = 2018
GROUP BY t.year, t.month
ORDER BY num_listens DESC, t.month
""").toPandas()

Unnamed: 0,year,month,num_listens
0,2018,11,4
