In [1]:
import configparser
from datetime import datetime
import os
import pandas as pd
import calendar
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, year, month, dayofmonth, hour, weekofyear, date_format, monotonically_increasing_id

In [2]:
# Load the AWS credentials from the local dl.cfg file and publish them as
# environment variables for the rest of the notebook.
config = configparser.ConfigParser()
config.read('dl.cfg')

# Both keys live in the [default] section and are copied under the same name.
_aws_section = config['default']
for _aws_key in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    os.environ[_aws_key] = _aws_section[_aws_key]

In [3]:
def create_spark_session():
    """Return the SparkSession used by the ETL steps.

    The session is obtained through ``SparkSession.builder.getOrCreate()``
    with ``spark.jars.packages`` set to the hadoop-aws 2.7.4 artifact.

    Returns:
        The SparkSession produced by the builder.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.4"
    )
    return builder.getOrCreate()

In [4]:
def process_song_data(spark, input_data, output_data):
    """Build the songs and artists tables from the raw song JSON files.

    Reads every file matching ``song_data/A/*/*/*.json`` under ``input_data``,
    writes both tables as parquet under ``output_data`` (overwriting any
    previous output) and registers the raw song DataFrame as the temp view
    ``songsData``, which ``process_log_data`` queries later.

    Args:
        spark: session object used to read the JSON files.
        input_data: root path/URI that contains the ``song_data`` directory.
        output_data: root path/URI under which ``songs`` and ``artists`` are
            written. A trailing slash is optional.

    Returns:
        None. The results are the parquet datasets and the temp view.
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/A/*/*/*.json")

    # read song data file
    print("    Reading song data file")
    df = spark.read.json(song_data)

    # write songs table to parquet files partitioned by year and artist
    print("    Writing songs table to parquet files")
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration")
    songs_table.write.mode("overwrite") \
        .partitionBy("year", "artist_id") \
        .parquet(os.path.join(output_data, "songs"))

    # write artists table to parquet files
    print("    Writing artists table to parquet files")
    # The source has one row per song, so an artist with several songs would
    # otherwise be written several times; keep a single row per artist_id.
    artists_table = df.select(
        "artist_id", "artist_name", "artist_location",
        "artist_latitude", "artist_longitude",
    ).dropDuplicates(["artist_id"])
    artists_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, "artists"))

    # expose the raw song rows to later SQL queries in this session
    df.createOrReplaceTempView("songsData")

In [5]:
def process_log_data(spark, input_data, output_data):
    """Build the users, time and songplays tables from the raw event logs.

    Reads every file matching ``log_data/*/*/*.json`` under ``input_data`` and
    writes three parquet datasets under ``output_data`` (each overwriting any
    previous output):

    * ``users``     - distinct user rows taken from all log events.
    * ``time``      - distinct start times of ``NextSong`` events with their
                      calendar parts, partitioned by year and month.
    * ``songplays`` - ``NextSong`` events matched against the ``songsData``
                      temp view, partitioned by year and month.

    The ``songsData`` temp view must already be registered on ``spark``
    (``process_song_data`` registers it).

    Args:
        spark: session object used to read the logs and run the SQL query.
        input_data: root path/URI that contains the ``log_data`` directory.
        output_data: root path/URI under which the three tables are written.
            A trailing slash is optional.

    Returns:
        None. The results are the parquet datasets.
    """
    # get filepath to log data file, relative to input_data so the same code
    # runs against a local folder or a remote bucket
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    print("    Reading log data file")
    df = spark.read.json(log_data)

    # write users table to parquet files (built from every log event)
    print("    Writing users table to parquet files")
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender",
        "level",
    ).distinct()
    users_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, "users"))

    # filter by actions for song plays; the time and songplays tables are
    # derived from these events only
    plays = df.filter(col("page") == "NextSong")

    # create start_time from the original ts column: ts is divided by 1000
    # and truncated to whole seconds before being read as an epoch timestamp
    plays = plays.withColumn(
        "start_time", (col("ts") / 1000).cast("long").cast("timestamp")
    )

    # extract columns to create time table; Spark's built-in date functions
    # yield properly typed columns (untyped UDFs would store every value as a
    # string)
    plays = (
        plays
        .withColumn("hour", hour(col("start_time")))
        .withColumn("day", dayofmonth(col("start_time")))
        .withColumn("week", weekofyear(col("start_time")))                # week number
        .withColumn("month", month(col("start_time")))
        .withColumn("year", year(col("start_time")))
        .withColumn("weekday", date_format(col("start_time"), "EEEE"))    # day name
    )

    # write time table to parquet files partitioned by year and month
    print("    Writing time table to parquet files")
    time_table = plays.select(
        "start_time", "hour", "day", "week", "month", "year", "weekday"
    ).distinct()
    time_table.write.mode("overwrite") \
        .partitionBy("year", "month") \
        .parquet(os.path.join(output_data, "time"))

    # read in song data to use for songplays table
    song_df = spark.sql(
        "SELECT DISTINCT song_id, title, artist_id, artist_name FROM songsData"
    )

    # extract columns from joined song and log datasets to create songplays
    # table. Matching on the artist alone would pair every play with every
    # song by that artist, so the track title has to match as well.
    # NOTE(review): assumes each log event carries the track title in a
    # `song` column - confirm against the log schema.
    same_track = (plays["song"] == song_df["title"]) & \
                 (plays["artist"] == song_df["artist_name"])
    songplays_table = plays.join(song_df, same_track, "inner") \
        .distinct() \
        .select(col("start_time"), col("userId"), col("level"), col("sessionId"),
                col("location"), col("userAgent"), col("song_id"), col("artist_id"),
                col("year"), col("month")) \
        .withColumn("songplay_id", monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    print("    Writing songplays table to parquet files")
    songplays_table.write.mode("overwrite") \
        .partitionBy("year", "month") \
        .parquet(os.path.join(output_data, "songplays"))

# Ad-hoc look at the song-play events in the local log files.
# Reuse the shared session factory instead of repeating the builder chain.
spark = create_spark_session()

log_data = "data/log_data/*/*/*.json"

# read log data file
print ("    Reading log data file") 
df = spark.read.json(log_data)
dfNextSong = df.filter(df.page == "NextSong")
# Preview a handful of rows only: converting the whole DataFrame to pandas
# pulls every matching event onto the driver and floods the cell output.
dfNextSong.limit(5).toPandas()

In [6]:
# Obtain the Spark session and report how long that took.
print ("1. Creating/Getting Spark session")
start_time = time.time()
spark = create_spark_session()
print("--- It took %s seconds ---" % (time.time() - start_time))

# Switch input_data to the S3 bucket to run against the full dataset.
#input_data = "s3a://udacity-dend/"
input_data = "data/"
output_data = "data/parquets/"

# Run both ETL stages in order, timing each one separately.
_stages = (
    ("Starting SONG data processing", process_song_data),
    ("Starting LOG data processing", process_log_data),
)
for _banner, _stage in _stages:
    start_time = time.time()
    print (_banner)
    _stage(spark, input_data, output_data)
    print("--- It took %s seconds ---" % (time.time() - start_time))

Creating/Getting Spark session
Starting song data processing
    Reading song data file
    Writing songs table to parquet files
    Writing artists table to parquet files
Starting LOG data processing
    Reading log data file
    Writing users table to parquet files
    Writing time table to parquet files
    Writing songplays table to parquet files
