# Configuration

In [33]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, to_date

In [34]:
config = configparser.ConfigParser()
config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

create spark

In [35]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [36]:
# using test datasets
input_data='/home/workspace/data/'
output_data = "s3a://awsbucketofmxy/"

# Process song_data

In [33]:
# get filepath to song data file
song_data = input_data + "song-data/A/*/*/*.json"

In [41]:
# read song data file
df_song = spark.read.json(song_data)

print(df_song.count())
df_song.printSchema()

71
root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [42]:
df_song.createOrReplaceTempView("song_data")

In [43]:
# extract columns to create songs table
songs_table = spark.sql("""
    select 
        distinct (song_id) AS song_id, 
        title AS title,
        artist_id AS artist_id,
        year AS year,
        duration AS duration
    from song_data
    where song_id IS NOT NULL
""")
songs_table.show(5)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGNCJP12A58A80271|Do You Finally Ne...|ARB29H41187B98F0EF|1972|342.56934|
|SOOJPRH12A8C141995|   Loaded Like A Gun|ARBGXIG122988F409D|   0|173.19138|
|SOFCHDR12AB01866EF|         Living Hell|AREVWGE1187B9B890A|   0|282.43546|
|SOWTBJW12AC468AC6E|Broken-Down Merry...|ARQGYP71187FB44566|   0|151.84934|
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



In [50]:
# write songs table to parquet files partitioned by year and artist
songs_table.write.parquet(output_data + 'songs', mode="overwrite")

In [47]:
# extract columns to create artists table
artists_table = spark.sql("""
    select 
        distinct (artist_id) AS artist_id, 
        artist_name AS name,
        artist_location AS location,
        artist_latitude AS lattitude,
        artist_longitude AS longitude
    from song_data
    where artist_id IS NOT NULL
""")
artists_table.show(5)

+------------------+------------+---------------+---------+----------+
|         artist_id|        name|       location|lattitude| longitude|
+------------------+------------+---------------+---------+----------+
|ARPBNLO1187FB3D52F|    Tiny Tim|   New York, NY| 40.71455| -74.00712|
|ARBEBBY1187B9B43DB|   Tom Petty|Gainesville, FL|     null|      null|
|AR0IAWL1187B9A96D0|Danilo Perez|         Panama|   8.4177| -80.11278|
|ARMBR4Y1187B9990EB|David Martin|California - SF| 37.77916|-122.42005|
|ARD0S291187B9B7BF5|     Rated R|           Ohio|     null|      null|
+------------------+------------+---------------+---------+----------+
only showing top 5 rows



In [49]:
# write artists table to parquet files
artists_table.write.parquet(output_data + 'artist', mode="overwrite")

# Process log data

In [37]:
log_data = input_data + "log-data/*.json"

In [38]:
# read log data file
df_log = spark.read.json(log_data)

print(df_log.count())
df_log.printSchema()

8056
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [39]:
# filter by actions for song plays
df_log = df_log.filter(df_log.page == 'NextSong')

In [13]:
df_log.createOrReplaceTempView("log_data")

In [25]:
# extract columns for users table    
users_table=spark.sql("""
    select 
        distinct (userId) AS user_id, 
        firstName AS first_name,
        lastName AS last_name,
        gender AS gender,
        level AS level
    from log_data
    where userId IS NOT NULL
""")
users_table.show(5)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     98|    Jordyn|   Powell|     F| free|
|     34|    Evelin|    Ayala|     F| free|
|     85|   Kinsley|    Young|     F| paid|
|     38|    Gianna|    Jones|     F| free|
|     85|   Kinsley|    Young|     F| free|
+-------+----------+---------+------+-----+
only showing top 5 rows



In [31]:
# write users table to parquet files
users_table.write.parquet(output_data + 'users', mode="overwrite")

In [21]:
df_log.select('ts').show()

+-------------+
|           ts|
+-------------+
|1542241826796|
|1542242481796|
|1542242741796|
|1542253449796|
|1542260935796|
|1542261224796|
|1542261356796|
|1542261662796|
|1542262057796|
|1542262233796|
|1542262434796|
|1542262456796|
|1542262679796|
|1542262728796|
|1542262893796|
|1542263158796|
|1542263378796|
|1542265716796|
|1542265929796|
|1542266927796|
+-------------+
only showing top 20 rows



In [40]:
# create timestamp column from original timestamp column
get_timestamp = udf(lambda x: int(int(x)/1000))
df_log =df_log.withColumn('timestamp', get_timestamp(df_log.ts))

df_log.select('timestamp').drop_duplicates().show(5)


+----------+
| timestamp|
+----------+
|1542210202|
|1541437215|
|1541440975|
|1542130757|
|1542135080|
+----------+
only showing top 5 rows



In [None]:
# create datetime column from original timestamp column
get_datetime = udf(lambda x: datetime.fromtimestamp(x))
df_log = df_log.withColumn('datetime', to_date(df_log.timestamp))

df_log.select('timestamp','datetime').drop_duplicates().show(5)

In [16]:
# extract columns to create time table
time_table = df_log.select('timestamp','datetime').drop_duplicates().alias('start_time') \
                            .withColumn("hour",hour('timest'))
time_table.show()

+----------+--------------------+----+
| timestamp|            datetime|hour|
+----------+--------------------+----+
|1542268264|java.util.Gregori...|null|
|1542276660|java.util.Gregori...|null|
|1542292689|java.util.Gregori...|null|
|1542308546|java.util.Gregori...|null|
|1542167868|java.util.Gregori...|null|
|1542185785|java.util.Gregori...|null|
|1542186006|java.util.Gregori...|null|
|1542218233|java.util.Gregori...|null|
|1541429782|java.util.Gregori...|null|
|1541430369|java.util.Gregori...|null|
|1541438926|java.util.Gregori...|null|
|1543548766|java.util.Gregori...|null|
|1543578383|java.util.Gregori...|null|
|1543586502|java.util.Gregori...|null|
|1542679534|java.util.Gregori...|null|
|1542743971|java.util.Gregori...|null|
|1543033890|java.util.Gregori...|null|
|1543489639|java.util.Gregori...|null|
|1543509693|java.util.Gregori...|null|
|1543515182|java.util.Gregori...|null|
+----------+--------------------+----+
only showing top 20 rows



In [None]:
# write time table to parquet files partitioned by year and month
time_table

In [None]:
# read in song data to use for songplays table
song_df = 

In [None]:
# extract columns from joined song and log datasets to create songplays table 
songplays_table = 

In [None]:
# write songplays table to parquet files partitioned by year and month
songplays_table