In [28]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, monotonically_increasing_id
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
# Parse the local `dl.cfg` settings file; the AWS values read below come from it.
config = configparser.ConfigParser()
# `read` returns the list of files it managed to parse, shown as the cell output.
config.read(['dl.cfg'])

['dl.cfg']

In [3]:
# Copy each AWS setting from the [AWS] section of the config into the
# process environment under the same name.
for aws_key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION'):
    os.environ[aws_key] = config['AWS'][aws_key]

In [4]:
# Create (or reuse) the Spark session, requesting the hadoop-aws package
# so that `s3a://` paths can be used.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [5]:
# S3 location of the song data to load. This points at a single JSON file,
# so the tables built from it below contain just one song.
song_data = "s3a://udacity-dend/song_data/A/A/B/TRAABCL128F4286650.json"

In [6]:
# Read the song JSON into a DataFrame. `df` is reused for the log data later
# in the notebook, so the song-derived tables must be built before that cell runs.
df = spark.read.json(song_data)

In [7]:
# Print the song DataFrame as a text table to inspect the columns read from the JSON.
df.show()

+------------------+---------------+---------------+----------------+------------+---------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|artist_location|artist_longitude| artist_name| duration|num_songs|           song_id|               title|year|
+------------------+---------------+---------------+----------------+------------+---------+---------+------------------+--------------------+----+
|ARC43071187B990240|           null|     Wisner, LA|            null|Wayne Watson|245.21098|        1|SOKEJEJ12A8C13E0D0|The Urgency (LP V...|   0|
+------------------+---------------+---------------+----------------+------------+---------+---------+------------------+--------------------+----+



In [8]:
# Expose the song data as the `songs` view, then keep only the song-level
# columns with duplicate rows removed.
df.createOrReplaceTempView("songs")
song_columns = ["song_id", "title", "artist_id", "year", "duration"]
songs_table = df.select(*song_columns).distinct()

In [9]:
# Print the songs table to check the selected columns.
songs_table.show()

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOKEJEJ12A8C13E0D0|The Urgency (LP V...|ARC43071187B990240|   0|245.21098|
+------------------+--------------------+------------------+----+---------+



In [10]:
# Persist the songs table as parquet, partitioned by year and artist,
# replacing whatever is already stored at the target path.
(
    songs_table.write
    .mode("overwrite")
    .partitionBy("year", "artist_id")
    .parquet("s3a://udacity-store/songs.parquet")
)

In [11]:
# Expose the song data as the `artists` view, then project the artist
# attributes (dropping the `artist_` prefix from their names) and de-duplicate.
df.createOrReplaceTempView("artists")
artist_columns = [
    col("artist_id"),
    col("artist_name").alias("name"),
    col("artist_location").alias("location"),
    col("artist_latitude").alias("latitude"),
    col("artist_longitude").alias("longitude"),
]
artists_table = df.select(*artist_columns).distinct()

In [12]:
# Print the artists table to check the renamed columns.
artists_table.show()

+------------------+------------+----------+--------+---------+
|         artist_id|        name|  location|latitude|longitude|
+------------------+------------+----------+--------+---------+
|ARC43071187B990240|Wayne Watson|Wisner, LA|    null|     null|
+------------------+------------+----------+--------+---------+



In [13]:
# Persist the artists table as parquet, replacing any existing output.
artists_writer = artists_table.write.mode("overwrite")
artists_writer.parquet("s3a://udacity-store/artists.parquet")

In [14]:
# S3 location of the event log to load: a single JSON events file.
log_data = "s3a://udacity-dend/log_data/2018/11/2018-11-01-events.json"

In [15]:
# Read the event log JSON into a DataFrame.
# NOTE: this rebinds `df`, which previously held the song data; from here on
# `df` refers to log events.
df = spark.read.json(log_data)

In [16]:
# Print only the first event row to inspect the log columns.
df.show(1)

+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
|artist|     auth|firstName|gender|itemInSession|lastName|length|level|            location|method|page|     registration|sessionId|song|status|           ts|           userAgent|userId|
+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
|  null|Logged In|   Walter|     M|            0|    Frye|  null| free|San Francisco-Oak...|   GET|Home|1.540919166796E12|       38|null|   200|1541105830796|"Mozilla/5.0 (Mac...|    39|
+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
only showing top 1 row



In [17]:
# Expose the raw events as the `stg_events` view and keep only the rows
# whose page is 'NextSong', i.e. the song-play actions.
df.createOrReplaceTempView("stg_events")
is_song_play = col("page") == "NextSong"
songplays_table = df.filter(is_song_play)

In [18]:
# Print the filtered events to confirm only 'NextSong' rows remain.
songplays_table.show()

+--------------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|              artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|                song|status|           ts|           userAgent|userId|
+--------------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|             Des'ree|Logged In|   Kaylee|     F|            1| Summers|246.30812| free|Phoenix-Mesa-Scot...|   PUT|NextSong|1.540344794796E12|      139|        You Gotta Be|   200|1541106106796|"Mozilla/5.0 (Win...|     8|
|             Mr Oizo|Logged In|   Kaylee|     F|            3| Summers|144.03873| free|Phoenix-Mesa-Sco

In [19]:
# Expose the events as the `users` view, then keep the distinct combinations
# of the user attributes found in the log.
df.createOrReplaceTempView("users")
user_columns = ["userId", "firstName", "lastName", "gender", "level"]
users_table = df.select(*user_columns).distinct()

In [20]:
# Print the users table to check the distinct user rows.
users_table.show()

+------+---------+--------+------+-----+
|userId|firstName|lastName|gender|level|
+------+---------+--------+------+-----+
|    39|   Walter|    Frye|     M| free|
|     8|   Kaylee| Summers|     F| free|
|    10|   Sylvie|    Cruz|     F| free|
|    26|     Ryan|   Smith|     M| free|
|   101|   Jayden|     Fox|     M| free|
+------+---------+--------+------+-----+



In [21]:
# Persist the users table as parquet, replacing any existing output.
users_writer = users_table.write.mode("overwrite")
users_writer.parquet("s3a://udacity-store/users.parquet")

In [51]:
# Add `time_stamp`: the millisecond `ts` value reduced to whole seconds,
# stored as a string.
def _ms_to_seconds_str(ms):
    """Return the epoch-millisecond value `ms` as a whole-seconds string."""
    seconds = int(ms) / 1000
    return str(int(seconds))


get_timestamp = udf(_ms_to_seconds_str)
df = df.withColumn('time_stamp', get_timestamp(col('ts')))

In [None]:
# Add `start_time`: the millisecond `ts` value rendered as datetime text.
# NOTE: this rebinds `get_timestamp`, replacing any earlier definition.
def _ms_to_datetime_str(ms):
    """Return the text form of the datetime for the epoch-millisecond value `ms`."""
    return str(datetime.fromtimestamp(int(ms) / 1000.0))


get_timestamp = udf(_ms_to_datetime_str)
df = df.withColumn("start_time", get_timestamp(col("ts")))

In [None]:
# Build the time dimension: one row per distinct event timestamp, with its
# calendar parts.
#
# The key `start_time` must be the raw millisecond `ts`, because the songplays
# table uses `events.ts` as its `start_time` and is joined to this table on
# equality of the two. Keying on the seconds-based `time_stamp` string would
# make that inner join return no rows.
#
# The datetime text column is read as `event_time` inside a subquery so that the
# calendar functions unambiguously operate on it rather than on the output alias
# `start_time`.
df.createOrReplaceTempView("time")
time_table = spark.sql("""select distinct ts as start_time,
                                        hour(event_time) as hour,
                                        day(event_time) as day,
                                        weekofyear(event_time) as week,
                                        month(event_time) as month,
                                        year(event_time) as year,
                                        dayofweek(event_time) as weekday
                           from (select ts, start_time as event_time from time) events
                       """)

In [52]:
# Print the time table to check the derived calendar columns.
time_table.show()

+-------------+----+---+----+-----+----+-------+
|   start_time|hour|day|week|month|year|weekday|
+-------------+----+---+----+-----+----+-------+
|1541106352796|  21|  1|  44|   11|2018|      5|
|1541107734796|  21|  1|  44|   11|2018|      5|
|1541105830796|  20|  1|  44|   11|2018|      5|
|1541106496796|  21|  1|  44|   11|2018|      5|
|1541109015796|  21|  1|  44|   11|2018|      5|
|1541107493796|  21|  1|  44|   11|2018|      5|
|1541110994796|  22|  1|  44|   11|2018|      5|
|1541109325796|  21|  1|  44|   11|2018|      5|
|1541106106796|  21|  1|  44|   11|2018|      5|
|1541108520796|  21|  1|  44|   11|2018|      5|
|1541109125796|  21|  1|  44|   11|2018|      5|
|1541107053796|  21|  1|  44|   11|2018|      5|
|1541106132796|  21|  1|  44|   11|2018|      5|
|1541106673796|  21|  1|  44|   11|2018|      5|
+-------------+----+---+----+-----+----+-------+



In [24]:
# Persist the time table as parquet, partitioned by year and month,
# replacing any existing output.
(
    time_table.write
    .mode("overwrite")
    .partitionBy("year", "month")
    .parquet("s3a://udacity-store/time.parquet")
)

In [25]:
# Load the songs parquet written earlier and expose it as the `stg_songs`
# view for the songplays join.
song_df = spark.read.format("parquet").load("s3a://udacity-store/songs.parquet")
song_df.createOrReplaceTempView("stg_songs")

In [49]:
# Build the songplays fact rows by matching 'NextSong' events to songs.
#
# An event is matched to a song on BOTH the title and the track length.
# Titles alone are not unique, so joining on title only would attribute a play
# to every same-titled song. `stg_songs` carries no artist name, so the
# duration is the extra discriminator available here.
#
# `start_time` is the raw millisecond `ts` of the event.

songplays_table = spark.sql("""select
                    events.ts start_time,
                    events.userId user_id,
                    events.level,
                    songs.song_id,
                    songs.artist_id,
                    events.sessionId session_id,
                    events.location,
                    events.userAgent user_agent
                    from stg_events events inner join stg_songs songs
                    on events.song=songs.title
                    and events.length=songs.duration
                    where events.page='NextSong'
"""
)

In [71]:
# Add the `songplay_id` key column, generated by monotonically_increasing_id().
# `songplays_table` is rebound to the extended DataFrame.
songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id())

In [72]:
# Print the songplays rows to check the generated `songplay_id` values.
songplays_table.show()

+-------------+-------+-----+----------+--------------------+--------------------+-----------+
|   start_time|user_id|level|session_id|            location|          user_agent|songplay_id|
+-------------+-------+-----+----------+--------------------+--------------------+-----------+
|1541105830796|     39| free|        38|San Francisco-Oak...|"Mozilla/5.0 (Mac...|          0|
|1541106106796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|          1|
|1541106106796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|          2|
|1541106132796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|          3|
|1541106352796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|          4|
|1541106496796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|          5|
|1541106673796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|          6|
|1541107053796|      8| free|       139|Phoenix-Me

In [75]:
# Attach the `year` and `month` of each play by joining the songplays rows to
# the time table on equal `start_time`; only matching rows are kept.
same_start_time = songplays_table.start_time == time_table.start_time
songplay_output_columns = [
    "songplay_id",
    songplays_table.start_time,  # qualified: both sides of the join have a start_time
    "user_id",
    "level",
    "song_id",
    "artist_id",
    "session_id",
    "location",
    "user_agent",
    "year",
    "month",
]
songplays_joined_time = (
    songplays_table
    .join(time_table, on=same_start_time, how="inner")
    .select(*songplay_output_columns)
)

In [77]:
# Print the joined songplays rows to check the added year and month columns.
songplays_joined_time.show()

+-----------+-------------+-------+-----+----------+--------------------+--------------------+----+-----+
|songplay_id|   start_time|user_id|level|session_id|            location|          user_agent|year|month|
+-----------+-------------+-------+-----+----------+--------------------+--------------------+----+-----+
|          0|1541105830796|     39| free|        38|San Francisco-Oak...|"Mozilla/5.0 (Mac...|2018|   11|
|          1|1541106106796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|2018|   11|
|          2|1541106106796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|2018|   11|
|          3|1541106132796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|2018|   11|
|          4|1541106352796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|2018|   11|
|          5|1541106496796|      8| free|       139|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|2018|   11|
|          6|1541106673796|      8| free|     

In [78]:
# Persist the songplays table as parquet, partitioned by year and month,
# replacing any existing output.
(
    songplays_joined_time.write
    .mode("overwrite")
    .partitionBy("year", "month")
    .parquet("s3a://udacity-store/songplays.parquet")
)