# Start spark session
Run this notebook with the PySpark kernel. Evaluating `spark` in the first cell starts the Spark application and makes the session available, so we do not need to create a Spark session ourselves; after that, the rest of the pyspark functionality can be imported as usual.

In [1]:
# Evaluating `spark` starts the Spark application (see the output below) and
# displays the SparkSession object provided by the kernel.
spark

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1582233883914_0019,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f793a598a58>

In [2]:
# Keep a handle on the SparkContext; it is used further down to set a Hadoop
# configuration option.
sc = spark.sparkContext

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
import configparser
from datetime import datetime
import os
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import monotonically_increasing_id


# config = configparser.ConfigParser()
# config.read('./dl.cfg')

# os.environ['AWS_ACCESS_KEY_ID']=config.get('AWS', 'AWS_ACCESS_KEY_ID')
# os.environ['AWS_SECRET_ACCESS_KEY']=config.get('AWS', 'AWS_SECRET_ACCESS_KEY')


def create_spark_session():
    """Return a SparkSession obtained through ``SparkSession.builder``.

    The builder is configured with ``spark.jars.packages`` set to
    ``org.apache.hadoop:hadoop-aws:2.7.0`` before ``getOrCreate()`` is called.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Root location of the raw JSON input data.
input_data = 's3a://udacity-dend/'

# Root location for the parquet tables written by this notebook.
# The EMR notebook and the udacity workspace notebook use different buckets;
# the S3 alternative is kept here for reference:
# output_data = "s3a://udacity-dend-spark-dwh2/"
output_data = 'hdfs:///dwh/'

# Use version 2 of the file output committer algorithm -- this may speed up
# writes to S3, see https://stackoverflow.com/a/42834182/4549682
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Display the SparkContext (its repr shows the master and the application name).
sc

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<SparkContext master=yarn appName=livy-session-6>

In [6]:
# Load the raw song metadata from the input bucket.
# Observed timings for this cell when reading from the udacity bucket:
#   - 3-node cluster: around 141s (about 3 minutes on a 3-node 5m.xl cluster)
#   - 5-node cluster: around 100-110s

# Glob that matches every song JSON file three directory levels below song_data/.
song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

# Time the read so the cost is visible in the cell output.
start = time.time()
df = spark.read.json(song_data)
end = time.time()
elapsed_seconds = int(end - start)
print('took', elapsed_seconds, 'seconds')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

took 94 seconds

In [7]:
# Takes roughly 20-30s. Likely explanation: count() is an action, and `df` was
# never cached, so Spark has to read the song JSON files from S3 again just to
# count the rows.
df.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

14896

In [8]:
# Peek at the first row of the song data.
df.head()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(artist_id='AR4T2IF1187B9ADBB7', artist_latitude=63.96027, artist_location='<a href="http://billyidol.net" onmousedown=\'UntrustedLink.bootstrap($(this), "fc44f8f60d13ab68c56b3c6709c6d670", event)\' target="_blank" rel="nofollow">http://billyidol.net</a>', artist_longitude=10.22442, artist_name='Billy Idol', duration=233.22077, num_songs=1, song_id='SOVIYJY12AF72A4B00', title='The Dead Next Door (Digitally Remastered 99)', year=1983)

In [9]:
# Column names and types of the song data as read from JSON.
df.dtypes

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('artist_id', 'string'), ('artist_latitude', 'double'), ('artist_location', 'string'), ('artist_longitude', 'double'), ('artist_name', 'string'), ('duration', 'double'), ('num_songs', 'bigint'), ('song_id', 'string'), ('title', 'string'), ('year', 'bigint')]

# Write songs table.

In [10]:
# Sanity check: this is the path the songs table is written to in the next cell.
os.path.join(output_data, 'songs')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'hdfs:///dwh/songs'

In [11]:
# Observed timings for writing the songs table (5-node m5.xlarge cluster):
#   - to S3 with default settings: at least 30 minutes
#   - to S3 with the fileoutputcommitter setting: ~730s
#   - to HDFS: ~60s

start = time.time()

# The songs table keeps one row per distinct combination of these columns.
songs_cols = ['song_id', 'title', 'artist_id', 'year', 'duration']
songs_table = df.select(*songs_cols).dropDuplicates()

# Persist as parquet, partitioned by year and then artist, replacing any
# previous output at the same location.
songs_path = os.path.join(output_data, 'songs')
songs_writer = songs_table.write.mode('overwrite').partitionBy('year', 'artist_id')
songs_writer.parquet(songs_path)

end = time.time()
print('took', int(end - start), 's to write songs table')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

took 49 s to write songs table

In [12]:
# NOTE: `start` is still the value set in the previous cell, so this measures
# from the beginning of the songs write up to the moment this cell is run
# (including any idle time between the two cells), not a new write.
end = time.time()
print('took', int(end-start), 's to write songs table')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

took 51 s to write songs table

# Write artists table.

In [13]:
start = time.time()

# The artists table is built from the artist-related columns of the song data.
# dropDuplicates() compares all selected columns, so only fully identical rows
# are collapsed.
artists_cols = ['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude']
artists_table = df.select(*artists_cols).dropDuplicates()

# Persist as (unpartitioned) parquet, replacing any previous output.
artists_path = os.path.join(output_data, 'artists')
artists_table.write.mode('overwrite').parquet(artists_path)

end = time.time()
print('took', int(end - start), 's to write artists table')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

took 21 s to write artists table

# Read log data.

In [14]:
# Locate the log (event) data. The layout depends on where the notebook runs:
#   - on S3 the files live under log_data/year/month/date.json
#   - in the workspace they live under logdata/date.json
# S3 layout (used here):
log_data = os.path.join(input_data, 'log_data/*/*/*.json')
# Workspace layout:
# log_data = os.path.join(input_data, 'log_data/*.json')

# Read the log files into `df` (replacing the song data frame) and time it.
start = time.time()
df = spark.read.json(log_data)
end = time.time()
elapsed_seconds = int(end - start)
print('took', elapsed_seconds, 'seconds')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

took 0 seconds

In [15]:
# Peek at the first row of the log data.
df.head()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', itemInSession=0, lastName='Smith', length=655.77751, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='Sehr kosmisch', status=200, ts=1542241826796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26')

In [16]:
# Column names available in the log data.
df.columns

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['artist', 'auth', 'firstName', 'gender', 'itemInSession', 'lastName', 'length', 'level', 'location', 'method', 'page', 'registration', 'sessionId', 'song', 'status', 'ts', 'userAgent', 'userId']

In [17]:
# Column names and types of the log data as read from JSON.
df.dtypes

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('artist', 'string'), ('auth', 'string'), ('firstName', 'string'), ('gender', 'string'), ('itemInSession', 'bigint'), ('lastName', 'string'), ('length', 'double'), ('level', 'string'), ('location', 'string'), ('method', 'string'), ('page', 'string'), ('registration', 'double'), ('sessionId', 'bigint'), ('song', 'string'), ('status', 'bigint'), ('ts', 'bigint'), ('userAgent', 'string'), ('userId', 'string')]

# Write users table.

In [18]:
# Keep only the log events that represent a song play. Every later cell that
# uses `df` relies on this filtered frame.
df = df.where(col('page') == 'NextSong')

start = time.time()

# The users table keeps one row per distinct combination of these columns.
users_cols = ['userId', 'firstName', 'lastName', 'gender', 'level']
users_table = df.select(*users_cols).dropDuplicates()

# Persist as (unpartitioned) parquet, replacing any previous output.
users_path = os.path.join(output_data, 'users')
users_table.write.mode('overwrite').parquet(users_path)

end = time.time()
print('took', int(end - start), 's to write users table')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

took 1 s to write users table

In [19]:
# Create a datetime column (`start_time`) from the original `ts` column.
# `ts` is an epoch timestamp in milliseconds.
def _ms_to_datetime(ts_ms):
    """Convert epoch milliseconds to a datetime; pass nulls through as None.

    Spark hands SQL NULLs to Python UDFs as None, and `None / 1000` would raise
    a TypeError and fail the whole job, so nulls are handled explicitly.
    """
    if ts_ms is None:
        return None
    return datetime.fromtimestamp(ts_ms / 1000)


get_datetime = udf(_ms_to_datetime, TimestampType())
df = df.withColumn('start_time', get_datetime('ts'))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Write time table.

In [20]:
# Build the time table: one row per distinct start_time, broken down into the
# calendar parts used for analysis.
start_time_col = col("start_time")
time_table = df.select("start_time").dropDuplicates().select(
    start_time_col,
    hour(start_time_col).alias("hour"),
    dayofmonth(start_time_col).alias("day"),
    weekofyear(start_time_col).alias("week"),
    month(start_time_col).alias("month"),
    year(start_time_col).alias("year"),
    # 'E' formats the day of the week as text
    date_format(start_time_col, 'E').alias("weekday"),
)

# Persist as parquet partitioned by year and month, replacing previous output.
time_path = os.path.join(output_data, 'time')
start = time.time()
time_table.write.mode('overwrite').partitionBy('year', 'month').parquet(time_path)
end = time.time()

print('took', int(end - start), 's to write time table')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

took 6 s to write time table

# Create and write songplays table.

In [21]:
# Build the songplays table by matching song-play log events to known songs.
# Columns desired: songplay_id, start_time, user_id, level, song_id, artist_id,
# session_id, location, user_agent (plus year/month, used for partitioning).
# The logs only carry the song title and artist name, so song_id and artist_id
# have to be looked up from the songs and artists tables.

# Read the songs table from its root directory so that partition discovery
# restores the `artist_id` partition column. Globbing down to the leaf files
# ('songs/*/*/*') drops the partition columns, which leaves each song
# disconnected from its artist.
songs_df = spark.read.parquet(os.path.join(output_data, 'songs')) \
                .select('song_id', 'title', 'artist_id')

# Only the id/name pairs are needed from the artists table. De-duplicate them
# so an artist stored in several rows (e.g. with different locations) cannot
# multiply the matched song plays.
artists_df = spark.read.parquet(os.path.join(output_data, 'artists')) \
                .select('artist_id', 'artist_name') \
                .dropDuplicates()

# Attach to every song the name(s) of its own artist. Joining on the column
# name keeps a single, unambiguous `artist_id` column in the result.
song_artist_df = songs_df.join(artists_df, on='artist_id')

# A log event matches only when the title AND the artist name belong to the
# same song. Joining on title and on artist name independently would pair a
# play with any same-titled song by a different artist.
artists_songs_logs_df = df.join(
    song_artist_df,
    (df.song == song_artist_df.title) & (df.artist == song_artist_df.artist_name)
)

songplay_cols = ['start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent']
# calculate year and month from start_time -- probably faster than a join on the time table
# songplay_id values from monotonically_increasing_id() are unique but not consecutive
songplays_table = artists_songs_logs_df.select(songplay_cols) \
                    .withColumn('songplay_id', monotonically_increasing_id()) \
                    .withColumn("month", month(col("start_time"))) \
                    .withColumn("year", year(col("start_time")))

# write songplays table to parquet files partitioned by year and month
start = time.time()
songplays_table.write.mode('overwrite').partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays'))
end = time.time()
print('took', int(end-start), 's to write songplays table')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

took 13 s to write songplays table