In [1]:
# Purge the result directory
! rm -rf ./result/*

In [2]:
# Unzip the sample data
# % run unzip.py

In [3]:
# Run the pipeline
% run etl.py

In [4]:
# Create spark session
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [5]:
# Run the queries to read the local output

In [6]:
# Artists
df_artists=spark.read.parquet("result/artists/*.parquet")
df_artists.printSchema()
df_artists.show(10)

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)

+------------------+--------------------+--------------------+---------------+----------------+
|         artist_id|         artist_name|     artist_location|artist_latitude|artist_longitude|
+------------------+--------------------+--------------------+---------------+----------------+
|ARMAC4T1187FB3FA4C|The Dillinger Esc...|   Morris Plains, NJ|       40.82624|       -74.47995|
|AROUOZZ1187B9ABE51|         Willie Bobo|New York, NY [Spa...|       40.79195|       -73.94512|
|ARI2JSK1187FB496EF|Nick Ingman;Gavyn...|     London, England|       51.50632|        -0.12714|
|AREBBGV1187FB523D2|Mike Jones (Featu...|         Houston, TX|           null|            null|
|ARD842G1187B997376|          Blue Rodeo|Toronto, Ontario,...|       43.64856|       -79.38533|


In [7]:
# Songs
df_songs=spark.read.parquet("result/songs/*/*/*.parquet")
df_songs.printSchema()
df_songs.show(10)

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)

+------------------+--------------------+---------+
|           song_id|               title| duration|
+------------------+--------------------+---------+
|SOAOIBZ12AB01815BE|I Hold Your Hand ...| 43.36281|
|SONYPOM12A8C13B2D7|I Think My Wife I...|186.48771|
|SODREIN12A58A7F2E5|A Whiter Shade Of...|326.00771|
|SOYMRWW12A6D4FAB14|The Moon And I (O...| 267.7024|
|SOWQTQZ12A58A7B63E|Streets On Fire (...|279.97995|
|SOUDSGM12AC9618304|Insatiable (Instr...|266.39628|
|SOPEGZN12AB0181B3D|Get Your Head Stu...| 45.66159|
|SOBBUGU12A8C13E95D|Setting Fire to S...|207.77751|
|SOBAYLL12A8C138AF9|Sono andati? Fing...|511.16363|
|SOOLYAZ12A6701F4A6|Laws Patrolling (...|173.66159|
+------------------+--------------------+---------+
only showing top 10 rows



In [8]:
# Time
df_time=spark.read.parquet("result/time/*/*/*.parquet")
df_time.printSchema()
df_time.show(10)

root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)

+--------------------+----+---+----+-------+
|          start_time|hour|day|week|weekday|
+--------------------+----+---+----+-------+
|2018-11-15 00:30:...|   0| 15|  46|      5|
|2018-11-15 00:41:...|   0| 15|  46|      5|
|2018-11-15 00:45:...|   0| 15|  46|      5|
|2018-11-15 03:44:...|   3| 15|  46|      5|
|2018-11-15 05:48:...|   5| 15|  46|      5|
|2018-11-15 05:53:...|   5| 15|  46|      5|
|2018-11-15 05:55:...|   5| 15|  46|      5|
|2018-11-15 06:01:...|   6| 15|  46|      5|
|2018-11-15 06:07:...|   6| 15|  46|      5|
|2018-11-15 06:10:...|   6| 15|  46|      5|
+--------------------+----+---+----+-------+
only showing top 10 rows



In [9]:
# Users
df_users=spark.read.parquet("result/users/*.parquet")
df_users.printSchema()
df_users.show(10)

root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     88|  Mohammad|Rodriguez|     M| paid|
|     88|  Mohammad|Rodriguez|     M| free|
|     75|    Joseph|Gutierrez|     M| free|
|     11| Christian|   Porter|     F| free|
|     53|   Celeste| Williams|     F| free|
|     77| Magdalene|   Herman|     F| free|
|     69|  Anabelle|  Simpson|     F| free|
|     61|    Samuel| Gonzalez|     M| free|
|     45|  Dominick|   Norris|     M| free|
|     89|   Kynnedi|  Sanchez|     F| free|
+-------+----------+---------+------+-----+
only showing top 10 rows



In [10]:
# Songplays
df_songplays=spark.read.parquet("result/songplays/*/*/*.parquet")
df_songplays.printSchema()
df_songplays.show(10)

root
 |-- songplay_id: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)

+-----------+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+
|songplay_id|          start_time|user_id|level|           song_id|         artist_id|session_id|            location|          user_agent|
+-----------+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+
|          0|2018-11-21 21:56:...|     15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|       818|Chicago-Napervill...|"Mozilla/5.0 (X11...|
+-----------+--------------------+-------+-----+------------------+---