In [1]:
# ---------------------------------------------------------------------------------------------------------------
# creating table f_songplays
# ---------------------------------------------------------------------------------------------------------------
"""
Create table f_songplays:

- Join the source data frames "df_logs_table" and "df_songs_table" (from file "Project4-DataLake_JupyterNB_process_song_data.ipynb")
  together by the following columns:
    - df_logs_table.artist = df_songs_table.artist_name
    - df_logs_table.song = df_songs_table.title
- Filter dataset by "df_logs_table.page = 'NextSong'" to reduce loaded data to the important entries for analysis
"""


"""
************************************************************************************************************************
Import needed data from data_frame df_songs_table
************************************************************************************************************************
"""

"""
This file is only for development purposes, because a Spark Session does not need to be created each time
"""
import configparser
import os
from pyspark.sql import SparkSession, Window

In [2]:
"""
Set global variables to access S3 via OS environment variables
"""
# Read the AWS credentials from the local config file. The file is opened in a
# "with" block so that the handle is closed again as soon as it has been parsed.
config = configparser.ConfigParser()
with open('dl.cfg') as config_file:
    config.read_file(config_file)

# Export the credentials as OS environment variables for the S3 access.
os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

"""
Create Spark Session and set the LOG-Level
Attributes:
    ERROR   - less details - only when something went wrong
    WARN    - more details
    INFO    - more details than Warn level
    DEBUG   - very detailed information
"""
# getOrCreate() reuses an already running session instead of starting a new one.
# The hadoop-aws package is requested so that "s3a://" paths can be used.
spark = SparkSession \
    .builder \
    .appName("etl pipeline for project 4 - Data Lake") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
    .getOrCreate()

# setting the current LOG-Level
spark.sparkContext.setLogLevel('ERROR')

In [3]:
"""
Load data frame from the previously written PARQUET file
Data to read: df_songs_table
"""
# read the staged songs data from S3 back into a data frame
df_songs_table = spark.read.parquet(
    's3a://project-4-data-lake/analytics/staging/df_songs_table.parquet'
)

In [4]:
# show column names, data types and nullability of the loaded songs data
df_songs_table.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: integer (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



In [5]:
"""
Load data frame from the previously written PARQUET file
Data to read: df_logs_table
"""

# read the staged log data from S3 back into a data frame
df_logs_table = spark.read.parquet(
    's3a://project-4-data-lake/analytics/staging/df_logs_table.parquet'
)

In [6]:
# show column names, data types and nullability of the loaded log data
df_logs_table.printSchema()


root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)



In [7]:
"""
Prepare f_songplays Fact table

# Tutorial for a JOIN with Data Frames: http://www.learnbymarketing.com/1100/pyspark-joins-by-example/
LEFT ANTI JOIN
"""

# Sanity check of the loaded log data: print the row count of the whole data frame,
# of the 'NextSong' entries only, after dropping duplicate rows and after dropping
# rows that contain null values. The total is printed a second time at the end.
print("All entries: {}".format(df_logs_table.count()))
print("Filter by 'NextSong (only count)'{}".format(
    df_logs_table.filter(df_logs_table.page == "NextSong").count()))
print("dropDuplicates: {}".format(df_logs_table.dropDuplicates().count()))
print("dropna: {}".format(df_logs_table.dropna().count()))
print("All entries: {}".format(df_logs_table.count()))

All entries: 15
Filter by 'NextSong (only count)'11
dropDuplicates: 15
dropna: 11
All entries: 15


In [8]:
# short aliases for both data frames, so that later column references stay compact
df_l = df_logs_table.alias('df_l')
df_s = df_songs_table.alias('df_s')

In [9]:
# Keep only the log entries with page == "NextSong" and LEFT JOIN the song data onto
# them: a song matches when both the artist name and the song title are equal.
# Because of the left join, log entries without a matching song are kept and get
# null values in the song columns.
#
# NOTE: the former ".withColumnRenamed("ts_timestamp", "start_time")" was removed.
# The loaded log data has no "ts_timestamp" column and already contains
# "start_time", so the rename never had any effect.
df_f_songplays_joined = df_l \
    .filter(df_l.page == "NextSong") \
    .join(df_s, (df_l.artist == df_s.artist_name) & (df_l.song == df_s.title), how='left')

In [10]:
from pyspark.sql.functions import year, month, hour, weekofyear, dayofweek, dayofmonth, date_format, to_timestamp, \
    row_number, monotonically_increasing_id

# Extract columns from the joined song and log data to create the songplays table.
#
# Order of the steps matters:
#   1. rename the log columns to snake_case and derive "year" / "month" from
#      "start_time" ("year" replaces the column of the same name from the song data)
#   2. reduce to the fact table columns and remove duplicate rows
#   3. only then assign "songplay_id"
# The id is unique per row, so a distinct() placed after the id assignment can never
# remove a duplicate; that is why the de-duplication happens before step 3.
#
# songplay_id is a consecutive number starting at 1, created by a window function
# that is sorted by monotonically_increasing_id(). The window has no partitioning,
# so Spark moves all rows into a single partition to number them.
df_f_songplays = df_f_songplays_joined \
    .withColumnRenamed("userid", "user_id") \
    .withColumnRenamed("sessionid", "session_id") \
    .withColumnRenamed("useragent", "user_agent") \
    .withColumn("year", year(df_f_songplays_joined.start_time)) \
    .withColumn("month", month(df_f_songplays_joined.start_time)) \
    .select("user_id", "song_id", "artist_id", "start_time", "session_id", "level", "location",
            "user_agent", "year", "month") \
    .distinct() \
    .withColumn("songplay_id", row_number().over(Window.orderBy(monotonically_increasing_id()))) \
    .select("songplay_id", "user_id", "song_id", "artist_id", "start_time", "session_id", "level", "location",
            "user_agent", "year", "month")

In [11]:
# show the schema of the fact table and preview its first 10 rows
df_f_songplays.printSchema()
df_f_songplays.show(10)

root
 |-- songplay_id: integer (nullable = true)
 |-- user_id: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- session_id: long (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+-----------+-------+-------+---------+-------------------+----------+-----+--------------------+--------------------+----+-----+
|songplay_id|user_id|song_id|artist_id|         start_time|session_id|level|            location|          user_agent|year|month|
+-----------+-------+-------+---------+-------------------+----------+-----+--------------------+--------------------+----+-----+
|          1|      8|   null|     null|2018-11-01 22:00:46|       139| free|Phoenix-Mesa-Scot...|"Mozilla/5.0 (Win...|2018|   11|
|          2|      8|   null|   

In [15]:
# check results (user: 15 = 463; 29 = 346)

In [12]:
# count the distinct fact table rows that belong to the user with id 29
(
    df_f_songplays
    .where(df_f_songplays["user_id"] == 29)
    .distinct()
    .groupBy("user_id")
    .count()
    .show()
)

# check results (user: 15 = 429; 29 = 309)

+-------+-----+
|user_id|count|
+-------+-----+
+-------+-----+



In [13]:
# Write the fact table to S3 as PARQUET: the data is collapsed into one partition
# first, existing output is overwritten and the files are laid out in
# sub-directories per "year" and "month".
(
    df_f_songplays
    .repartition(1)
    .write
    .mode('Overwrite')
    .partitionBy("year", "month")
    .parquet('s3a://project-4-data-lake/analytics/f_songplays/f_songplays.parquet')
)