In [1]:
"""
This file in only for developing purposes, because it's not needed to create a Spark Session each time
"""
import configparser
import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType
from pyspark.sql.functions import to_timestamp, date_format, hour, weekofyear, month, year, dayofweek, dayofmonth
import pyspark.sql.functions as F

In [2]:
"""
Set global variables to access S3 via OS environment variables
"""
config = configparser.ConfigParser()
config.read_file(open('dl.cfg'))

os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
"""
Create Spark Session and setting LOG-Level
Attributes:
    ERROR   - less details - only when something went wrong
    WARN    - more details
    INFO    - more details than Warn level
    DEBUG   - very detailed information
"""
spark = SparkSession \
    .builder \
    .appName("etl pipeline for project 4 - Data Lake") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
    .getOrCreate()

# setting the current LOG-Level
spark.sparkContext.setLogLevel('ERROR')

In [8]:
# get filepath to song data file
log_data = 's3a://udacity-dend/log_data/2018/11/2018-11-01-events.json'

print(log_data)

s3a://udacity-dend/log_data/2018/11/2018-11-01-events.json


In [9]:
from pyspark.sql.types import StructType as R \
    , StructField as Fld \
    , DoubleType as Dbl \
    , StringType as Str \
    , IntegerType as Int \
    , LongType as Lng \
    , DateType as Date

schema_log = R([
    Fld("artist", Str()),
    Fld("auth", Str()),
    Fld("firstName", Str()),
    Fld("gender", Str()),
    Fld("itemInSession", Lng()),
    Fld("lastName", Str()),
    Fld("length", Dbl()),
    Fld("level", Str()),
    Fld("location", Str()),
    Fld("method", Str()),
    Fld("page", Str()),
    Fld("registration", Dbl()),
    Fld("sessionId", Lng()),
    Fld("song", Str()),
    Fld("status", Lng()),
    Fld("ts", Lng()),
    Fld("userAgent", Str()),
    Fld("userId", Str())
])

In [10]:
"""
Read data with "new" schema.
"""
# df_log_table = spark.read.schema(schema_log).json(log_data)
df_log_table = spark.read.schema(schema_log).json(log_data)



In [11]:
"""
Check the new structure and it's data in it
"""
print(df_log_table.count())
df_log_table.printSchema()
df_log_table.show(5, truncate=False)
df_log_table.limit(5).toPandas()


15
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

+-------+---------+---------+------+-------------+--------+---------+-----+---------------------------------+------+--------+-----------------+---------+------------+------+-------------+-----------------------------------------------------------------------------------------------------------------

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1540919000000.0,38,,200,1541105830796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",39
1,,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1540345000000.0,139,,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
2,Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,You Gotta Be,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
3,,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1540345000000.0,139,,200,1541106132796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
4,Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,Flat 55,200,1541106352796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8


In [12]:
# set valid timestamp format
ts_format = "yyyy-MM-dd HH:MM:ss z"
weekday_format = "EEEE"

# Converting ts to a timestamp format
"""
- "withColumn" -- add additional column to df
- "df_log_table.ts / 1000)" -- cut off milliseconds
- ".cast(dataType=TimestampType())" -- ts-value (reduced by milliseconds) is casted as TimestampType
- "to_timestamp("value in Timestampformat", tsFormat)" -- format timestamp into the correct format which is given above
"""
df_log_table = df_log_table\
    .withColumn('ts_timestamp', to_timestamp( date_format( (df_log_table.ts / 1000).cast(dataType=TimestampType()), ts_format)
                                              ,ts_format))

In [13]:
df_log_table.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- ts_timestamp: timestamp (nullable = true)



In [14]:
df_log_table.show(2)

+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+-------------------+
|artist|     auth|firstName|gender|itemInSession|lastName|length|level|            location|method|page|     registration|sessionId|song|status|           ts|           userAgent|userId|       ts_timestamp|
+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+-------------------+
|  null|Logged In|   Walter|     M|            0|    Frye|  null| free|San Francisco-Oak...|   GET|Home|1.540919166796E12|       38|null|   200|1541105830796|"Mozilla/5.0 (Mac...|    39|2018-11-01 21:00:10|
|  null|Logged In|   Kaylee|     F|            0| Summers|  null| free|Phoenix-Mesa-Scot...|   GET|Home|1.540344794796E12|      139|null|   200|1541106106796|"Mozilla/5.0 (

In [15]:
# create time table
"""
What's going on here?
- get only the valid timestamp
- drop duplicate entries. Within the dimension table d_time no duplicates are allowed
- extract the following things from timestamp: hour, day, week, month, weekday, day_of_week (name of the day)
- rename colums ts_timestamp, because it's not needed anymore
"""
from pyspark.sql.functions import to_timestamp, date_format, hour, weekofyear, month, year, dayofweek, dayofmonth

df_d_time = df_log_table\
    .select(df_log_table.ts_timestamp) \
    .dropDuplicates() \
    .withColumn("hour", hour(df_log_table.ts_timestamp)) \
    .withColumn("day", dayofmonth(df_log_table.ts_timestamp)) \
    .withColumn("week", weekofyear(df_log_table.ts_timestamp)) \
    .withColumn("month", month(df_log_table.ts_timestamp)) \
    .withColumn("year", year(df_log_table.ts_timestamp)) \
    .withColumn("weekday", dayofweek(df_log_table.ts_timestamp)) \
    .withColumn("day_of_week", date_format( df_log_table.ts_timestamp, weekday_format) )  \
    .withColumnRenamed("ts_timestamp", "start_time")

print(df_d_time.count())

df_d_time.limit(3).show()
df_d_time.printSchema()

14
+-------------------+----+---+----+-----+----+-------+-----------+
|         start_time|hour|day|week|month|year|weekday|day_of_week|
+-------------------+----+---+----+-----+----+-------+-----------+
|2018-11-01 22:00:15|  22|  1|  44|   11|2018|      5|   Thursday|
|2018-11-01 22:00:13|  22|  1|  44|   11|2018|      5|   Thursday|
|2018-11-01 22:00:33|  22|  1|  44|   11|2018|      5|   Thursday|
+-------------------+----+---+----+-----+----+-------+-----------+

root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- day_of_week: string (nullable = true)



In [16]:
"""
Write data frame as JSON or PARQUET back to file system
Data to write: df_d_time
"""
df_d_time \
    .repartition(1) \
    .write \
    .mode(saveMode='Overwrite') \
    .partitionBy("year","month") \
    .parquet('s3a://project-4-data-lake/analytics/d_time/d_time.parquet')


root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- ts_timestamp: timestamp (nullable = true)



In [17]:
# handle table d_users
print(df_log_table.count())

# renamed columns for later use of dimension table d_users
df_d_users_prep = df_log_table \
    .withColumnRenamed("userId", "user_id") \
    .withColumnRenamed("firstName", "first_name") \
    .withColumnRenamed("lastName", "last_name") \
    .select("user_id", "first_name", "last_name", "gender", "level", "ts_timestamp") \
    .dropDuplicates() \
    .sort("last_name", "first_Name",  ascending=True )

#df_log_table.limit(5).toPandas()
print(df_d_users_prep.count())
df_d_users_prep.printSchema()
df_d_users_prep.show(20)

15
14
root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)
 |-- ts_timestamp: timestamp (nullable = true)

+-------+----------+---------+------+-----+-------------------+
|user_id|first_name|last_name|gender|level|       ts_timestamp|
+-------+----------+---------+------+-----+-------------------+
|     10|    Sylvie|     Cruz|     F| free|2018-11-01 22:00:00|
|    101|    Jayden|      Fox|     M| free|2018-11-01 23:00:14|
|     39|    Walter|     Frye|     M| free|2018-11-01 21:00:10|
|     26|      Ryan|    Smith|     M| free|2018-11-01 22:00:05|
|     26|      Ryan|    Smith|     M| free|2018-11-01 22:00:15|
|     26|      Ryan|    Smith|     M| free|2018-11-01 22:00:25|
|      8|    Kaylee|  Summers|     F| free|2018-11-01 22:00:13|
|      8|    Kaylee|  Summers|     F| free|2018-11-01 22:00:12|
|      8|    Kaylee|  Summers|     F| fr

In [18]:
# get only last entry for each user
df_d_users_prep.createOrReplaceTempView("users_temp_table")

df_d_users = spark.sql("""
    select ou.user_id, ou.first_name, ou.last_name, ou.gender, ou.level
      from users_temp_table as ou
      join (
            select iu.user_id, max(iu.ts_timestamp) as max_timestamp
              from users_temp_table iu
          group BY iu.user_id
            ) as iu on ou.user_id = iu.user_id and ou.ts_timestamp = iu.max_timestamp
    order by ou.user_id
   ;
   """)

In [19]:
df_d_users.printSchema()
df_d_users.sort("last_name").show()

root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     10|    Sylvie|     Cruz|     F| free|
|    101|    Jayden|      Fox|     M| free|
|     39|    Walter|     Frye|     M| free|
|     26|      Ryan|    Smith|     M| free|
|      8|    Kaylee|  Summers|     F| free|
+-------+----------+---------+------+-----+



In [20]:
"""
Write data frame as JSON or PARQUET back to file system
Data to write: df_d_users
"""
df_d_users \
    .repartition(1) \
    .write \
    .mode(saveMode='Overwrite') \
    .parquet('s3a://project-4-data-lake/analytics/d_users/d_users.parquet')

In [21]:
# write df_log_table to create f_songplays fact table

df_log_table.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- ts_timestamp: timestamp (nullable = true)



In [22]:
"""
Write data frame "df_log_table" as PARQUET file to file system for later use in file
"Project4-DataLake_JupyterNB_create_f_songplays_table.ipynb". This is only temp data!!

Data to write: df_songWithSchema
- possible write modes: "Overwrite" and "Append"
"""
df_log_table = df_log_table\
    .withColumnRenamed("ts_timestamp", "start_time")

df_log_table.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)



In [23]:
df_log_table \
    .repartition(1) \
    .write \
    .mode(saveMode='Overwrite') \
    .parquet('s3a://project-4-data-lake/analytics/staging/df_logs_table.parquet')

In [24]:
print(df_log_table.count())
df_log_table.filter(df_log_table.page == "NextSong").show(2, truncate=False)
print(df_log_table.count())

15
+-------+---------+---------+------+-------------+--------+---------+-----+---------------------------+------+--------+-----------------+---------+------------+------+-------------+---------------------------------------------------------------------------------------------------------------+------+-------------------+
|artist |auth     |firstName|gender|itemInSession|lastName|length   |level|location                   |method|page    |registration     |sessionId|song        |status|ts           |userAgent                                                                                                      |userId|start_time         |
+-------+---------+---------+------+-------------+--------+---------+-----+---------------------------+------+--------+-----------------+---------+------------+------+-------------+---------------------------------------------------------------------------------------------------------------+------+-------------------+
|Des'ree|Logged In|Kaylee   |F    