In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, monotonically_increasing_id, dayofweek

In [3]:
def create_spark_session():
    """
    Build (or reuse) the Spark session used by the ETL steps.

    The hadoop-aws package is requested through ``spark.jars.packages``.

    Returns:
        The SparkSession handed back by ``getOrCreate()``.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

In [4]:
def process_song_data(spark, input_data='', output_data=''):
    """
    Build the songs and artists tables from the raw song JSON files and
    write both of them as parquet.

    Args:
        spark: Active SparkSession used to read the JSON input.
        input_data: Optional root path/URI that contains the ``song_data``
            directory. The default '' keeps the original behaviour of
            reading the relative path ``song_data/*/*/*/*.json``.
        output_data: Optional root path/URI under which ``song.parquet``
            and ``artists.parquet`` are written. The default '' keeps the
            original relative output locations.

    Side effects:
        Registers the temp view ``artists`` and overwrites any existing
        ``song.parquet`` / ``artists.parquet`` output.
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # read song data file
    df = spark.read.json(song_data)

    # extract columns to create songs table (exact duplicate rows removed,
    # the same way the artists table is de-duplicated below)
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration') \
                    .dropDuplicates()

    # write songs table to parquet files partitioned by year and artist.
    # mode='overwrite' makes a re-run replace the previous output instead of
    # failing because the target path already exists.
    songs_table.write.parquet(
        os.path.join(output_data, 'song.parquet'),
        mode='overwrite',
        partitionBy=['year', 'artist_id'],
    )

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude') \
                      .withColumnRenamed('artist_name', 'name') \
                      .withColumnRenamed('artist_location', 'location') \
                      .withColumnRenamed('artist_latitude', 'latitude') \
                      .withColumnRenamed('artist_longitude', 'longitude') \
                      .dropDuplicates()
    artists_table.createOrReplaceTempView('artists')

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists.parquet'),
        mode='overwrite',
    )

In [5]:
# Display the kernel's current working directory.
# NOTE(review): presumably a check of where the relative data paths used by
# the ETL functions resolve - confirm; the displayed value is a machine-specific
# absolute path, so consider clearing this output before sharing.
os.getcwd()

'/Users/michaelandr/Desktop/data-lake-project-resources'

In [6]:
def process_log_data(spark, input_data='', output_data=''):
    """
    Build the users, time and songplays tables from the raw log JSON files
    and write each of them as parquet.

    Only ``page == 'NextSong'`` events feed the time and songplays tables;
    the users table is taken from every log row.

    ``start_time`` is stored as a timestamp derived from the millisecond
    ``ts`` column, and the time table contains ``start_time`` plus its
    derived parts (no separate ``datetime`` copy).

    Args:
        spark: Active SparkSession used for all reads.
        input_data: Optional root path/URI holding the ``log-data`` and
            ``song_data`` directories. The default '' keeps the original
            behaviour of reading relative paths.
        output_data: Optional root path/URI for the parquet output. The
            default '' keeps the original relative output locations.

    Side effects:
        Registers the temp views ``users`` and ``songplays`` and overwrites
        ``users/users.parquet``, ``time/time.parquet`` and
        ``songplays/songplays.parquet`` under ``output_data``.
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log-data/*.json')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    actions_df = df.where(df.page == 'NextSong') \
                   .select('ts', 'userId', 'level', 'song', 'artist',
                           'sessionId', 'location', 'userAgent')

    # extract columns for users table
    # NOTE(review): this reads from the unfiltered log rows, so events without
    # a logged-in user may contribute a row - confirm whether that is intended.
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level') \
                    .dropDuplicates()
    users_table.createOrReplaceTempView('users')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'),
                              'overwrite')

    # create datetime column from original timestamp column.
    # ts is divided by 1000 to get epoch seconds (it is in milliseconds);
    # the built-in cast replaces the per-row Python UDF and yields a real
    # timestamp column instead of a string.
    actions_df = actions_df.withColumn(
        'datetime', (col('ts') / 1000).cast('timestamp'))

    # extract columns to create time table (one row per distinct start_time)
    time_table = actions_df.select(col('datetime').alias('start_time')) \
                           .dropDuplicates() \
                           .withColumn('hour', hour('start_time')) \
                           .withColumn('day', dayofmonth('start_time')) \
                           .withColumn('week', weekofyear('start_time')) \
                           .withColumn('month', month('start_time')) \
                           .withColumn('year', year('start_time')) \
                           .withColumn('weekday', dayofweek('start_time'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month') \
                    .parquet(os.path.join(output_data, 'time/time.parquet'),
                             'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.json(
        os.path.join(input_data, 'song_data/*/*/*/*.json')).alias('song_df')

    # extract columns from joined song and log datasets to create songplays table.
    # The match requires BOTH the song title and the artist name: joining on
    # the artist alone pairs every play with every song by that artist.
    actions_df = actions_df.alias('log_df')
    same_song = (col('log_df.song') == col('song_df.title')) & \
                (col('log_df.artist') == col('song_df.artist_name'))
    joined_df = actions_df.join(song_df, same_song, 'inner')
    songplays_table = joined_df.select(
        col('log_df.datetime').alias('start_time'),
        col('log_df.userId').alias('user_id'),
        col('log_df.level').alias('level'),
        col('song_df.song_id').alias('song_id'),
        col('song_df.artist_id').alias('artist_id'),
        col('log_df.sessionId').alias('session_id'),
        col('log_df.location').alias('location'),
        col('log_df.userAgent').alias('user_agent'),
        year('log_df.datetime').alias('year'),
        month('log_df.datetime').alias('month')) \
        .withColumn('songplay_id', monotonically_increasing_id())

    songplays_table.createOrReplaceTempView('songplays')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month') \
        .parquet(os.path.join(output_data, 'songplays/songplays.parquet'),
                 'overwrite')





In [7]:
def main():
    """
    Run the ETL end to end: open a Spark session, then process the song
    data followed by the log data.
    """
    session = create_spark_session()
    for step in (process_song_data, process_log_data):
        step(session)


if __name__ == "__main__":
    main()

:: loading settings :: url = jar:file:/Users/michaelandr/opt/anaconda3/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/michaelandr/.ivy2/cache
The jars for the packages stored in: /Users/michaelandr/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-67c03752-5d3f-479b-9712-f764ebed3778;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;2.7.0 in central
	found org.apache.hadoop#hadoop-common;2.7.0 in central
	found org.apache.hadoop#hadoop-annotations;2.7.0 in central
	found com.google.guava#guava;11.0.2 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found commons-cli#commons-cli;1.2 in central
	found org.apache.commons#commons-math3;3.1.1 in central
	found xmlenc#xmlenc;0.52 in central
	found commons-httpclient#commons-httpclient;3.1 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.4 in central
	found commons-io#commons-io;2.4 in central
	found commons-net#commons-net;3.1 in central
	found commons-collections#com

:: retrieving :: org.apache.spark#spark-submit-parent-67c03752-5d3f-479b-9712-f764ebed3778
	confs: [default]
	0 artifacts copied, 68 already retrieved (0kB/33ms)


22/09/18 14:55:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

In [8]:
#spark = create_spark_session()
# data =[("James ","","Smith","36636","M",3000),
#               ("Michael ","Rose","","40288","M",4000),
#               ("Robert ","","Williams","42114","M",4000),
#               ("Maria ","Anne","Jones","39192","F",4000),
#               ("Jen","Mary","Brown","","F",-1)]
# columns=["firstname","middlename","lastname","dob","gender","salary"]
#df = spark.read.json('log-data/*.json')

# df=spark.createDataFrame(data,columns)
# df.write.parquet("song.parquet")

In [9]:
#df.printSchema()
#df = df.where(df.page == 'NextSong').select('ts', 'userId', 'level', 'song', 'artist', 'sessionId', 'location', 'userAgent')
#'ts', 'userId', 'level', 'song', 'artist', 'sessionId', 'location', 'userAgent')
#users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates()
#users_table.createOrReplaceTempView('users')

In [10]:
# Preview the raw log events. The frame is loaded inside this cell so that it
# also works on a fresh kernel (the previous version relied on a `df` that was
# only defined in cells which are now commented out, and raised NameError).
spark = create_spark_session()
log_df = spark.read.json('log-data/*.json')
log_df.show(5)