In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
import logging
# Log format: timestamp, source file and line number, level, then the message.
logging.basicConfig(
    format='[ %(asctime)s ] %(filename)s(%(lineno)d) %(levelname)s - %(message)s',
    level=logging.INFO
)


def export_aws_credentials(cfg, env=None):
    """Copy the AWS key pair from a parsed config into environment variables.

    Args:
        cfg: A ``configparser.ConfigParser`` that may contain an ``[AWS]``
            section with ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY``.
        env: Mutable mapping to write into. Defaults to ``os.environ``.

    Returns:
        list: Names of the variables that were set, in the order they were
        written. Empty when the ``[AWS]`` section is absent.
    """
    target = os.environ if env is None else env
    exported = []
    if not cfg.has_section('AWS'):
        return exported
    for name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
        if cfg.has_option('AWS', name):
            target[name] = cfg['AWS'][name]
            exported.append(name)
    return exported


config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so a missing dl.cfg has to be detected explicitly.
loaded_files = config.read('dl.cfg')
exported_vars = export_aws_credentials(config)

if not loaded_files:
    logging.warning(
        "dl.cfg not found; AWS credentials were not exported "
        "(s3a:// paths will need credentials from elsewhere)"
    )
elif len(exported_vars) < 2:
    logging.warning(
        "dl.cfg has no complete [AWS] key pair; exported only: %s", exported_vars
    )


In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. etl.py, imported further down) are re-imported before each cell runs
# and edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
def create_spark_session():
    """Build (or fetch the already-running) Spark session for this notebook.

    The session uses the ``local[4]`` master and the application name
    ``sparkify-ETL``, and applies two settings: the ``hadoop-aws`` package is
    requested via ``spark.jars.packages``, and
    ``spark.sql.autoBroadcastJoinThreshold`` is set to ``-1``.

    Returns:
        The object returned by ``SparkSession.builder...getOrCreate()``.
    """
    session_builder = SparkSession.builder.master("local[4]").appName("sparkify-ETL")

    # Settings are applied in this fixed order, one .config() call each.
    settings = (
        ("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"),
        ("spark.sql.autoBroadcastJoinThreshold", -1),
    )
    for option_name, option_value in settings:
        session_builder = session_builder.config(option_name, option_value)

    return session_builder.getOrCreate()


In [47]:
spark = create_spark_session()
# Data locations passed to the ETL functions below. The active values point at
# local directories; to run against S3 instead, swap in:
#   input_data = "s3a://udacity-dend/"
#   output_data = "s3a://udacity-lesson3-project-bucket/"
input_data = "./data/"
output_data = "./output_data/"

In [5]:
# Echo the session object as the cell output to confirm it was created.
spark

In [6]:
# Unpack the bundled archives under ./data (-o: overwrite without prompting,
# -q: quiet, -d: destination directory).
# NOTE(review): the song archive is extracted straight into data/, so it
# presumably already contains a top-level song_data/ folder — confirm.
! unzip -o -q data/song-data.zip -d data
! unzip -o -q data/log-data.zip -d data/log_data


In [20]:
# Create the local output directory (same path as `output_data`); -p makes
# this a no-op when it already exists.
! mkdir -p ./output_data

In [21]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, TimestampType

In [27]:
from etl import process_song_data, process_log_data

In [43]:
# Run the song-data stage of the ETL (defined in etl.py). Per its log output
# below, it reads the raw song JSON and extracts the song and artist
# dimension tables.
process_song_data(spark, input_data, output_data)

[ 2020-06-15 20:12:17,106 ] etl.py(35) INFO - Default Parallelism is 4
[ 2020-06-15 20:12:17,110 ] etl.py(51) INFO - Reading raw Song data from source: ./data/song_data/*/*/*/*.json
[ 2020-06-15 20:12:17,628 ] etl.py(56) INFO - Raw Song Data has 4 partitions
[ 2020-06-15 20:12:17,812 ] etl.py(57) INFO - Partition size is [18, 18, 18, 17]
[ 2020-06-15 20:12:17,815 ] etl.py(59) INFO - Extracting the song dimension table
[ 2020-06-15 20:12:17,923 ] etl.py(64) INFO - Song dimension table has 4 memory partitions
[ 2020-06-15 20:12:17,926 ] etl.py(66) INFO - Writing back the song dimension table partitioned by year and artist_id
[ 2020-06-15 20:12:18,773 ] etl.py(77) INFO - Extracting the artist dimension table
[ 2020-06-15 20:12:18,862 ] etl.py(89) INFO - Artist dimension table has 200 memory partitions
[ 2020-06-15 20:12:23,118 ] etl.py(90) INFO - Artist dimension table partition size is [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 

In [44]:
# Run the log-data stage of the ETL (defined in etl.py). Per its log output
# below, it reads the raw log data and extracts the users dimension table;
# the captured output is cut off, so any later steps are not visible here.
process_log_data(spark, input_data, output_data)

[ 2020-06-15 20:12:43,632 ] etl.py(121) INFO - Default Parallelism is 4
[ 2020-06-15 20:12:43,636 ] etl.py(122) INFO - Reading Log data from source
[ 2020-06-15 20:12:43,942 ] etl.py(127) INFO - Raw Log Data has 4 partitions
[ 2020-06-15 20:12:44,409 ] etl.py(128) INFO - Partition size is [3512, 2585, 1485, 474]
[ 2020-06-15 20:12:44,518 ] etl.py(133) INFO - Extracting the users dimension table
[ 2020-06-15 20:12:44,617 ] etl.py(144) INFO - Users dimension table has 200 memory partitions
[ 2020-06-15 20:12:49,402 ] etl.py(145) INFO - Users dimension table partition size is [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 1, 2, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 0, 1, 3, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 2,