In [1]:
"""
This file is for development purposes only, because there is no need to create a Spark session each time
"""
import configparser
import datetime
import os
from pyspark.sql import SparkSession

In [2]:
"""
Set global variables to access S3 via OS environment variables

The AWS credentials are read from the [AWS] section of the local file 'dl.cfg'
and exported as AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
"""
config = configparser.ConfigParser()
# open the credentials file in a context manager so the handle is closed
# again as soon as it has been parsed
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
"""
Build (or reuse) the Spark session and set the log level.
Available log levels:
    ERROR   - fewest details - only when something went wrong
    WARN    - more details
    INFO    - more details than the WARN level
    DEBUG   - very detailed information
"""
spark = (
    SparkSession.builder
    .appName("etl pipeline for project 4 - Data Lake")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

# log level used for this session
spark.sparkContext.setLogLevel('ERROR')

In [4]:
"""
Define the input location
- the song data is stored on S3
"""
# Path of one single song file.
# Alternative covering the whole A/A/A folder:
#   's3a://udacity-dend/song_data/A/A/A/*.json'
song_data = (
    's3a://udacity-dend/song_data/A/A/A/'
    'TRAAAAK128F9318786.json'
)

print(song_data)

s3a://udacity-dend/song_data/A/A/A/TRAAAAK128F9318786.json


In [5]:
"""
Explicit StructType for song_data, so that the column types are not
misinterpreted. Specifying the schema is also useful for performance reasons.

Personal Info: StructType (Lesson 5 - 6/51) and look at example code (L5 - 26/51 and 28/51)
"""

from pyspark.sql.types import (
    StructType as R,
    StructField as Fld,
    DoubleType as Dbl,
    StringType as Str,
    IntegerType as Int,
    DateType as Date,
)

# one (column name, Spark type) pair per field, in schema order
schema_songs = R([
    Fld(column_name, spark_type())
    for column_name, spark_type in [
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Int),
        ("song_id", Str),
        ("title", Str),
        ("year", Int),
    ]
])


In [6]:
"""
Load the song data as JSON, applying the explicit schema "schema_songs".
"""
df_songWithSchema = (
    spark.read
    .schema(schema_songs)
    .json(song_data)
)


In [7]:
"""
Inspect the structure of the loaded data frame and the data it contains
"""
song_row_count = df_songWithSchema.count()
print(song_row_count)

df_songWithSchema.printSchema()
df_songWithSchema.show(n=5, truncate=False)

# last expression of the cell: rendered as a table by the notebook
df_songWithSchema.limit(5).toPandas()


1
root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: integer (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)

+------------------+---------------+---------------+----------------+------------+--------+---------+------------------+------+----+
|artist_id         |artist_latitude|artist_location|artist_longitude|artist_name |duration|num_songs|song_id           |title |year|
+------------------+---------------+---------------+----------------+------------+--------+---------+------------------+------+----+
|ARJNIUY12298900C91|null           |               |null            |Adelitas Way|213.9424|1        |SOBLFFE12AF72AA5BA|Scream|2009|
+------------------+---------------+---

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARJNIUY12298900C91,,,,Adelitas Way,213.9424,1,SOBLFFE12AF72AA5BA,Scream,2009


In [8]:
"""
Reduce duplicates and extract columns to create the d_songs table

The number of rows is printed before and after the transformation to show
how many duplicates were dropped.
"""
print("Amount of lines before: " + str(df_songWithSchema.count()))

df_songs_table = (
    df_songWithSchema
    .select("song_id", "artist_id", "title", "year", "duration")
    .dropDuplicates()
    .sort("duration")
)

# count the de-duplicated table here, not the input data frame again,
# otherwise "before" and "after" can never differ
print("Amount of lines after: " + str(df_songs_table.count()))

df_songs_table.limit(5).toPandas()

Amount of lines before: 1
Amount of lines after: 1


Unnamed: 0,song_id,artist_id,title,year,duration
0,SOBLFFE12AF72AA5BA,ARJNIUY12298900C91,Scream,2009,213.9424


In [11]:
"""
Write the data frame back to S3 as PARQUET
Data to write: d_songs
.repartition(1) --> the data frame is brought into one single partition before it is written
.partitionBy    --> the output is partitioned by "year" and "artist_id"
"""
(
    df_songs_table
    .repartition(1)
    .write
    .mode(saveMode='Overwrite')
    .partitionBy("year", "artist_id")
    .parquet('s3a://project-4-data-lake/analytics/d_songs/d_songs.parquet')
)


In [12]:
"""
Extract and rename the artist columns to create the d_artists table and
reduce duplicates.

The input is song-level (every row carries a song_id), so without
dropDuplicates() an artist with several songs would be listed several times.
"""
df_artistis_table = (
    df_songWithSchema
    .withColumnRenamed("artist_name", "name")
    .withColumnRenamed("artist_location", "location")
    .withColumnRenamed("artist_latitude", "latitude")
    .withColumnRenamed("artist_longitude", "longitude")
    .select("artist_id", "name", "location", "latitude", "longitude")
    .dropDuplicates()
)

df_artistis_table.limit(5).toPandas()

Unnamed: 0,artist_id,name,location,latitude,longitude
0,ARJNIUY12298900C91,Adelitas Way,,,


In [13]:
"""
Write the data frame back to S3 as PARQUET
Data to write: d_artists
- possible write modes: "Overwrite" and "Append"
"""
(
    df_artistis_table
    .repartition(1)
    .write
    .mode(saveMode='Overwrite')
    .parquet('s3a://project-4-data-lake/analytics/d_artists/d_artists.parquet')
)




In [14]:
"""
Store the data frame "df_songWithSchema" as a PARQUET file so that it can be used later in the file
"Project4-DataLake_JupyterNB_create_f_songplays_table.ipynb". This is only temp data!!

Data to write: df_songWithSchema
- possible write modes: "Overwrite" and "Append"
"""
(
    df_songWithSchema
    .repartition(1)
    .write
    .mode(saveMode='Overwrite')
    .parquet('s3a://project-4-data-lake/analytics/staging/df_songs_table.parquet')
)

"""
Spark Read and Write Apache Parquet file
https://sparkbyexamples.com/spark/spark-read-write-dataframe-parquet-example/
"""

'\nSpark Read and Write Apache Parquet file\nhttps://sparkbyexamples.com/spark/spark-read-write-dataframe-parquet-example/\n'