In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
# Load AWS credentials from the local config file and export them as
# environment variables for the rest of the notebook.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means dl.cfg was not loaded.
# Fail here with a clear message instead of a later KeyError on 'AWS'.
if not config.read('dl.cfg'):
    raise FileNotFoundError("dl.cfg not found or unreadable; cannot load AWS credentials")

os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
def create_spark_session():
    """Build (or fetch) a SparkSession that requests the hadoop-aws package.

    Returns:
        The SparkSession produced by ``getOrCreate()`` on a builder whose
        ``spark.jars.packages`` is set to ``org.apache.hadoop:hadoop-aws:2.7.0``.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

In [4]:
# Notebook-wide Spark session used by every cell below.
spark = create_spark_session()

In [5]:
# Glob for the song JSON files: only the local "A" subtree, three directory levels deep.
song_data = "data/song_data/A/*/*/*.json"

In [6]:
# Read every song JSON file matched by the glob into one DataFrame.
# DataFrameReader.json() already selects the JSON source, so no explicit format() is needed.
df = spark.read.json(song_data)

In [7]:
# Inspect the inferred schema, then preview the first five rows.
df.printSchema()
df.show(n=5)

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)

+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|     artist_location|artist_longitude|         artist_name| duration|num_songs|           song_id|               title|year|
+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|ARDR4AC1187FB371A1|           null|                    |            null|Montserrat Caball...|5

In [32]:
# Project the song columns and keep the DataFrame in songs_table.
# DataFrame.show() only prints and returns None, so the preview is a separate
# statement rather than part of the assignment.
songs_table = df.select(["song_id", "artist_id", "title", "duration", "year"])
songs_table.limit(5).show(5)

+------------------+------------------+--------------------+---------+----+
|           song_id|         artist_id|               title| duration|year|
+------------------+------------------+--------------------+---------+----+
|SOBAYLL12A8C138AF9|ARDR4AC1187FB371A1|Sono andati? Fing...|511.16363|   0|
|SOOLYAZ12A6701F4A6|AREBBGV1187FB523D2|Laws Patrolling (...|173.66159|   0|
|SOBBUGU12A8C13E95D|ARMAC4T1187FB3FA4C|Setting Fire to S...|207.77751|2004|
|SOAOIBZ12AB01815BE|ARPBNLO1187FB3D52F|I Hold Your Hand ...| 43.36281|2000|
|SONWXQJ12A8C134D94|ARNF6401187FB57032|The Ballad Of Sle...|  305.162|1994|
+------------------+------------------+--------------------+---------+----+



In [33]:
# Select the artist columns, renaming the artist_* fields to their short names.
exprs = [
    "artist_id",
    "artist_name as name",
    "artist_location as location",
    "artist_latitude as latitude",
    "artist_longitude as longitude",
]
# Keep the DataFrame in artists_table; DataFrame.show() returns None, so the
# preview is done separately instead of being assigned.
artists_table = df.selectExpr(*exprs)
artists_table.limit(5).show(5)

+------------------+--------------------+--------------------+--------+---------+
|         artist_id|                name|            location|latitude|longitude|
+------------------+--------------------+--------------------+--------+---------+
|ARDR4AC1187FB371A1|Montserrat Caball...|                    |    null|     null|
|AREBBGV1187FB523D2|Mike Jones (Featu...|         Houston, TX|    null|     null|
|ARMAC4T1187FB3FA4C|The Dillinger Esc...|   Morris Plains, NJ|40.82624|-74.47995|
|ARPBNLO1187FB3D52F|            Tiny Tim|        New York, NY|40.71455|-74.00712|
|ARNF6401187FB57032|   Sophie B. Hawkins|New York, NY [Man...|40.79086|-73.96644|
+------------------+--------------------+--------------------+--------+---------+



In [8]:
# Songs table: the song-level columns projected from the raw song data.
songs_table = df.select("song_id", "artist_id", "title", "duration", "year")

In [9]:
# NOTE: `sc` here is the SparkContext *class* (import alias), not a context
# instance; setSystemProperty is called on the class.
from pyspark import SparkContext as sc
# Set the JVM system property that turns on the AWS SDK's V4 request signing.
# NOTE(review): this runs after the session above was already created — confirm
# the property still takes effect at this point.
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
# NOTE(review): the disabled line below reads like the Scala API and `sc` is a class
# here; in PySpark the Hadoop configuration is presumably reached through the
# session's context (e.g. spark.sparkContext._jsc.hadoopConfiguration()) — verify
# before enabling.
#sc.hadoopConfiguration.set("fs.s3a.endpoint", "s3-eu-central-1.amazonaws.com")

In [None]:
# Persist the songs table to S3 as Parquet.
# The default save mode raises if the target path already exists, which makes
# this cell fail on any re-run; overwrite keeps the notebook re-runnable.
songs_table.write.parquet("s3a://udacity-de/songs/songs.parquet", mode="overwrite")

In [11]:
# Total number of rows in the songs table (compare with the de-duplicated count).
songs_table.count()

71

In [12]:
# Row count after removing fully duplicated rows; equal to the raw count when
# the table has no duplicates.
songs_table.dropDuplicates().count()

71