In [1]:
import configparser
from datetime import datetime
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, to_timestamp
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, dayofweek, date_format, monotonically_increasing_id
from pyspark.sql.types import TimestampType

In [2]:
config = configparser.ConfigParser()
config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID']=config.get('CREDENTIALS','AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY']=config.get('CREDENTIALS','AWS_SECRET_ACCESS_KEY')

In [3]:
spark = SparkSession.builder\
                     .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.0")\
                     .getOrCreate()

In [4]:
spark

In [5]:
input_data = "data/"
output_data = "data/output/"
output_data = "s3a://udacity-dend-p04/"
#s3a://udacity-dend/song_data/A/A/A/TRAAAAK128F9318786.json

In [6]:
song_data = os.path.join(input_data,"song_data/*/*/*/*.json")

In [7]:
df = spark.read.json(song_data)

In [8]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [9]:
df.count()

71

In [10]:
df.show(n=3)

+------------------+---------------+-----------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|  artist_location|artist_longitude|         artist_name| duration|num_songs|           song_id|               title|year|
+------------------+---------------+-----------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|ARDR4AC1187FB371A1|           null|                 |            null|Montserrat Caball...|511.16363|        1|SOBAYLL12A8C138AF9|Sono andati? Fing...|   0|
|AREBBGV1187FB523D2|           null|      Houston, TX|            null|Mike Jones (Featu...|173.66159|        1|SOOLYAZ12A6701F4A6|Laws Patrolling (...|   0|
|ARMAC4T1187FB3FA4C|       40.82624|Morris Plains, NJ|       -74.47995|The Dillinger Esc...|207.77751|        1|SOBBUGU12A8C13E95D|Setting Fire to S...|2004|
+------------------+---------------+----------------

In [11]:
songs_table = df.select(["song_id", "title", "artist_id", "year", "duration"])

In [12]:
songs_table.show(2)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOBAYLL12A8C138AF9|Sono andati? Fing...|ARDR4AC1187FB371A1|   0|511.16363|
|SOOLYAZ12A6701F4A6|Laws Patrolling (...|AREBBGV1187FB523D2|   0|173.66159|
+------------------+--------------------+------------------+----+---------+
only showing top 2 rows



In [13]:
# write songs table to parquet files partitioned by year and artist
# os.path.join(output_data,"songs")
#"data/output/songs"
songs_table.write.partitionBy("year", "artist_id").parquet(os.path.join(output_data,"songs"), mode="overwrite")

In [36]:
# extract columns to create artists table
artists_table = df.select(["artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"])

In [37]:
artists_table.show(n=3)

+------------------+--------------------+-----------------+---------------+----------------+
|         artist_id|         artist_name|  artist_location|artist_latitude|artist_longitude|
+------------------+--------------------+-----------------+---------------+----------------+
|ARDR4AC1187FB371A1|Montserrat Caball...|                 |           null|            null|
|AREBBGV1187FB523D2|Mike Jones (Featu...|      Houston, TX|           null|            null|
|ARMAC4T1187FB3FA4C|The Dillinger Esc...|Morris Plains, NJ|       40.82624|       -74.47995|
+------------------+--------------------+-----------------+---------------+----------------+
only showing top 3 rows



In [38]:
artists_table = artists_table.withColumnRenamed("artist_name", "name")\
            .withColumnRenamed("artist_location", "location")\
            .withColumnRenamed("artist_latitude", "latitude")\
            .withColumnRenamed("artist_longitude", "longitude")


In [39]:
artists_table.show(n=3)

+------------------+--------------------+-----------------+--------+---------+
|         artist_id|                name|         location|latitude|longitude|
+------------------+--------------------+-----------------+--------+---------+
|ARDR4AC1187FB371A1|Montserrat Caball...|                 |    null|     null|
|AREBBGV1187FB523D2|Mike Jones (Featu...|      Houston, TX|    null|     null|
|ARMAC4T1187FB3FA4C|The Dillinger Esc...|Morris Plains, NJ|40.82624|-74.47995|
+------------------+--------------------+-----------------+--------+---------+
only showing top 3 rows



In [41]:
# write artists table to parquet files
artists_table.write.parquet("data/output/artists", mode="overwrite")

# Log data

In [129]:
log_data = os.path.join(input_data,"log_data/*/*/*.json")

In [130]:
# read log data file
df = spark.read.json(log_data)

In [131]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [132]:
# filter by actions for song plays
df = df.filter(df.page == 'NextSong').dropDuplicates()

In [133]:
# extract columns for users table    
users_table = df.select(["userId", "firstName", "lastName", "gender", "level"])\
                .withColumnRenamed("userId", "user_id") \
                .withColumnRenamed("firstName", "first_name") \
                .withColumnRenamed("lastName", "last_name") \
                .dropDuplicates()

In [134]:
users_table.show(n=5)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     26|      Ryan|    Smith|     M| free|
|      7|    Adelyn|   Jordan|     F| free|
|     71|    Ayleen|     Wise|     F| free|
|     81|    Sienna|    Colon|     F| free|
|     87|    Dustin|      Lee|     M| free|
+-------+----------+---------+------+-----+
only showing top 5 rows



In [135]:
# write users table to parquet files
users_table.write.parquet("data/output/users", mode="overwrite")

## time table

In [136]:
# create timestamp column from original timestamp column
get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType())
df = df.withColumn("hour", hour(get_timestamp(df.ts))) \
        .withColumn("day", dayofmonth(get_timestamp(df.ts))) \
        .withColumn("week", weekofyear(get_timestamp(df.ts))) \
        .withColumn("month", month(get_timestamp(df.ts))) \
        .withColumn("year", year(get_timestamp(df.ts))) \
        .withColumn("weekday", dayofweek(get_timestamp(df.ts))) \

In [137]:
# start_time, hour, day, week, month, year, weekday
# year, month, dayofmonth, hour, weekofyear, date_format

In [138]:
df.head()

Row(artist='Fat Joe', auth='Logged In', firstName='Kate', gender='F', itemInSession=21, lastName='Harrell', length=241.34485, level='paid', location='Lansing-East Lansing, MI', method='PUT', page='NextSong', registration=1540472624796.0, sessionId=605, song='Safe 2 Say [The Incredible] (Album Version - Amended)', status=200, ts=1542296032796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"', userId='97', hour=15, day=15, week=46, month=11, year=2018, weekday=5)

In [139]:
# extract columns to create time table
time_table = df.select(["ts", "hour", "day", "week", "month", "year", "weekday"]).withColumnRenamed("ts", "start_time")

In [140]:
time_table.show(n=5)

+-------------+----+---+----+-----+----+-------+
|   start_time|hour|day|week|month|year|weekday|
+-------------+----+---+----+-----+----+-------+
|1542296032796|  15| 15|  46|   11|2018|      5|
|1542299023796|  16| 15|  46|   11|2018|      5|
|1542318319796|  21| 15|  46|   11|2018|      5|
|1542321121796|  22| 15|  46|   11|2018|      5|
|1542786093796|   7| 21|  47|   11|2018|      4|
+-------------+----+---+----+-----+----+-------+
only showing top 5 rows



In [141]:
# write time table to parquet files partitioned by year and month
time_table.write.partitionBy("year", "month").parquet("data/output/time", mode="overwrite")

## songplays

In [164]:
# create songplay_id
df = df.withColumn('songplay_id', monotonically_increasing_id())
df.createOrReplaceTempView("log_table")

In [165]:
# read in song data to use for songplays table
song_data = os.path.join(input_data,"song_data/*/*/*/*.json")
song_df = spark.read.json(song_data)
song_df.createOrReplaceTempView("songs_table")

In [166]:
song_df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [167]:
songs_table = spark.sql(""" 
SELECT *
FROM songs_table
LIMIT 5
 """)

In [168]:
songs_table.show()

+------------------+---------------+-----------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|  artist_location|artist_longitude|         artist_name| duration|num_songs|           song_id|               title|year|
+------------------+---------------+-----------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|ARDR4AC1187FB371A1|           null|                 |            null|Montserrat Caball...|511.16363|        1|SOBAYLL12A8C138AF9|Sono andati? Fing...|   0|
|AREBBGV1187FB523D2|           null|      Houston, TX|            null|Mike Jones (Featu...|173.66159|        1|SOOLYAZ12A6701F4A6|Laws Patrolling (...|   0|
|ARMAC4T1187FB3FA4C|       40.82624|Morris Plains, NJ|       -74.47995|The Dillinger Esc...|207.77751|        1|SOBBUGU12A8C13E95D|Setting Fire to S...|2004|
|ARPBNLO1187FB3D52F|       40.71455|     New York, N

In [148]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- songplay_id: long (nullable = false)



In [172]:
 # extract columns from joined song and log datasets to create songplays table 
    # songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agen
    
songplays_table = spark.sql("""
SELECT  log_table.songplay_id,
        log_table.ts AS start_time,
        log_table.userId AS user_id,
        log_table.level,
        songs_table.song_id AS song_id,
        songs_table.artist_id AS artist_id,
        log_table.sessionId AS session_id,
        log_table.location,
        log_table.userAgent AS user_agen,
        log_table.year,
        log_table.month
FROM   log_table INNER JOIN songs_table ON log_table.song = songs_table.title AND log_table.artist = songs_table.artist_name AND log_table.length = songs_table.duration
""")

In [173]:
songplays_table.show()

+------------+-------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+
| songplay_id|   start_time|user_id|level|           song_id|         artist_id|session_id|            location|           user_agen|year|month|
+------------+-------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+
|188978561035|1542837407796|     15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|       818|Chicago-Napervill...|"Mozilla/5.0 (X11...|2018|   11|
+------------+-------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+



In [174]:
# write songplays table to parquet files partitioned by year and month
songplays_table.write.partitionBy("year", "month").parquet("data/output/songplays", mode="overwrite")

In [None]:
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = os.path.join(input_data,"song_data/*/*/*/*.json")
    
    # read song data file
    df = spark.read.json(song_data)

    # extract columns to create songs table
    songs_table = 
    
    # write songs table to parquet files partitioned by year and artist
    songs_table

    # extract columns to create artists table
    artists_table = 
    
    # write artists table to parquet files
    artists_table

In [2]:
input_data = "s3a://udacity-dend/"

In [3]:
a = os.path.join(input_data,"song_data/*/*/*/*.json")

In [4]:
a

's3a://udacity-dend/song_data/*/*/*/*.json'

In [67]:
#         .config("fs.s3a.awsAccessKeyId", "dddd") \
#         .config("fs.s3a.awsSecretAccessKey", "ddd") \

In [60]:
#SparkContext(conf=spark.config())

In [61]:
dfAWS = spark.read.json( "s3a://udacity-dend/song_data/A/A/A/TRAAAAK128F9318786.json")

Py4JJavaError: An error occurred while calling o651.json.
: com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 403, AWS Service: Amazon S3, AWS Request ID: 0836F81CCB0FDD58, AWS Error Code: null, AWS Error Message: Forbidden, S3 Extended Request ID: z5paoIyz2v3jGU2JI2Dezqux/SsG7ijB8SFOeVQJYKgByo2Dkg4hgldF6k8bBuJCLmHqGWuOEB4=
	at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:798)
	at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:421)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:232)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3528)
	at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:976)
	at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:956)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:892)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:77)
	at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1426)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:557)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
	at scala.collection.immutable.List.flatMap(List.scala:355)
	at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:391)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [14]:
df = spark.read.json("data/song_data/*/*/*/*.json")

In [16]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [17]:
df.write.save("s3://udacity-dend-p04/tes", format = "csv", header = True)

Py4JJavaError: An error occurred while calling o224.save.
: java.io.IOException: No FileSystem for scheme: s3
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWritingFileFormat(DataSource.scala:424)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWriting(DataSource.scala:524)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:290)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [9]:
dfAWS = spark.read.json("s3://ryft-public-sample-data/integration_test_dataset_large.json")
#"s3://ryft-public-sample-data/integration_test_dataset_large.json"
#"s3a://udacity-dend/song_data"

Py4JJavaError: An error occurred while calling o179.json.
: java.io.IOException: No FileSystem for scheme: s3
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
	at scala.collection.immutable.List.flatMap(List.scala:355)
	at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:391)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# df = spark.read.json("data/song_data/A/A/A/*.json")

In [22]:
df = spark.read.json("data/song_data/*/*/*/*.json")

In [23]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [24]:
df.count()

71

In [41]:
input_data = "s3://udacity-dend/song_data"

In [42]:
song_data = "/*/*/*/*.json"

In [43]:
print(input_data + song_data)

s3://udacity-dend/song_data/*/*/*/*.json


In [44]:
df = spark.read.json("data/TRAAAAW128F429D538.json")

In [45]:
#df = spark.read.json("data/song-data.zip")
#"https://udacity-dend-p04.s3-us-west-2.amazonaws.com/songs_events.json"
#"s3://udacity-dend/song_data/A/A/A/TRAAAAK128F9318786.json"

In [None]:
df.printSchema