In [63]:
import os
import glob

import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Obtain the SparkSession for this notebook (getOrCreate reuses an active
# session when one already exists instead of building a second one).
spark = (
    SparkSession.builder
    .appName("Spark Local Analysis Development")
    .getOrCreate()
)

In [3]:
# List every (key, value) setting held by the active SparkContext's configuration.
spark.sparkContext.getConf().getAll()

[('spark.rdd.compress', 'True'),
 ('spark.driver.port', '50655'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.name', 'Spark Local Analysis Development'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', 'matts-mbp'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.id', 'local-1585456578232')]

In [4]:
# Bare last expression: the notebook displays the SparkSession object.
spark

In [7]:
def get_files(filepath):
    """Return the absolute paths of every ``*.json`` file under ``filepath``.

    The directory tree rooted at ``filepath`` is walked recursively and each
    visited directory is globbed for ``*.json`` entries.

    Args:
        filepath: Root directory to search (relative or absolute).

    Returns:
        A sorted list of absolute path strings. The list is empty when no
        entry matches or when ``filepath`` does not exist.
    """
    all_files = []
    for root, _dirs, _files in os.walk(filepath):
        # Escape the directory part so glob metacharacters in a folder name
        # ("[", "?", "*") are matched literally instead of being treated as
        # a pattern, which would silently skip that folder's files.
        pattern = os.path.join(glob.escape(root), '*.json')
        all_files.extend(os.path.abspath(match) for match in glob.glob(pattern))

    # Sort so the result does not depend on the filesystem's listing order.
    return sorted(all_files)

In [10]:
# Gather the absolute paths of all *.json files found under data/song_data.
song_files = get_files('data/song_data')

In [36]:
# Gather the absolute paths of all *.json files found under data/log-data.
# NOTE(review): this directory name uses a hyphen while the song directory
# uses an underscore ('song_data') - confirm both match the on-disk names.
log_files = get_files('data/log-data')

In [87]:
# Explicit schema for the song JSON files: one non-nullable StructField per
# column, declared in the order the columns should appear in the DataFrame.
# NOTE(review): 'duration' is a 32-bit FloatType; DoubleType may be preferable
# if full precision matters - confirm before changing.
song_schema = StructType([
    StructField(column, dtype, False)
    for column, dtype in [
        ('num_songs', IntegerType()),
        ('artist_id', StringType()),
        ('artist_latitude', DoubleType()),
        ('artist_longitude', DoubleType()),
        ('artist_location', StringType()),
        ('artist_name', StringType()),
        ('song_id', StringType()),
        ('title', StringType()),
        ('duration', FloatType()),
        ('year', IntegerType()),
    ]
])

Something in the log JSON data fails when a field is cast to a non-String type at import, which causes the entire row to be nulled. For this reason we still define a schema, to stay consistent with song_data, but in log_schema every field is typed as String.

In [147]:
# Explicit schema for the log JSON files: every column is read as a nullable
# string, in the order listed here.
log_schema = StructType([
    StructField(column, StringType(), True)
    for column in [
        'artist',
        'auth',
        'firstName',
        'gender',
        'itemInSession',
        'lastName',
        'length',
        'level',
        'location',
        'method',
        'page',
        'registration',
        'sessionId',
        'song',
        'status',
        'ts',
        'userAgent',
        'userId',
    ]
])

In [148]:
# Load the song JSON files with the explicit song_schema; the multiline option
# lets a single JSON record span several lines of a file.
song_data = (
    spark.read
    .schema(song_schema)
    .option("multiline", "true")
    .json(song_files)
)

In [149]:
# Load the log JSON files with the all-string log_schema.
# NOTE(review): multiline=true makes Spark parse each file as one JSON
# document. If the log files hold one JSON object per line, only part of each
# file may be loaded - confirm the file layout and the resulting row count.
log_data = (
    spark.read
    .schema(log_schema)
    .option("multiline", "true")
    .json(log_files)
)

In [152]:
# Print the column names, data types and nullability of song_data.
song_data.printSchema()

root
 |-- num_songs: integer (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- year: integer (nullable = true)



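Despite setting nullable to False in the schema, the imported song_data schema reports every column as nullable. This appears to be expected behaviour rather than a mistake in the schema: when Spark reads from file-based sources it treats the columns as nullable, regardless of the nullability declared in a user-supplied schema.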

In [153]:
# Print the column names, data types and nullability of log_data.
log_data.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: string (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: string (nullable = true)
 |-- sessionId: string (nullable = true)
 |-- song: string (nullable = true)
 |-- status: string (nullable = true)
 |-- ts: string (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



#### Data Cleaning

From a previous inspection of this data, we know there are multiple null entries as well as invalid year entries (recorded as 0).

In [157]:
# Keep only the song rows that have no null in any column
# (na.drop() with its default arguments drops a row if any value is null).
song_data_clean = song_data.na.drop()

In [161]:
# Keep only the log rows that have no null in any column
# (na.drop() with its default arguments drops a row if any value is null).
log_data_clean = log_data.na.drop()

In [165]:
# Discard songs whose year is 0 (an invalid entry). `year` is declared as an
# integer column, so compare against an integer literal instead of relying on
# an implicit cast of the string "0".
song_data_clean = song_data_clean.filter(song_data_clean.year != 0)

+---------+------------------+---------------+----------------+--------------------+--------------------+------------------+--------------------+---------+----+
|num_songs|         artist_id|artist_latitude|artist_longitude|     artist_location|         artist_name|           song_id|               title| duration|year|
+---------+------------------+---------------+----------------+--------------------+--------------------+------------------+--------------------+---------+----+
|        1|ARMAC4T1187FB3FA4C|       40.82624|       -74.47995|   Morris Plains, NJ|The Dillinger Esc...|SOBBUGU12A8C13E95D|Setting Fire to S...|207.77751|2004|
|        1|ARPBNLO1187FB3D52F|       40.71455|       -74.00712|        New York, NY|            Tiny Tim|SOAOIBZ12AB01815BE|I Hold Your Hand ...| 43.36281|2000|
|        1|ARDNS031187B9924F0|       32.67828|       -83.22295|             Georgia|          Tim Wilson|SONYPOM12A8C13B2D7|I Think My Wife I...|186.48772|2005|
|        1|ARNF6401187FB57032|    

In [172]:
# Preview log_data_clean as a text table (show() prints 20 rows by default
# and truncates long cell values).
log_data_clean.show()

+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|              artist|     auth|firstName|gender|itemInSession| lastName|   length|level|            location|method|    page|     registration|sessionId|                song|status|           ts|           userAgent|userId|
+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|            Harmonia|Logged In|     Ryan|     M|            0|    Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|       Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|
|     The Grass Roots|Logged In|     Sara|     F|           72|  Johnson|166.71302| paid|   Winston-

In [170]:
# Preview song_data_clean as a text table (show() prints 20 rows by default
# and truncates long cell values).
song_data_clean.show()

+---------+------------------+---------------+----------------+--------------------+--------------------+------------------+--------------------+---------+----+
|num_songs|         artist_id|artist_latitude|artist_longitude|     artist_location|         artist_name|           song_id|               title| duration|year|
+---------+------------------+---------------+----------------+--------------------+--------------------+------------------+--------------------+---------+----+
|        1|ARMAC4T1187FB3FA4C|       40.82624|       -74.47995|   Morris Plains, NJ|The Dillinger Esc...|SOBBUGU12A8C13E95D|Setting Fire to S...|207.77751|2004|
|        1|ARPBNLO1187FB3D52F|       40.71455|       -74.00712|        New York, NY|            Tiny Tim|SOAOIBZ12AB01815BE|I Hold Your Hand ...| 43.36281|2000|
|        1|ARDNS031187B9924F0|       32.67828|       -83.22295|             Georgia|          Tim Wilson|SONYPOM12A8C13B2D7|I Think My Wife I...|186.48772|2005|
|        1|ARNF6401187FB57032|    