# Spark Data Exploration
Notebook to test data analysis and processing steps before transferring them to a Python script

In [None]:
import os
import glob

import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg, regexp_replace, from_unixtime

In [None]:
# Reuse the active SparkSession if there is one, otherwise build it for this
# local analysis notebook.
spark = (
    SparkSession.builder
    .appName("Spark Local Analysis Development")
    .getOrCreate()
)

In [None]:
# Inspect every configuration entry of the running SparkContext.
spark.sparkContext.getConf().getAll()

In [None]:
# Display the session object as the cell output.
spark

In [None]:
def get_files(filepath):
    """Recursively collect the absolute paths of all ``*.json`` files.

    Args:
        filepath: Root directory to search; every sub-directory is walked.

    Returns:
        A list of absolute paths, one per matching file. The list is empty
        when ``filepath`` does not exist or holds no JSON files.
    """
    all_files = []
    for root, _dirs, _files in os.walk(filepath):
        # Escape the directory part so glob metacharacters in a folder name
        # (e.g. "run[1]") are matched literally instead of as a pattern.
        pattern = os.path.join(glob.escape(root), '*.json')
        all_files.extend(os.path.abspath(match) for match in glob.glob(pattern))

    return all_files

In [None]:
# Gather the raw JSON inputs of each dataset from its local folder.
song_files, log_files = (
    get_files(folder) for folder in ('data/song_data', 'data/log-data')
)

Define JSON import schemas

In [None]:
# Song metadata schema: each (column name, Spark type) pair becomes a field
# declared as non-nullable.
song_schema = StructType([
    StructField(name, dtype, False)
    for name, dtype in (
        ('num_songs', IntegerType()),
        ('artist_id', StringType()),
        ('artist_latitude', DoubleType()),
        ('artist_longitude', DoubleType()),
        ('artist_location', StringType()),
        ('artist_name', StringType()),
        ('song_id', StringType()),
        ('title', StringType()),
        ('duration', FloatType()),
        ('year', IntegerType()),
    )
])

# Event log schema: every column is read as a nullable string.
log_schema = StructType([
    StructField(name, StringType(), True)
    for name in (
        'artist',
        'auth',
        'firstName',
        'gender',
        'itemInSession',
        'lastName',
        'length',
        'level',
        'location',
        'method',
        'page',
        'registration',
        'sessionId',
        'song',
        'status',
        'ts',
        'userAgent',
        'userId',
    )
])

## Something in the log JSON data throws an error when a field is cast to a non-String
## type at import, which causes the entire row to be nulled.  For this reason, we still stay consistent with
## song_data and define a schema, but in log_schema every type is String.

In [None]:
# Load both datasets with their explicit schemas and the "multiline" JSON
# reader option switched on.
song_data = (
    spark.read
    .schema(song_schema)
    .option("multiline", "true")
    .json(song_files)
)
log_data = (
    spark.read
    .schema(log_schema)
    .option("multiline", "true")
    .json(log_files)
)

In [None]:
# Check which schema the song DataFrame ended up with after loading.
song_data.printSchema()

In [None]:
# Check which schema the log DataFrame ended up with after loading.
log_data.printSchema()

#### Data Cleaning
From previous inspection of this data, we know there are multiple null entries as well as invalid year entries (recorded as 0).

In [None]:
# Drop every row containing a null, then remove songs whose year is 0 and
# log events whose userId is empty.
song_data_clean = song_data.dropna().where(col("year") != "0")
log_data_clean = log_data.dropna().where(col("userId") != "")

In [None]:
# Number of song rows left after cleaning.
song_data_clean.count()

In [None]:
# Number of log rows left after cleaning.
log_data_clean.count()

#### SQL queries

In [None]:
# Expose the cleaned DataFrames to Spark SQL under fixed view names.
for view_name, frame in (
    ("song_table", song_data_clean),
    ("log_table", log_data_clean),
):
    frame.createOrReplaceTempView(view_name)

In [None]:
# Smoke-test the song_table view by selecting every column and printing
# the result.
song_query = '''
          SELECT *
          FROM song_table
          '''
spark.sql(song_query).show()


In [None]:
## Cast the "ts" epoch strings to a numeric column.
## Later cells divide ts by 1000, i.e. the values are epoch milliseconds; those
## do not fit in a 32-bit IntegerType (the cast would give null), so use LongType.
log_data_clean.select(col("ts").cast(LongType())).show()

In [None]:
# Preview the raw (string) "ts" values for comparison.
log_data_clean.select(col("ts")).show()

In [None]:
def stripDQ(string):
    """Return ``string`` with every double-quote character removed.

    Args:
        string: Text to clean, or ``None``.

    Returns:
        The cleaned text. ``None`` is passed through unchanged so that a
        null cell does not raise when this function is used as a UDF.
    """
    if string is None:
        return None
    return string.replace('"', "")

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType
# Wrap stripDQ as a Spark UDF whose declared return type is string.
udf_stripDQ = udf(stripDQ, StringType())

In [None]:
# Overwrite "ts" with the output of the quote-stripping UDF.
test = log_data_clean.withColumn(
    "ts",
    udf_stripDQ(col("ts")),
)

In [None]:
# Preview the frame after rewriting "ts".
test.show()

In [None]:
# Column has no asType() method; cast() is the supported call. "long" is used
# because epoch-millisecond values do not fit in a 32-bit int.
test2 = test.select(test["ts"].cast('long').alias("timestamp"))

In [None]:
# Preview the numeric "timestamp" column.
test2.show()

In [None]:
# Call first() so the cell shows the first Row rather than the bound method.
test.first()

In [None]:
# Strip every non-digit from "ts" and cast the remainder to a number. LongType
# is needed because epoch-millisecond values overflow a 32-bit IntegerType.
test2 = test.withColumn('test', regexp_replace('ts', "[^0-9]", "").cast(LongType())).select("ts", "test")

In [None]:
# Compare the original "ts" strings with the digits-only numeric column.
test2.show()

In [None]:
from datetime import datetime

# "ts" is a string of epoch milliseconds, so it has to be parsed before it can
# be divided, and a UDF declared as TimestampType must return a datetime
# (not a float). Null inputs are passed through as null.
get_timestamp = udf(
    lambda ms: None if ms is None else datetime.fromtimestamp(int(ms) / 1000.0),
    TimestampType(),
)
test2 = test.withColumn('start_time', get_timestamp(test.ts))

In [None]:
# Look at a single row of the quote-stripped frame.
test.first()

In [None]:
# create datetime column from original timestamp column
# from_unixtime() builds a Column expression and cannot be called on plain
# Python values inside a UDF, so it is applied to the column directly.
def get_datetime(ts_column):
    """Return a timestamp Column built from a column of epoch milliseconds.

    Args:
        ts_column: Column name (str) or Column holding epoch milliseconds.

    Returns:
        A Column of TimestampType with whole-second resolution.
    """
    ms = col(ts_column) if isinstance(ts_column, str) else ts_column
    # from_unixtime() expects whole seconds, so drop the millisecond part.
    seconds = (ms.cast('long') / 1000).cast('long')
    return from_unixtime(seconds).cast(TimestampType())


test2 = test.withColumn('datetime', get_datetime('ts'))

In [None]:
# Preview the frame with the derived "datetime" column.
test2.show()

In [None]:
# withColumn() takes (name, Column), and the cast belongs on the column rather
# than on the DataFrame. LongType keeps epoch-millisecond values from overflowing.
test2 = test.withColumn('ts', col('ts').cast(LongType()))