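## Data Analysis of Data Source

This notebook loads the raw earthquake JSON file with Spark, flattens its `features` array into one row per entry, and then inspects the resulting schema, null counts, and duplicate rows.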

### Environment Setup


In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Attach to the already-running Spark session if there is one; otherwise
# start a new session with the default builder settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

### Data Import

In [7]:
# Location of the raw earthquake JSON file. Set the EARTHQUAKE_RAW_PATH
# environment variable to point somewhere else; when it is unset the original
# machine-specific path is used, so existing runs behave as before.
filepath = os.environ.get(
    'EARTHQUAKE_RAW_PATH',
    'C:/Users/CTW00913-Admin/PycharmProjects/EarthQuake/data/earthquake_raw.json',
)

# multiline: a single JSON record may span several lines of the file.
# PERMISSIVE: malformed records are tolerated instead of failing the read.
raw_df = (
    spark.read
    .option('multiline', 'true')
    .option('mode', 'PERMISSIVE')
    .json(filepath)
)

# Explode `features` into one row per array element, then keep only the
# coordinates, the id and every field nested under `properties`.
# The explicit select already discards all other columns (including the
# original `features` array), so no separate drop is needed.
earth_quake_df = (
    raw_df
    .withColumn('Exp_RESULTS', F.explode(F.col('features')))
    .select(
        'Exp_RESULTS.geometry.coordinates',
        'Exp_RESULTS.id',
        'Exp_RESULTS.properties.*',
    )
)

# Display the nested schema of the raw file so the output stored under this
# cell is reproduced on a fresh Restart & Run All.
raw_df.printSchema()


root
 |-- bbox: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- features: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- geometry: struct (nullable = true)
 |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |-- type: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- properties: struct (nullable = true)
 |    |    |    |-- alert: string (nullable = true)
 |    |    |    |-- cdi: double (nullable = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- detail: string (nullable = true)
 |    |    |    |-- dmin: double (nullable = true)
 |    |    |    |-- felt: long (nullable = true)
 |    |    |    |-- gap: double (nullable = true)
 |    |    |    |-- ids: string (nullable = true)
 |    |    |    |-- mag: double (nullable = true)
 |    |    |    |-- magType: string (nullable = true

### Data Analysis

#### Data Schema

In [3]:
# Print the column names, types and nullability of the flattened DataFrame.
earth_quake_df.printSchema()

root
 |-- coordinates: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- id: string (nullable = true)
 |-- alert: string (nullable = true)
 |-- cdi: double (nullable = true)
 |-- code: string (nullable = true)
 |-- detail: string (nullable = true)
 |-- dmin: double (nullable = true)
 |-- felt: long (nullable = true)
 |-- gap: double (nullable = true)
 |-- ids: string (nullable = true)
 |-- mag: double (nullable = true)
 |-- magType: string (nullable = true)
 |-- mmi: double (nullable = true)
 |-- net: string (nullable = true)
 |-- nst: long (nullable = true)
 |-- place: string (nullable = true)
 |-- rms: double (nullable = true)
 |-- sig: long (nullable = true)
 |-- sources: string (nullable = true)
 |-- status: string (nullable = true)
 |-- time: long (nullable = true)
 |-- title: string (nullable = true)
 |-- tsunami: long (nullable = true)
 |-- type: string (nullable = true)
 |-- types: string (nullable = true)
 |-- tz: string (nullable = true)
 |-- update

#### Null Values


In [4]:
# Count the nulls of every column in ONE aggregation pass instead of running
# a separate filter + count job per column.
# F.when(...) without an otherwise() yields NULL for non-matching rows and
# F.count() ignores NULLs, so each aggregate equals that column's null count
# (and is 0, not NULL, when the column has no nulls or the frame is empty).
null_counts_row = earth_quake_df.agg(*[
    F.count(F.when(earth_quake_df[column_name].isNull(), 1)).alias(column_name)
    for column_name in earth_quake_df.columns
]).first()

# Mapping of column name -> number of null entries, in DataFrame column order.
null_values = {
    column_name: null_counts_row[column_name]
    for column_name in earth_quake_df.columns
}

for column_name, null_count in null_values.items():
    print(column_name, '->', null_count)

coordinates -> 0
id -> 0
alert -> 178
cdi -> 164
code -> 0
detail -> 0
dmin -> 51
felt -> 164
gap -> 37
ids -> 0
mag -> 0
magType -> 0
mmi -> 175
net -> 0
nst -> 55
place -> 0
rms -> 0
sig -> 0
sources -> 0
status -> 0
time -> 0
title -> 0
tsunami -> 0
type -> 0
types -> 0
tz -> 178
updated -> 0
url -> 0


#### Duplicates

In [5]:
# Compare the row count before and after collapsing fully identical rows;
# the difference is the number of surplus duplicate rows.
deduplicated_df = earth_quake_df.distinct()

total_rows, unique_rows = earth_quake_df.count(), deduplicated_df.count()
duplicates_rows = total_rows - unique_rows

print(duplicates_rows)


0
