In [0]:
# Retrieve the dictionary of task values published by the upstream 'bronze' task.
bronze_output = dbutils.jobs.taskValues.get(taskKey='bronze', key='bronze_output')

# Pull out the individual parameters this notebook needs.
start_date = bronze_output.get('start_date', '')
bronze_adls = bronze_output.get('bronze_adls', '')
silver_adls = bronze_output.get('silver_adls', '')

# Fail fast when a parameter is missing or empty. Without this check the
# f-string paths built later would silently collapse to values such as
# "/_earthquake_data.json" (read) or "/<date>_earthquake_events" (overwrite-mode
# write), i.e. the job would read from / write to an unintended location.
missing_params = [
    param_name
    for param_name, param_value in (
        ('start_date', start_date),
        ('bronze_adls', bronze_adls),
        ('silver_adls', silver_adls),
    )
    if not param_value
]
if missing_params:
    raise ValueError(
        "Task value 'bronze_output' from task 'bronze' is missing required "
        f"parameter(s): {', '.join(missing_params)}"
    )

# Echo the retrieved values so they are visible in the job run output.
print(f"Start Date: {start_date}")
print(f"Bronze ADLS: {bronze_adls}")
print(f"Silver ADLS: {silver_adls}")

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import date, timedelta

In [0]:
# Load this run's earthquake JSON file from the bronze ADLS location.
# The 'multiline' option is enabled so that a JSON record may span several lines.
bronze_file = f"{bronze_adls}/{start_date}_earthquake_data.json"
df = (
    spark.read
    .format('json')
    .option('multiline', True)
    .load(bronze_file)
)

In [0]:
# Show the schema Spark inferred for the raw bronze JSON (no explicit schema was supplied on read).
df.printSchema()

root
 |-- geometry: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- alert: string (nullable = true)
 |    |-- cdi: double (nullable = true)
 |    |-- code: string (nullable = true)
 |    |-- detail: string (nullable = true)
 |    |-- dmin: double (nullable = true)
 |    |-- felt: long (nullable = true)
 |    |-- gap: double (nullable = true)
 |    |-- ids: string (nullable = true)
 |    |-- mag: double (nullable = true)
 |    |-- magType: string (nullable = true)
 |    |-- mmi: double (nullable = true)
 |    |-- net: string (nullable = true)
 |    |-- nst: long (nullable = true)
 |    |-- place: string (nullable = true)
 |    |-- rms: double (nullable = true)
 |    |-- sig: long (nullable = true)
 |    |-- sources: string (nullable = true)
 |    |-- status: string (nullable = true)
 | 

In [0]:
# Preview up to five raw records using the notebook's display renderer.
df.limit(5).display()

geometry,id,properties,type
"List(List(-116.4486667, 34.31, 8.73), Point)",ci41143408,"List(null, null, 41143408, https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci41143408&format=geojson, 0.05776, null, 70.0, ,ci41143408,, 1.15, ml, null, ci, 27, 22 km N of Yucca Valley, CA, 0.11, 20, ,ci,, reviewed, 1746229399520, M 1.2 - 22 km N of Yucca Valley, CA, 0, earthquake, ,nearby-cities,origin,phase-data,scitech-link,, null, 1746230479489, https://earthquake.usgs.gov/earthquakes/eventpage/ci41143408)",Feature
"List(List(-152.9035, 60.3, 120.4), Point)",ak0255m5hvxv,"List(null, null, 0255m5hvxv, https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ak0255m5hvxv&format=geojson, null, null, null, ,ak0255m5hvxv,, 1.6, ml, null, ak, null, 73 km WNW of Ninilchik, Alaska, 0.42, 39, ,ak,, automatic, 1746229201794, M 1.6 - 73 km WNW of Ninilchik, Alaska, 0, earthquake, ,origin,phase-data,, null, 1746229363527, https://earthquake.usgs.gov/earthquakes/eventpage/ak0255m5hvxv)",Feature
"List(List(-150.5436, 60.5389, 13.8), Point)",ak0255m5cfgq,"List(null, null, 0255m5cfgq, https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ak0255m5cfgq&format=geojson, null, null, null, ,ak0255m5cfgq,, 1.6, ml, null, ak, null, 12 km E of Sterling, Alaska, 0.46, 39, ,ak,, automatic, 1746227695463, M 1.6 - 12 km E of Sterling, Alaska, 0, earthquake, ,origin,phase-data,, null, 1746227819634, https://earthquake.usgs.gov/earthquakes/eventpage/ak0255m5cfgq)",Feature
"List(List(-155.298833333333, 19.9691666666667, 31.0), Point)",hv74664712,"List(null, 3.7, 74664712, https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=hv74664712&format=geojson, 0.05899, 70, 147.0, ,hv74664712,us7000pwsp,, 3.58, ml, 3.301, hv, 62, 6 km WSW of Laupāhoehoe, Hawaii, 0.14, 223, ,hv,us,, reviewed, 1746226780400, M 3.6 - 6 km WSW of Laupāhoehoe, Hawaii, 0, earthquake, ,dyfi,origin,phase-data,shakemap,, null, 1746233571825, https://earthquake.usgs.gov/earthquakes/eventpage/hv74664712)",Feature
"List(List(-150.5616, 60.546, 8.2), Point)",ak0255m4xbr6,"List(null, null, 0255m4xbr6, https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ak0255m4xbr6&format=geojson, null, null, null, ,ak0255m4xbr6,, 1.7, ml, null, ak, null, 11 km E of Sterling, Alaska, 0.55, 44, ,ak,, automatic, 1746225848984, M 1.7 - 11 km E of Sterling, Alaska, 0, earthquake, ,origin,phase-data,, null, 1746225964534, https://earthquake.usgs.gov/earthquakes/eventpage/ak0255m4xbr6)",Feature


In [0]:
# Flatten the nested 'geometry' and 'properties' structs into top-level columns.
# geometry.coordinates is read positionally: 0 -> longitude, 1 -> latitude, 2 -> elevation.
# NOTE(review): the USGS GeoJSON format describes the third coordinate as depth;
# the 'elevation' name is kept as-is here — confirm it matches downstream expectations.
coordinates = col('geometry.coordinates')
flattened_columns = [
    'id',
    coordinates.getItem(0).alias('longitude'),
    coordinates.getItem(1).alias('latitude'),
    coordinates.getItem(2).alias('elevation'),
    col('properties.title').alias('title'),
    col('properties.place').alias('place_description'),
    col('properties.sig').alias('sig'),
    col('properties.mag').alias('mag'),
    col('properties.magType').alias('magType'),
    col('properties.time').alias('time'),
    col('properties.updated').alias('updated'),
]
df = df.select(*flattened_columns)

In [0]:
# Check the column names and types of the flattened DataFrame produced by the select.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- title: string (nullable = true)
 |-- place_description: string (nullable = true)
 |-- sig: long (nullable = true)
 |-- mag: double (nullable = true)
 |-- magType: string (nullable = true)
 |-- time: long (nullable = true)
 |-- updated: long (nullable = true)



In [0]:
# Preview up to five flattened records using the notebook's display renderer.
df.limit(5).display()

id,longitude,latitude,elevation,title,place_description,sig,mag,magType,time,updated
ci41143408,-116.4486667,34.31,8.73,"M 1.2 - 22 km N of Yucca Valley, CA","22 km N of Yucca Valley, CA",20,1.15,ml,1746229399520,1746230479489
ak0255m5hvxv,-152.9035,60.3,120.4,"M 1.6 - 73 km WNW of Ninilchik, Alaska","73 km WNW of Ninilchik, Alaska",39,1.6,ml,1746229201794,1746229363527
ak0255m5cfgq,-150.5436,60.5389,13.8,"M 1.6 - 12 km E of Sterling, Alaska","12 km E of Sterling, Alaska",39,1.6,ml,1746227695463,1746227819634
hv74664712,-155.298833333333,19.9691666666667,31.0,"M 3.6 - 6 km WSW of Laupāhoehoe, Hawaii","6 km WSW of Laupāhoehoe, Hawaii",223,3.58,ml,1746226780400,1746233571825
ak0255m4xbr6,-150.5616,60.546,8.2,"M 1.7 - 11 km E of Sterling, Alaska","11 km E of Sterling, Alaska",44,1.7,ml,1746225848984,1746225964534


In [0]:
# For this dataset, a missing longitude, latitude or elevation is replaced with 0;
# non-null values are left unchanged.
for coordinate_name in ('longitude', 'latitude', 'elevation'):
    coordinate = col(coordinate_name)
    df = df.withColumn(
        coordinate_name,
        when(coordinate.isNull(), 0).otherwise(coordinate),
    )

In [0]:
# 'time' and 'updated' arrive as Unix epoch values in milliseconds:
# divide by 1000 to get seconds, then cast the result to a timestamp.
for epoch_column in ('time', 'updated'):
    epoch_seconds = col(epoch_column) / 1000
    df = df.withColumn(epoch_column, epoch_seconds.cast(TimestampType()))

In [0]:
# Persist the transformed DataFrame to the silver ADLS location as Parquet.
# 'overwrite' mode replaces any existing output at the same path.
write_path = f"{silver_adls}/{start_date}_earthquake_events"
silver_writer = df.write.format('parquet').mode('overwrite')
silver_writer.save(write_path)

In [0]:
# Publish the silver output path as the task value 'silver_output'
# so that later tasks in the job can retrieve it.
dbutils.jobs.taskValues.set(key='silver_output', value=write_path)