In [0]:
import requests
import json
import time
from datetime import datetime
from pyspark.sql.functions import col, to_date, current_timestamp

In [0]:
# --- Storage locations -------------------------------------------------
# Raw API responses are landed in a per-day folder under the landing zone;
# the bronze layer lives at its own path.
landing_zone_path = '/mnt/data/landing/weather'
bronze_table_path = '/mnt/data/bronze/weather'
bronze_table_name = 'weather.bronze_weather_raw'

# The folder name is today's date taken from datetime.now() (naive local time).
today_string = datetime.now().strftime("%Y-%m-%d")
landing_path_today = f'{landing_zone_path}/{today_string}'
print('Todays Landing Path: ', landing_path_today)

# --- Weatherstack API settings -----------------------------------------
# The access key is read from a Databricks secret scope instead of being
# hard-coded in the notebook.
api_key = dbutils.secrets.get(scope='weather-keys', key='api-key')
endpoint = 'https://api.weatherstack.com/current'
cities = [
    'Gramado, Brazil',
    'Punta Del Este, Uruguay',
    'Punta Arenas, Chile',
]

In [0]:
# Fetch the current conditions for every configured city and land each raw
# response as its own JSON file in today's landing folder.
#
# A failed call stops the notebook instead of landing an error payload as if
# it were weather data.

# requests applies no timeout by default; without one a stalled connection
# would block the job indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

for city in cities:
    try:
        response = requests.get(
            endpoint,
            params={'access_key': api_key, 'query': city},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as request_error:
        # requests puts the full URL (query string included, hence the
        # access key) into its exception messages, so re-raise without it.
        raise RuntimeError(
            f"Weatherstack request for '{city}' failed: {type(request_error).__name__}"
        ) from None

    # Checked by hand rather than with raise_for_status(), whose message also
    # contains the full request URL.
    if not response.ok:
        raise RuntimeError(
            f"Weatherstack request for '{city}' failed with HTTP {response.status_code}"
        )

    data = response.json()

    # Weatherstack signals API-level failures (invalid key, quota exceeded,
    # unknown location, ...) inside the body as {"success": false, "error": {...}}.
    if isinstance(data, dict) and data.get('success') is False:
        raise RuntimeError(
            f"Weatherstack returned an error for '{city}': {data.get('error')}"
        )

    # File name: normalised city name plus a second-resolution timestamp.
    city_str = city.replace(' ', '_').replace(',', '').lower()
    timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_name = f'{city_str}_{timestamp_str}.json'
    output_path = f'{landing_path_today}/{file_name}'

    # json.dumps without indent emits the payload on a single line.
    dbutils.fs.put(output_path, json.dumps(data), overwrite=True)
    print(f'File {file_name} written to {landing_path_today}')

    # Short pause between calls - presumably to stay under the API rate
    # limit; TODO confirm against the subscription plan.
    time.sleep(1)

In [0]:
# Stop the notebook early when today's landing folder has nothing to ingest.
#
# dbutils.fs.ls raises (java.io.FileNotFoundException) for a folder that does
# not exist instead of returning an empty list, so a missing folder is treated
# the same as an empty one. Any other listing error is re-raised unchanged.
try:
    files_in_landing = dbutils.fs.ls(landing_path_today)
except Exception as listing_error:
    if 'java.io.FileNotFoundException' not in str(listing_error):
        raise
    files_in_landing = []

# Deliberately outside the try block so the except clause above can never
# intercept whatever dbutils.notebook.exit uses to halt execution.
if not files_in_landing:
    dbutils.notebook.exit("No files to process in landing zone")

In [0]:
# Read today's landed files as plain text: the text source exposes a single
# string column named "value".
# NOTE(review): this reads the whole day's folder, and the write cell appends;
# a second run on the same day would therefore append the earlier files
# again - confirm the notebook runs at most once per day.
raw_df = spark.read.text(landing_path_today)

# Keep the payload untouched and attach ingestion and file-lineage columns
# taken from the hidden _metadata struct.
bronze_df = (
    raw_df
    .withColumnRenamed('value', 'raw_payload')
    .withColumn('ingestion_timestamp', current_timestamp())
    .withColumn('ingestion_date', to_date(col('ingestion_timestamp')))
    .withColumn('source_path', col('_metadata.file_path'))
    .withColumn('source_file_name', col('_metadata.file_name'))
    .withColumn('last_modified', col('_metadata.file_modification_time'))
)

display(bronze_df)

In [0]:
# Append this run's rows to the Delta files at the bronze path.
# .option("mergeSchema", "true") could be added here for schema evolution.
(
    bronze_df.write
    .format("delta")
    .mode("append")
    .save(bronze_table_path)
)

# count() is a separate Spark action executed after the write has finished.
print(f"{bronze_df.count()} rows added to {bronze_table_name}.")