In [0]:
# Schema (database) that every table in this notebook is read from / written to.
SCHEMA_NAME = "resp_health_db"

# Make it the session's current schema so unqualified table names resolve there.
spark.sql(f"USE {SCHEMA_NAME}")

# Read the active schema back from Spark as a confirmation of the switch.
active_schema = spark.sql("SELECT current_database()").first()[0]
print("Current schema:", active_schema)


In [0]:
import requests
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

spark: SparkSession  # annotation only; the notebook runtime supplies `spark`

# Hourly climate collection endpoint; the query string asks for at most
# 200 items, in English.
url = (
    "https://api.weather.gc.ca/collections/climate-hourly/items"
    "?limit=200&lang=en"
)

# Upper bound (seconds) on waiting for the server. Without a timeout,
# requests waits indefinitely and a stalled connection hangs the notebook.
REQUEST_TIMEOUT_S = 30

resp = requests.get(url, timeout=REQUEST_TIMEOUT_S)
resp.raise_for_status()  # fail loudly on any 4xx/5xx response
geojson = resp.json()

features = geojson.get("features", [])
print("Number of features:", len(features))

# Flatten to pandas: one record per feature, built from its "properties"
# mapping. `or {}` also covers a feature whose "properties" is present but
# null, which would otherwise put a None entry into the record list.
records = [feature.get("properties") or {} for feature in features]

pdf = pd.DataFrame(records)
pdf.head()


In [0]:
# Convert the flattened pandas frame into a Spark DataFrame; the schema is
# inferred by Spark since none is passed explicitly.
weather_df = spark.createDataFrame(pdf)

# Show a small sample, then list the column names available downstream.
sample_rows = weather_df.limit(10)
display(sample_rows)
print(weather_df.columns)


In [0]:
# Alias for the unmodified Spark frame; later cells restart from `raw_weather`
# when they rebuild `df`.
raw_weather = weather_df
from pyspark.sql import functions as F


Some values look like `2024-09-14 18:00:00 18`, i.e. they carry an extra trailing `18`.
The extra `18` is a garbage value and had to be removed.

In [0]:
from pyspark.sql import functions as F

# Working frame for the transformation cells; starts as the untouched raw data.
df = raw_weather


In [0]:
# Read LOCAL_HOUR as text, capture the first run of one or two digits, and
# store that capture as an integer column named hour_int.
# NOTE(review): a later cell reassigns `df` from `raw_weather`, so hour_int
# does not reach the final table — confirm whether that is intended.
hour_text = F.col("LOCAL_HOUR").cast("string")
hour_digits = F.regexp_extract(hour_text, r"(\d{1,2})", 1)

df = df.withColumn("hour_int", hour_digits.cast("int"))

# Side-by-side preview of the source columns and the derived value.
preview_columns = ["LOCAL_DATE", "LOCAL_HOUR", "hour_int"]
display(df.select(*preview_columns).limit(10))


In [0]:
from pyspark.sql import functions as F

# Restart from the raw frame, discarding columns added to `df` by earlier cells.
df = raw_weather

# Parse LOCAL_DATE into a timestamp column. to_timestamp is called with the
# column only (no format string), so Spark's default parsing applies.
# NOTE(review): this was previously described as "midnight for that day";
# that holds only if LOCAL_DATE carries no time-of-day part — confirm.
local_date_ts = F.to_timestamp(F.col("LOCAL_DATE"))
df = df.withColumn("timestamp", local_date_ts)

# Compare the source value with the parsed result.
display(df.select("LOCAL_DATE", "timestamp").limit(10))


In [0]:
# Wind-chill cut-offs for alert labelling: a reading at or below a cut-off
# receives that label (units presumably °C, going by the wind_chill_c name).
EXTREME_COLD_WINDCHILL = -30
VERY_COLD_WINDCHILL = -20


def _double_or_null(column_name):
    """Return the named column cast to double, with NaN replaced by NULL.

    The source frame was built from pandas, where a missing number may be
    represented as NaN. Spark treats NaN as an ordinary value, so it is
    mapped to NULL here to keep it out of the saved table.
    """
    as_double = F.col(column_name).cast("double")
    return F.when(F.isnan(as_double), F.lit(None).cast("double")).otherwise(as_double)


# One cast wind-chill expression feeds both the stored column and the alert
# rule, so the thresholds are always applied to the same numeric value that
# is written out (previously the rule read the raw, uncast WINDCHILL).
wind_chill = _double_or_null("WINDCHILL")

# Project the raw columns onto the target table layout:
#   timestamp, province, location, temperature_c, wind_chill_c,
#   humidity_percent, alert_level, raw_source, created_at
# Rows whose timestamp could not be parsed (NULL) are dropped.
clean_weather = (
    df
    .select(
        F.col("timestamp"),
        F.col("PROVINCE_CODE").cast("string").alias("province"),
        F.col("STATION_NAME").cast("string").alias("location"),
        _double_or_null("TEMP").alias("temperature_c"),
        wind_chill.alias("wind_chill_c"),
        _double_or_null("RELATIVE_HUMIDITY").alias("humidity_percent"),
        # First matching branch wins; a NULL or milder wind chill gets no label.
        F.when(wind_chill <= EXTREME_COLD_WINDCHILL, F.lit("Extreme Cold Warning"))
         .when(wind_chill <= VERY_COLD_WINDCHILL, F.lit("Very Cold"))
         .otherwise(F.lit(None).cast("string"))
         .alias("alert_level"),
        # Always NULL here; the column is kept so the output has all nine columns.
        F.lit(None).cast("string").alias("raw_source"),
        F.current_timestamp().alias("created_at"),
    )
    .where(F.col("timestamp").isNotNull())
)

display(clean_weather.limit(10))


In [0]:
# Append the cleaned rows to the Delta table `weather_conditions`.
# NOTE(review): the write mode is "append", so re-running this cell adds the
# same rows again — confirm whether duplicates are acceptable.
(
    clean_weather.write
    .format("delta")
    .mode("append")
    .saveAsTable("weather_conditions")
)

# Read a few rows back from the table, printed without column truncation.
saved_preview = spark.sql("SELECT * FROM weather_conditions LIMIT 10")
saved_preview.show(truncate=False)
