In [0]:
# Name of the schema (database) that the notebook's unqualified table names resolve against.
SCHEMA_NAME = "resp_health_db"
# Make it the session's current schema, then list its tables as a quick sanity check.
spark.sql(f"USE {SCHEMA_NAME}")
spark.sql("SHOW TABLES").show()


In [0]:
# Defines the `virus_daily` temp view: one row per (report_date, province) taken from
# `respiratory_activity`, with two aggregates:
#   - avg_positivity: mean of metric_value over rows whose metric_type is
#     'percent_positive' or 'positivity_rate' (NULL when the group has no such rows);
#   - total_clinical_count: sum of metric_value, with NULL values counted as 0, over
#     rows with any other metric_type (NULL when the group has no such rows).
# NOTE(review): a row whose metric_type is NULL matches neither CASE branch and so
# contributes to neither column - confirm that is intended.
virus_daily_sql = """
CREATE OR REPLACE TEMP VIEW virus_daily AS
SELECT
  report_date AS date,
  province,
  -- average percent positive across viruses
  AVG(CASE
        WHEN metric_type IN ('percent_positive', 'positivity_rate')
        THEN metric_value
      END) AS avg_positivity,
  -- total clinical counts (e.g. outbreaks, hospitalizations)
  SUM(CASE
        WHEN metric_type NOT IN ('percent_positive', 'positivity_rate')
        THEN COALESCE(metric_value, 0)
      END) AS total_clinical_count
FROM respiratory_activity
GROUP BY report_date, province
"""

# Register the view (replacing any existing one of the same name) and preview ten rows.
spark.sql(virus_daily_sql)
spark.sql("SELECT * FROM virus_daily LIMIT 10").show(truncate=False)


In [0]:
# Defines the `weather_daily` temp view: one row per (calendar date, province) taken
# from `weather_conditions`, averaging temperature_c, wind_chill_c and humidity_percent
# over every reading whose timestamp falls on that date.
# NOTE(review): DATE(timestamp) presumably truncates in the Spark session time zone -
# confirm the resulting dates line up with report_date in the virus view.
weather_daily_sql = """
CREATE OR REPLACE TEMP VIEW weather_daily AS
SELECT
  DATE(timestamp) AS date,
  province,
  AVG(temperature_c)      AS avg_temp,
  AVG(wind_chill_c)       AS avg_wind_chill,
  AVG(humidity_percent)   AS avg_humidity
FROM weather_conditions
GROUP BY DATE(timestamp), province
"""

# Register the view (replacing any existing one of the same name) and preview ten rows.
spark.sql(weather_daily_sql)
spark.sql("SELECT * FROM weather_daily LIMIT 10").show(truncate=False)


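Join the daily virus and weather views on date and province, then min-max scale each risk signal per province to a 0–100 score.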

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window


def _min_max_score(df, value_col, prefix, window):
    """Append a 0-100 min-max score of ``value_col`` computed over ``window``.

    Three columns are added, in this order: ``{prefix}_min``, ``{prefix}_max`` and
    ``{prefix}_risk_score``.

    Args:
        df: DataFrame that contains ``value_col``.
        value_col: Name of the numeric column to scale.
        prefix: Prefix used to name the three added columns.
        window: Window spec over which the min and max are taken.

    Returns:
        ``df`` with the three columns appended. The score is 0.0 when the window's
        min equals its max (avoids dividing by zero) and NULL when ``value_col`` is
        NULL for the row or for the whole window.
    """
    min_col = f"{prefix}_min"
    max_col = f"{prefix}_max"
    spread = F.col(max_col) - F.col(min_col)
    return (
        df
        .withColumn(min_col, F.min(value_col).over(window))
        .withColumn(max_col, F.max(value_col).over(window))
        .withColumn(
            f"{prefix}_risk_score",
            F.when(F.col(max_col) == F.col(min_col), 0.0)
             .otherwise((F.col(value_col) - F.col(min_col)) / spread * 100.0)
        )
    )


virus_df   = spark.table("virus_daily")
weather_df = spark.table("weather_daily")

# Inner join on date + province: only days present in BOTH views are scored
# (switch to how="left" to keep virus rows that have no weather reading).
joined = (
    virus_df.alias("v")
    .join(
        weather_df.alias("w"),
        on=["date", "province"],
        how="inner"
    )
)

# Every score below is scaled within its own province, not across provinces.
w_prov = Window.partitionBy("province")

# ---- VIRUS RISK: combine positivity + clinical counts, then min-max scale per province ----
# Missing components count as 0 so a row with only one of the two still gets a score.
# NOTE(review): avg_positivity looks like a percentage while total_clinical_count is a
# raw count; adding them unscaled presumably lets the larger-magnitude term dominate -
# confirm this weighting is intended.
virus_base = joined.withColumn(
    "virus_intensity",
    F.coalesce(F.col("avg_positivity"), F.lit(0.0)) +
    F.coalesce(F.col("total_clinical_count"), F.lit(0.0))
)

virus_scaled = _min_max_score(virus_base, "virus_intensity", "virus", w_prov)

# ---- COLD RISK: colder = higher risk, so negate the temperature before scaling ----
# Wind chill is preferred; plain temperature is the fallback when wind chill is NULL.
# If both are NULL the index, and therefore the cold score, stays NULL.
cold_base = virus_scaled.withColumn(
    "cold_index",
    -F.coalesce(F.col("avg_wind_chill"), F.col("avg_temp"))
)

cold_scaled = _min_max_score(cold_base, "cold_index", "cold", w_prov)

# ---- Combined risk: simple average of virus + cold ----
# NULL whenever either component score is NULL.
scored = cold_scaled.withColumn(
    "combined_risk_score",
    (F.col("virus_risk_score") + F.col("cold_risk_score")) / 2.0
)


In [0]:
# Bucket the combined score into a label: <25 Low, <50 Moderate, <75 High, else Very High.
# The NULL check must come first. A row with no usable wind chill or temperature has a
# NULL cold score and therefore a NULL combined score; every `<` comparison against
# NULL evaluates to NULL, which `when` treats as "not matched", so without the guard
# such rows fall through to `otherwise` and are mislabelled "Very High".
combined_score = F.col("combined_risk_score")

scored = scored.withColumn(
    "risk_category",
    F.when(combined_score.isNull(), F.lit(None).cast("string"))
     .when(combined_score < 25, F.lit("Low"))
     .when(combined_score < 50, F.lit("Moderate"))
     .when(combined_score < 75, F.lit("High"))
     .otherwise(F.lit("Very High"))
)

# Keep only the published columns and stamp each row with the time the scores were built.
risk_scores_df = (
    scored
    .select(
        F.col("date"),
        F.col("province"),
        F.col("virus_risk_score"),
        F.col("cold_risk_score"),
        F.col("combined_risk_score"),
        F.col("risk_category")
    )
    .withColumn("created_at", F.current_timestamp())
)

risk_scores_df.show(10, truncate=False)


In [0]:
# Persist the scored rows as the Delta table `risk_scores`; "overwrite" replaces
# whatever the table held before.
(
    risk_scores_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("risk_scores")
)

# Read a sample back from the table to confirm the write landed.
spark.sql("SELECT * FROM risk_scores LIMIT 20").show(truncate=False)


In [0]:
# Schema selection is repeated here, presumably so the cell can be run on its own
# without re-running the top of the notebook.
SCHEMA_NAME = "resp_health_db"
spark.sql(f"USE {SCHEMA_NAME}")

# Load the persisted scores table and render it. `display` is not defined in this
# notebook - presumably the notebook environment's built-in table renderer.
risk_scores_sdf = spark.table("risk_scores")
display(risk_scores_sdf)


In [0]:
# Select the schema again so the unqualified table name below resolves against it.
SCHEMA_NAME = "resp_health_db"
spark.sql(f"USE {SCHEMA_NAME}")

# Load the persisted scores table, preview five untruncated rows, then report its size.
risk_scores_sdf = spark.table("risk_scores")
risk_scores_sdf.show(n=5, truncate=False)

row_count = risk_scores_sdf.count()
print("Count:", row_count)
