In [0]:
# 01_ingest_virus_phac - Cell 1
# Select the schema that the unqualified table names in this notebook resolve against.

SCHEMA_NAME = "resp_health_db"  # same as in 00_setup_and_tables

spark.sql(f"USE {SCHEMA_NAME}")

# Read the active schema back from Spark so the switch is visible in the cell output.
active_schema = spark.sql("SELECT current_database()").first()[0]
print("Current schema:", active_schema)


In [0]:
from pyspark.sql import functions as F

# Re-select the project schema so "lab_virus_raw" resolves even if this cell
# is executed without the first one.
SCHEMA_NAME = "resp_health_db"
spark.sql(f"USE {SCHEMA_NAME}")

# Raw laboratory table; later cells build on `raw_df`.
raw_df = spark.table("lab_virus_raw")

# Preview ten rows and print the exact column names - the rename step that
# follows depends on the raw header spelling.
lab_preview = raw_df.limit(10)
display(lab_preview)
print("Columns:", raw_df.columns)


In [0]:
# Give the two space-separated raw headers snake_case names so they are easy
# to reference below.
df = (
    raw_df
    .withColumnRenamed("Week ending date", "week_ending_date")
    .withColumnRenamed("Percent of tests positive", "percent_positive")
)

# Project the lab rows onto the eight columns that are written to
# respiratory_activity, one aliased expression per output column.
# NOTE(review): source_level is always "national" although province is copied
# from Jurisdiction - confirm the raw table never carries sub-national rows.
clean_df = df.select(
    F.lit("national").alias("source_level"),
    F.lit("PHAC_Laboratory").alias("data_source"),
    F.to_date(F.col("week_ending_date")).alias("report_date"),
    F.col("Jurisdiction").alias("province"),  # usually 'Canada'
    F.col("Virus").cast("string").alias("virus_type"),
    F.lit("percent_positive").alias("metric_type"),
    F.col("percent_positive").cast("double").alias("metric_value"),
    F.current_timestamp().alias("created_at"),
).filter(
    # Rows with a NULL report_date (missing or unparseable date) are dropped.
    F.col("report_date").isNotNull()
)

display(clean_df.limit(10))


In [0]:
# Append only the lab rows that are not already in respiratory_activity, so
# re-running this cell (or the whole notebook) does not duplicate data.
# created_at is left out of the comparison: it is stamped with
# current_timestamp() on every run and would never match a stored row.
lab_compare_cols = [c for c in clean_df.columns if c != "created_at"]

# listTables() inspects the current schema. If the target does not exist yet,
# fall through to a plain append, which creates it (the original behaviour).
lab_target_exists = any(
    t.name == "respiratory_activity" for t in spark.catalog.listTables()
)

if lab_target_exists:
    lab_existing = spark.table("respiratory_activity")
    # eqNullSafe makes NULL match NULL, so rows with missing values (for
    # example a NULL metric_value) are still recognised as already loaded.
    lab_already_loaded = [
        clean_df[c].eqNullSafe(lab_existing[c]) for c in lab_compare_cols
    ]
    # left_anti keeps the rows of clean_df that have no match in the target.
    lab_new_rows = clean_df.join(lab_existing, on=lab_already_loaded, how="left_anti")
else:
    lab_new_rows = clean_df

lab_new_rows.write.format("delta").mode("append").saveAsTable("respiratory_activity")


In [0]:
# Spot-check the target table after the lab append; truncate=False prints
# full cell values instead of shortening long strings.
activity_sample = spark.sql("SELECT * FROM respiratory_activity LIMIT 20")
activity_sample.show(truncate=False)


In [0]:
from pyspark.sql import functions as F

# Re-select the project schema so "clinical_virus_raw" resolves even if this
# cell is executed on its own.
SCHEMA_NAME = "resp_health_db"
spark.sql(f"USE {SCHEMA_NAME}")

# Raw clinical table; later cells build on `clinical_raw`.
clinical_raw = spark.table("clinical_virus_raw")

# Preview ten rows and print the exact column names before transforming.
clinical_preview = clinical_raw.limit(10)
display(clinical_preview)
print("Columns:", clinical_raw.columns)


In [0]:
# Give the space-separated date header a snake_case name.
clinical_df = clinical_raw.withColumnRenamed("Week ending date", "week_ending_date")

# Project the clinical rows onto the eight columns that are written to
# respiratory_activity, one aliased expression per output column.
# NOTE(review): source_level is always "national" although province is copied
# from Jurisdiction - confirm the raw table never carries sub-national rows.
clinical_clean = clinical_df.select(
    F.lit("national").alias("source_level"),
    F.lit("PHAC_Clinical").alias("data_source"),
    F.to_date(F.col("week_ending_date")).alias("report_date"),
    F.col("Jurisdiction").alias("province"),
    F.col("Virus").cast("string").alias("virus_type"),
    F.col("Measure").cast("string").alias("metric_type"),  # e.g. 'Outbreaks'
    F.col("Count").cast("double").alias("metric_value"),
    F.current_timestamp().alias("created_at"),
).filter(
    # Rows with a NULL report_date (missing or unparseable date) are dropped.
    F.col("report_date").isNotNull()
)

display(clinical_clean.limit(10))


In [0]:
# Append only the clinical rows that are not already in respiratory_activity,
# so re-running this cell (or the whole notebook) does not duplicate data.
# created_at is left out of the comparison: it is stamped with
# current_timestamp() on every run and would never match a stored row.
clinical_compare_cols = [c for c in clinical_clean.columns if c != "created_at"]

# listTables() inspects the current schema. If the target does not exist yet,
# fall through to a plain append, which creates it (the original behaviour).
clinical_target_exists = any(
    t.name == "respiratory_activity" for t in spark.catalog.listTables()
)

if clinical_target_exists:
    clinical_existing = spark.table("respiratory_activity")
    # eqNullSafe makes NULL match NULL, so rows with missing values (for
    # example a NULL metric_value) are still recognised as already loaded.
    clinical_already_loaded = [
        clinical_clean[c].eqNullSafe(clinical_existing[c])
        for c in clinical_compare_cols
    ]
    # left_anti keeps the rows of clinical_clean that have no match in the target.
    clinical_new_rows = clinical_clean.join(
        clinical_existing, on=clinical_already_loaded, how="left_anti"
    )
else:
    clinical_new_rows = clinical_clean

clinical_new_rows.write.format("delta").mode("append").saveAsTable("respiratory_activity")


In [0]:
# Print the 30 rows with the latest report_date from respiratory_activity;
# created_at is not selected.
latest_activity_sql = """
SELECT source_level, data_source, report_date, province, virus_type, metric_type, metric_value
FROM respiratory_activity
ORDER BY report_date DESC
LIMIT 30
"""
spark.sql(latest_activity_sql).show(truncate=False)
