In [0]:
# Optional environment bootstrap, left disabled on purpose. Uncomment to install
# `uv`, run `uv sync`, and restart the Python process so the changes take effect.
# !pip install uv
# !uv sync
# dbutils.library.restartPython()

In [0]:
# Target catalog and schema used to build every fully qualified table name below.
env_vars = dict(CATALOG="DEV", SCHEMA="DEV_MARIA")

In [0]:
# Fully qualified <catalog>.<schema>.<table> name of the combined table.
table_name = "{}.{}.sra_vigilance_db".format(env_vars["CATALOG"], env_vars["SCHEMA"])

In [0]:
# Save the 2024 frame under `table_name`, replacing whatever the table already holds.
# NOTE(review): `df_influenza_2024` is not defined in any cell visible in this
# notebook section; confirm it is created before this cell on a fresh kernel.
df_influenza_2024.write.saveAsTable(table_name, mode="overwrite")

In [0]:
# Shape of the 2024 frame: count() runs a Spark job, the column list is metadata only.
rows_2024 = df_influenza_2024.count()
cols_2024 = len(df_influenza_2024.columns)
print("num_rows = ", rows_2024)
print("num_cols = ", cols_2024)

In [0]:
from pyspark.sql.functions import col, count, sum, when

In [0]:
# Null count per column of the 2024 frame.
# All counts come from ONE aggregation (a single Spark job) instead of one
# filter + count job per column: when() yields 1 for null cells and NULL
# otherwise, and count() only counts the non-NULL markers.
null_counts_2024 = df_influenza_2024.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in df_influenza_2024.columns]
).first()

print("column , null_count")
for column in df_influenza_2024.columns:
    null_count = null_counts_2024[column]
    print(column, ", ", str(null_count))

In [0]:
# Print the schema of the 2024 frame (column names and types) for inspection.
df_influenza_2024.printSchema()

In [0]:
# Shape of the 2025 frame: count() runs a Spark job, the column list is metadata only.
rows_2025 = df_influenza_2025.count()
cols_2025 = len(df_influenza_2025.columns)
print("num_rows = ", rows_2025)
print("num_cols = ", cols_2025)

In [0]:
# Compare the column names of the two yearly frames before they are combined
# into the same table.
cols_df_influenza_2025 = set(df_influenza_2025.columns)
cols_df_influenza_2024 = set(df_influenza_2024.columns)

print(
    "Only in df_influenza_2025:",
    cols_df_influenza_2025.difference(cols_df_influenza_2024),
)
print(
    "Only in df_influenza_2024:",
    cols_df_influenza_2024.difference(cols_df_influenza_2025),
)
print("In both:", cols_df_influenza_2025.intersection(cols_df_influenza_2024))

In [0]:
# Add the 2025 rows to the table that already holds the 2024 data.
# NOTE(review): append mode is not idempotent; re-running this cell without
# re-running the overwrite cell first inserts the 2025 rows again.
df_influenza_2025.write.saveAsTable(table_name, mode="append")

In [0]:
# Load the combined table back as a DataFrame.
srag_db = spark.table(table_name)

In [0]:
# Shape of the combined table: count() runs a Spark job, the column list is metadata only.
rows_srag_db = srag_db.count()
cols_srag_db = len(srag_db.columns)
print("num_rows = ", rows_srag_db)
print("num_cols = ", cols_srag_db)

In [0]:
# Preview the first five rows with the columns in alphabetical order.
# limit(5) is applied before toPandas(), so only five rows reach the driver.
sorted_columns = sorted(srag_db.columns)
sorted_preview = srag_db.select(*sorted_columns).limit(5).toPandas()
display(sorted_preview)

In [0]:
# Names of the srag_db columns kept for the filtered table built further down
# (they are passed to srag_db.select). Entries that are commented out are
# deliberately left out of the selection.
selected_columns = [
    "NU_NOTIFIC",
    "DT_NOTIFIC",
    "DT_SIN_PRI",
    "SG_UF_NOT",
    "ID_MUNICIP",
    # "CO_MUN_NOT",
    "EVOLUCAO",
    "DT_EVOLUCA",
    "CLASSI_FIN",
    "NU_IDADE_N",
    "CS_SEXO",
    "FATOR_RISC",
    "CARDIOPATI",
    "DIABETES",
    "IMUNODEPRE",
    "OBESIDADE",
    "HOSPITAL",
    "DT_INTERNA",
    "UTI",
    "DT_ENTUTI",
    "DT_SAIDUTI",
    # "ID_UNIDADE",
    # "CO_UN_INTE",
    "SUPORT_VEN",
    "VACINA_COV",
    "DOSE_1_COV",
    "DOSE_2_COV",
    "DOSE_REF",
    "DOSE_2REF",
    "FAB_COV_1",
    "FAB_COV_2",
    # "FAB_COVRF",
    # "FAB_COVRF2",
    "FAB_RE_BI",
    "VACINA",
    "DT_UT_DOSE",
    "MAE_VAC",
    "DT_VAC_MAE"
]

In [0]:
# sorted(selected_columns)

In [0]:
# Keep only the selected columns and save the result as its own table,
# replacing any previous version of that table.
srag_filtered = srag_db.select(*selected_columns)
srag_filtered.write.saveAsTable(
    "{}.{}.srag_filtered".format(env_vars["CATALOG"], env_vars["SCHEMA"]),
    mode="overwrite",
)


In [0]:
# Step 1: collect the null count of every column in ONE aggregation (a single
# Spark job) instead of one filter + count job per column. when() yields 1 for
# null cells and NULL otherwise; count() only counts the non-NULL markers.
null_row = srag_filtered.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in srag_filtered.columns]
).first()
null_data = [(c, null_row[c]) for c in srag_filtered.columns]

# Step 2: turn the (column, null_count) pairs into a DataFrame, sorted so the
# columns with the most nulls come first, and show up to 50 of them.
null_df = (
    spark.createDataFrame(null_data, ["column", "null_count"])
    .orderBy(col("null_count").desc())
)
null_df.show(50)

In [0]:
# Null count per column of srag_filtered, printed in column order.
# All counts come from ONE aggregation (a single Spark job) instead of one
# filter + count job per column: when() yields 1 for null cells and NULL
# otherwise, and count() only counts the non-NULL markers.
null_counts_filtered = srag_filtered.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in srag_filtered.columns]
).first()

print("column , null_count")
for column in srag_filtered.columns:
    null_count = null_counts_filtered[column]
    print(column, ", ", str(null_count))

In [0]:
# Draw a roughly 10% random sample of srag_filtered (fixed seed for repeatability)
# and save it as its own table, replacing any previous version.
# withReplacement is left at its default (None), exactly as the explicit None did.
srag_sample = srag_filtered.sample(fraction=0.1, seed=42)
srag_sample.write.saveAsTable(
    "{}.{}.srag_filtered_sample".format(env_vars["CATALOG"], env_vars["SCHEMA"]),
    mode="overwrite",
)

In [0]:
# Preview the first five rows of the selected columns.
# limit(5) is applied before toPandas(), so only five rows reach the driver.
selected_preview = srag_db.select(selected_columns).limit(5).toPandas()
display(selected_preview)

In [0]:
# Shape of the sample: count() runs a Spark job, the column list is metadata only.
rows_sample = srag_sample.count()
cols_sample = len(srag_sample.columns)
print("num_rows = ", rows_sample)
print("num_cols = ", cols_sample)

In [0]:
# Print the first two rows of the sample as a text table.
srag_sample.show(n=2)

In [0]:
# Null count per column of the sample, printed in column order.
# All counts come from ONE aggregation (a single Spark job) instead of one
# filter + count job per column: when() yields 1 for null cells and NULL
# otherwise, and count() only counts the non-NULL markers.
null_counts_sample = srag_sample.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in srag_sample.columns]
).first()

print("column , null_count")
for column in srag_sample.columns:
    null_count = null_counts_sample[column]
    print(column, ", ", str(null_count))

In [0]:
# Print the distinct rows of srag_filtered (show() prints the first 20 by default).
# show() prints the table itself and returns None, so wrapping it in display()
# only resulted in a display(None) call.
srag_filtered.distinct().show()

## Metrics

## Request test

In [0]:
import pprint
import requests
import json
import pandas as pd

In [0]:
# Base URL of the CKAN Action API on the OpenDataSUS portal.
srag_2021_2024_link = "https://opendatasus.saude.gov.br/api/3/action"

# Ask the portal for the list of all its packages (datasets).
# The timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() turns an HTTP error into a clear exception instead of
# a confusing JSON decoding failure on an error page.
response = requests.get(f"{srag_2021_2024_link}/package_list", timeout=30)
response.raise_for_status()

# Decode the JSON body of CKAN's response into a dictionary.
response_dict = response.json()

# CKAN reports whether the action worked in the 'success' flag.
assert response_dict['success'] is True

datasets = response_dict['result']         # all package names returned by the portal
print("Total datasets: ", len(datasets))   # total number of datasets
datasets[-10:]

In [0]:
# Id of the 2021-2024 package:
package_id = "39a4995f-4a6e-440f-8c8f-b00c81fae0d0"

# NOTE(review): `base_url` is not used by the request below and points to a
# different CKAN portal (opendata.swiss); it looks like a leftover. It is kept
# unchanged in case a later cell still reads it - confirm and remove.
base_url = 'https://opendata.swiss/api/3/action/package_show?id='

# Construct the url for the package of interest on the OpenDataSUS API.
package_information_url = f"{srag_2021_2024_link}/package_show?id={package_id}"

# Make the HTTP request. The timeout keeps the cell from hanging forever if the
# server never answers; raise_for_status() surfaces HTTP errors explicitly.
package_information = requests.get(package_information_url, timeout=30)
package_information.raise_for_status()

# Decode the JSON body of CKAN's response into a dictionary.
package_dict = package_information.json()

# CKAN reports whether the action worked in the 'success' flag.
assert package_dict['success'] is True
# package_dict = package_dict['result']   # we only need the 'result' part from the dictionary


In [0]:
# Inspect the top-level keys of the package_show response.
package_dict.keys()

In [0]:
# Inspect the keys available under the 'result' entry of the response.
package_dict['result'].keys()

In [0]:
# Print the download URL of every resource attached to the package.
# Iterating over the list itself (rather than a hard-coded range(18)) avoids an
# IndexError when the package has fewer resources and does not miss any extras.
for resource in package_dict["result"]["resources"]:
    print(resource["url"])