# Health Updates Challenge
This notebook covers only part 1 of the challenge and also serves as the setup for the workflow that is run by notebooks 1 and 2.

## Mounting the ADLS container
The `health-updates` container was created beforehand through the Azure portal.

In [0]:
# Intentionally left disabled: uncomment and run to remove the existing mount at
# /mnt/health-updates. NOTE(review): presumably needed before re-running the mount
# cell below against an already-mounted path — confirm.
#dbutils.fs.unmount('/mnt/health-updates')

In [0]:
# Service-principal credentials are read from a Databricks secret scope so that
# nothing sensitive is stored in the notebook itself.
# The scope is managed from the workspace UI at: <databricks url>#secrets/createScope
secret_scope = "databricks-secrets-nsp"

application_id = dbutils.secrets.get(scope=secret_scope, key="application-id")
directory_id = dbutils.secrets.get(scope=secret_scope, key="directory-id")
secret = dbutils.secrets.get(scope=secret_scope, key="secretv2")

In [0]:
# Storage coordinates used to build the abfss:// source URL and the DBFS mount path.
account_name = "datalakensp"          # storage account that hosts the container
container_name = "health-updates"     # container holding the bronze/silver folders
mount_point = "/mnt/health-updates"   # DBFS path the container is mounted under

In [0]:
# OAuth (service principal / client-credentials) settings passed to the ABFS driver
# so the cluster can authenticate against the storage account.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": application_id,
    "fs.azure.account.oauth2.client.secret": secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{directory_id}/oauth2/token",
}

# dbutils.fs.mount raises when the mount point is already in use, which would make
# this cell fail on every re-run (and on every workflow run after the first).
# Mount only when the path is not mounted yet so the cell is idempotent.
# To point the mount at a different source or refresh its credentials, unmount it
# first with dbutils.fs.unmount(mount_point) and re-run this cell.
already_mounted = any(m.mountPoint == mount_point for m in dbutils.fs.mounts())
if not already_mounted:
    dbutils.fs.mount(
        source=f"abfss://{container_name}@{account_name}.dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs,
    )

## Process the data
Process the 'health_status_updates.csv' file from the bronze folder into the silver folder and call the result 'health_data'. Add a column called 'updated_timestamp' that holds the current timestamp at which the data is inserted into the silver folder.

In [0]:
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

# Path of the raw CSV inside the bronze folder of the mounted container.
mount_point = '/mnt/health-updates'
health_data_path = f"{mount_point}/bronze/health_status_updates.csv"

# One (column name, Spark type, nullable) entry per column, in schema order.
# DATE_PROVIDED is read as a plain string here; it is parsed into a date later.
health_data_columns = [
    ("STATUS_UPDATE_ID", IntegerType(), False),
    ("PATIENT_ID", IntegerType(), False),
    ("DATE_PROVIDED", StringType(), False),
    ("FEELING_TODAY", StringType(), True),
    ("IMPACT", StringType(), True),
    ("INJECTION_SITE_SYMPTOMS", StringType(), True),
    ("HIGHEST_TEMP", DoubleType(), True),
    ("FEVERISH_TODAY", StringType(), True),
    ("GENERAL_SYMPTOMS", StringType(), True),
    ("HEALTHCARE_VISIT", StringType(), True),
]
health_data_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in health_data_columns]
)

# Read the CSV with the explicit schema (no inference); the first row is the header.
health_updates = (
    spark.read
    .schema(health_data_schema)
    .option("header", True)
    .csv(health_data_path)
)
health_updates.display()

In [0]:
from pyspark.sql.functions import to_date, current_timestamp

# Replace the string DATE_PROVIDED with a real date (source format MM/dd/yyyy) and
# append UPDATED_TIMESTAMP as the last column. All other columns pass through
# unchanged and keep their original order.
health_updates = (
    health_updates
    .withColumn('DATE_PROVIDED', to_date(health_updates['DATE_PROVIDED'], 'MM/dd/yyyy'))
    .withColumn("UPDATED_TIMESTAMP", current_timestamp())
)

## Creating the database and storing the data
'health_data' should be an external table in Delta Lake format, with the underlying data stored in the silver folder and the table itself registered in a new 'healthcare' database.

In [0]:
%sql
-- Create the database that will hold the table; IF NOT EXISTS makes this a no-op
-- when the database is already present, so the cell can be re-run.
CREATE DATABASE IF NOT EXISTS healthcare;

In [0]:
# Write the processed frame in Delta format to the silver folder and register it as
# healthcare.health_data. Supplying an explicit "path" keeps the data files at that
# location rather than in the metastore's default location; 'overwrite' replaces
# any existing contents of the table.
silver_table_path = f"{mount_point}/silver/health_data"
(
    health_updates.write
    .format("delta")
    .mode('overwrite')
    .option("path", silver_table_path)
    .saveAsTable("healthcare.health_data")
)