 Extract International Classification of Diseases (ICD-10) codes from the World Health Organization (WHO) ICD API.
 Store the ICD-10 data as Parquet files in the `bronze` container of an Azure Data Lake Storage Gen2 (ADLS Gen2) account.

The client ID and client secret are stored in Azure Key Vault; on the Databricks side, the secret scope 'KeyVault-ICD-secrets' is created and linked to that vault.
To create the scope, open #secrets/createScope and enter the DNS name and Resource ID of the Key Vault, taken from Key Vault -> Settings -> Properties.


In [0]:

import requests
# to make http request to webservice
from pyspark.sql import SparkSession
# dataframes library 
from pyspark.sql.functions import current_date, lit
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType

# Databricks secret scope linked to an Azure Key Vault. The two keys read
# below ('ICD10-client-id' and 'IDC10-client-secret') are the names of the
# secrets stored within that scope.
secret_scope = "KeyVault-ICD-secrets"
client_id = dbutils.secrets.get(scope=secret_scope, key="ICD10-client-id")
client_secret = dbutils.secrets.get(scope=secret_scope, key="IDC10-client-secret")

base_url = 'https://id.who.int/icd/'

# Load date stamped onto every extracted record.
# NOTE: this rebinds `current_date`, shadowing the function of the same name
# imported from pyspark.sql.functions above; from here on the name is a
# datetime.date value, not the Spark column function.
today = datetime.now().date()
current_date = today



In [0]:
# Get an access token from the WHO ICD API token endpoint. The token is
# required for making authorized requests to fetch the actual ICD-10 data.
auth_url = 'https://icdaccessmanagement.who.int/connect/token'
auth_response = requests.post(
    auth_url,
    data={
        'client_id': client_id,
        'client_secret': client_secret,
        'grant_type': 'client_credentials',  # OAuth 2.0 grant type
    },
    # requests applies no timeout by default, so without this an unresponsive
    # token endpoint would block the notebook indefinitely.
    timeout=30,
)

if auth_response.status_code != 200:
    # Authentication failed: surface status and body for debugging.
    raise Exception(f"Failed to obtain access token: {auth_response.status_code} - {auth_response.text}")

access_token = auth_response.json().get('access_token')
if not access_token:
    # A 200 response without a token would otherwise yield the header
    # 'Bearer None' and only fail later with a confusing fetch error.
    raise Exception("Failed to obtain access token: response did not contain 'access_token'")

# Headers to be used for subsequent API calls to fetch ICD data
headers = {
    'Authorization': f'Bearer {access_token}',  # OAuth 2.0 bearer token
    'API-Version': 'v2',                        # Desired API version
    'Accept-Language': 'en',                    # Request data in English
}

In [0]:

# Functions to navigate and fetch ICD-10 codes
def fetch_icd_codes(url, timeout=30):
    """Fetch a single ICD API resource and return its decoded JSON body.

    The request is sent with the module-level ``headers`` (bearer token,
    API version and language).

    Args:
        url: URL of the resource to request.
        timeout: Seconds passed to ``requests.get`` as its timeout. requests
            has no default timeout, so without one a stalled connection
            would hang the whole hierarchy traversal.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        Exception: If the response status code is not 200; the message
            carries the status code and response text.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code} - {response.text}")
    return response.json()

def extract_codes(url):
    """Collect records for every leaf entity reachable from ``url``.

    Each entity is fetched with ``fetch_icd_codes``. An entity that has a
    'child' key is expanded into its child URLs and produces no record
    itself. An entity without 'child' produces one record, provided it has
    both 'code' and 'title'. Entities are visited depth-first, children in
    the order they are listed.

    Args:
        url: URL of the entity to start from.

    Returns:
        A list of dicts with the keys icd_code, icd_code_type,
        code_description, inserted_date, updated_date and is_current_flag.
        Both date fields are set to the module-level ``current_date``.
    """
    records = []
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped (and fetched) in their listed order.
    pending_urls = [url]
    while pending_urls:
        entity = fetch_icd_codes(pending_urls.pop())

        if 'child' in entity:
            pending_urls.extend(reversed(entity['child']))
            continue

        if 'code' not in entity or 'title' not in entity:
            continue

        records.append({
            'icd_code': entity['code'],
            'icd_code_type': 'ICD-10',
            'code_description': entity['title']['@value'],
            'inserted_date': current_date,
            'updated_date': current_date,
            'is_current_flag': True
        })
    return records



In [0]:
# Starting URL: the A00-A09 range of the ICD-10 2019 release.
# 'extract_codes' walks the hierarchy below this URL and returns one dict
# per leaf entity.
root_url = 'https://id.who.int/icd/release/10/2019/A00-A09'
icd_codes = extract_codes(root_url)

# Explicit Spark schema so column names and types stay fixed regardless of
# what the extracted data looks like. Every column is nullable.
schema_columns = (
    ("icd_code", StringType()),          # ICD code
    ("icd_code_type", StringType()),     # Type of ICD (e.g., "ICD-10")
    ("code_description", StringType()),  # Description of the code
    ("inserted_date", DateType()),       # Date the record was inserted
    ("updated_date", DateType()),        # Date the record was last updated
    ("is_current_flag", BooleanType()),  # Whether the record is current
)
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in schema_columns
])

# Dump the raw list of dictionaries collected from the API (debugging aid).
print(icd_codes)

# Build the Spark DataFrame from the collected records using the schema above.
df = spark.createDataFrame(data=icd_codes, schema=schema)


In [0]:
# Destination: the icd_codes folder in the bronze container.
file_path = "abfss://bronze@emrgen2.dfs.core.windows.net/icd_codes"

# Write as Parquet in append mode: rows are added to whatever already exists
# at the destination rather than replacing it.
parquet_writer = df.write.format("parquet").mode("append")
parquet_writer.save(file_path)