In [0]:
import requests
from datetime import date
from pyspark.sql import SparkSession

# Root endpoint of the NPI Registry API; every request in this notebook is sent here.
base_url = "https://npiregistry.cms.hhs.gov/api/"

# Obtain a Spark session named "NPI Data". getOrCreate() hands back an already
# running session when there is one (often the case on Databricks).
session_builder = SparkSession.builder.appName("NPI Data")
spark = session_builder.getOrCreate()

# Run date as a datetime.date; stamped onto every extracted record as "refreshed_at".
current_date = date.today()

In [0]:
# Query the NPI Registry for a first page of providers and collect their NPI numbers.
# Outcome: `npi_list` always exists after this cell — it holds the NPI numbers on
# success and stays empty when the request fails.
params = {
    "version": "2.1",       # Specifies the API version
    "state": "CA",          # Example: Search for providers in California
    "city": "Los Angeles",  # Example: Specifically in Los Angeles
    "limit": 20,            # Limits the initial results to 20 for demonstration
}

# Defined up front so the detail-fetching loop can iterate over it even when the
# request below fails (previously that path left the name undefined -> NameError).
npi_list = []

# Explicit timeout (seconds): without one, requests waits indefinitely on a stalled
# connection. A timeout surfaces as requests.exceptions.Timeout.
response = requests.get(base_url, params=params, timeout=30)

# Check if the request was successful (HTTP status code 200)
if response.status_code == 200:
    npi_data = response.json()  # Parse the JSON response
    # Keep only the NPI 'number' of each result; a payload without a 'results' key
    # yields an empty list, and entries lacking 'number' are skipped.
    npi_list = [
        result["number"]
        for result in npi_data.get("results", [])
        if "number" in result
    ]
else:
    print(f"Failed to fetch data: {response.status_code} - {response.text}")

In [0]:
# Fetch the full registry record for every NPI in `npi_list` and flatten the fields
# of interest into one dictionary per record, collected in `detailed_results`.
detailed_results = []

for npi in npi_list:
    detail_params = {"version": "2.1", "number": npi}  # Parameters for specific NPI
    # Explicit timeout (seconds) so a stalled connection raises
    # requests.exceptions.Timeout instead of blocking the loop indefinitely.
    detail_response = requests.get(base_url, params=detail_params, timeout=30)

    if detail_response.status_code != 200:
        # Report the failed lookup rather than dropping it silently, then continue
        # with the remaining NPIs.
        print(
            f"Failed to fetch details for NPI {npi}: "
            f"{detail_response.status_code} - {detail_response.text}"
        )
        continue

    detail_data = detail_response.json()

    # A missing, null or empty 'results' entry simply produces no records.
    for result in detail_data.get("results") or []:
        npi_number = result.get("number")
        basic_info = result.get("basic") or {}  # tolerate a missing or null block

        # Individuals (NPI-1) carry their own name; every other enumeration type
        # (including a missing one) falls back to the authorized official's name.
        # .get avoids a KeyError when 'enumeration_type' is absent.
        if result.get("enumeration_type") == "NPI-1":
            fname = basic_info.get("first_name", "")
            lname = basic_info.get("last_name", "")
        else:
            fname = basic_info.get("authorized_official_first_name", "")
            lname = basic_info.get("authorized_official_last_name", "")

        # Append one flat record; absent text fields default to "".
        detailed_results.append(
            {
                "npi_id": npi_number,
                "first_name": fname,
                "last_name": lname,
                "position": basic_info.get("authorized_official_title_or_position", ""),
                "organisation_name": basic_info.get("organization_name", ""),
                "last_updated": basic_info.get("last_updated", ""),
                "refreshed_at": current_date,  # date this extract was run
            }
        )

In [0]:
# Persist the collected records, or report that there is nothing to persist.
if not detailed_results:
    print("No detailed results found.")
else:
    print(detailed_results)  # raw records, printed for debugging
    df = spark.createDataFrame(detailed_results)
    display(df)  # notebook rendering of the DataFrame

    # Parquet copy under /mnt/bronze/npi_extract/, replacing any existing output.
    parquet_writer = df.write.format("parquet").mode("overwrite")
    parquet_writer.save("/mnt/bronze/npi_extract/")

    # Delta table named "npi_extract", likewise overwritten on each run.
    delta_writer = df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable("npi_extract")