In [1]:
import requests
import pyspark
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.sql.functions import count
from pyspark.sql.functions import when
import json
from bunsen.stu3.bundles import load_from_directory, extract_entry

#### Get Request - HAPI FHIR API

In [2]:
response = requests.get("http://hapi.fhir.org/baseR4/Patient?address-state=California")

In [3]:
print(response)

<Response [200]>


#### Store JSON responses in FHIR directory

In [4]:
index = 0
while index <= 50:
    response = requests.get("http://hapi.fhir.org/baseR4/Patient")
    json_response = response.json()
    path = 'fhir/data' + str(index) + '.json'
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(json_response, f, ensure_ascii=False, indent=4)
    index += 1

#### Create Spark Session

In [5]:
spark = SparkSession.builder.appName('uds').getOrCreate()

#### Load bundles from FHIR directory

In [6]:
bundles = load_from_directory(spark, 'fhir')

In [7]:
patients = extract_entry(spark, bundles, 'patient').cache()
# The extract_entry method returns a Spark dataframe

In [8]:
df = patients.select('birthDate', 'gender')

#### Age & gender

In [9]:
df_with_age = df.withColumn('age', F.round( (F.datediff(F.current_date(), df.birthDate)) / 365) )

In [10]:
df_filtered =df_with_age.filter((df_with_age.age < 95) & (df_with_age.age >= 25) & (df_with_age.gender != 'unknown')) 

#### Map age to ageGroup values for UDS Table 3

In [11]:
df_mapped = df_filtered.withColumn("ageGroup", 
                        when(df_filtered.age.between(25, 29), "Ages 25-29")
                       .when(df_filtered.age.between(30, 34), "Ages 30-34")
                       .when(df_filtered.age.between(35, 39), "Ages 35-39")
                       .when(df_filtered.age.between(40, 44), "Ages 40-44")
                       .when(df_filtered.age.between(45, 49), "Ages 45-49")
                       .when(df_filtered.age.between(50, 54), "Ages 50-54")
                       .when(df_filtered.age.between(55, 59), "Ages 55-59")
                       .when(df_filtered.age.between(60, 64), "Ages 60-64")
                       .when(df_filtered.age.between(65, 69), "Ages 65-69")
                       .when(df_filtered.age.between(70, 74), "Ages 70-74")
                       .when(df_filtered.age.between(75, 79), "Ages 75-79")
                       .when(df_filtered.age.between(80, 84), "Ages 80-84")
                       .when(df_filtered.age >= 85, "Age 85 and over")
                       .otherwise(df_filtered.age))

#### Group by ageGroup

In [12]:
df_grouped = (df_mapped
    .groupby(df_mapped.ageGroup)
    .pivot("gender")
    .agg(count("birthDate"))
    ).sort('ageGroup').fillna(0)

In [14]:
df_grouped.show()

+----------+------+----+
|  ageGroup|female|male|
+----------+------+----+
|Ages 25-29|   120|   1|
|Ages 30-34|     0| 150|
|Ages 35-39|     0|  30|
|Ages 40-44|     2|   0|
|Ages 45-49|    30|   0|
|Ages 65-69|    30|   0|
+----------+------+----+

