In [1]:
!pip install wandb pathling pyspark nibabel minio requests boto3



In [2]:
import wandb
import os
from pathling.coding import Coding
from pyspark.sql import functions
from pathling.udfs import subsumes
from utils import get_spark_session, get_pathling_context, \
load_resources, extract_patient_id, extract_subject_id, save_artifact

# Setup wandb and S3-compatible storage access.
#
# The values below are only *defaults* for the demo: `setdefault` leaves any
# variable that is already exported in the kernel's environment untouched, so
# real endpoints and credentials can be injected from outside the notebook
# instead of being overwritten by these literals.
# NOTE(review): the API key and access keys look like local demo credentials
# (wandb / minio hosts) — never commit real secrets here; confirm.
_DEMO_ENV_DEFAULTS = {
    'WANDB_PROJECT': 'diabetes-vaccines-notebook',
    'WANDB_NOTEBOOK_NAME': 'Untitled.ipynb',
    'WANDB_BASE_URL': 'http://wandb:8082',
    'WANDB_API_KEY': 'local-f68b4b71af977015844cb5987382d102a493b0eb',
    'AWS_S3_ENDPOINT_URL': 'http://minio:9000',
    'AWS_ACCESS_KEY_ID': 'minio',
    'AWS_SECRET_ACCESS_KEY': 'minio123',
}
for _env_name, _env_default in _DEMO_ENV_DEFAULTS.items():
    os.environ.setdefault(_env_name, _env_default)

# Authenticate against the configured W&B server and open the run that the
# later cells log to, save artifacts under, and finally finish.
wandb.login()
run = wandb.init(job_type="data_convert")

[34m[1mwandb[0m: Network error (ConnectionError), entering retry loop.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668663200107403, max=1.0…

Problem at: /tmp/ipykernel_4383/1564076644.py 19 <module>


KeyboardInterrupt: 

In [None]:
# Demo configuration: everything a reader might want to tune lives here.
PROJECT_NAME = os.environ["WANDB_PROJECT"]
ARTIFACT_NAME = "training-data"

# FHIR resource types loaded for the analysis
RESOURCES = ["Patient", "Immunization", "Condition"]

# Birth-date window used to select the patient cohort
START_BIRTH_DATE, END_BIRTH_DATE = "1920-01-01", "2020-01-01"

# Codes of interest
IMMUNIZATION_CODE = "08"  # Hepatitis B vaccine code
CONDITION_CODE = "73211009"  # Diabetes SNOMED code

# Start Spark, wrap it in a Pathling context, and load the resources
# (all three helpers come from the local `utils` module).
spark = get_spark_session()
pc = get_pathling_context(spark)
resource_data = load_resources(pc, RESOURCES)

In [None]:
###### 1. FILTERING INITIAL PATIENTS ######

# Keep only patients born strictly inside the configured birth-date window
# (both bounds are exclusive).
patients = resource_data['Patient']
born_after_start = patients.birthDate > START_BIRTH_DATE
born_before_end = patients.birthDate < END_BIRTH_DATE
filtered_patients = patients.filter(born_after_start & born_before_end)

# Quick look at the resulting cohort
filtered_patients.select('gender', 'birthDate').show()

In [None]:
###### 2. SELECTING HEP B IMMUNIZATIONS ######

# Joining filtered patients with immunizations. The left outer join keeps
# patients that have no Immunization resource at all (their vaccineCode is null).
immunizations = resource_data['Immunization']
patients_immunizations = filtered_patients.join(
    immunizations.select('patient', 'vaccineCode'),
    filtered_patients.id == extract_patient_id(immunizations),
    'left_outer'
)

# Selecting patient id and their vaccine coding: one row per (patient, code).
# `explode_outer` is required here: plain `explode` emits no row for a null
# array, which would silently drop every patient without immunizations and
# undo the left outer join above (section 3 already uses `explode_outer`).
patients_immunizations = patients_immunizations.select(
    filtered_patients.id,
    functions.explode_outer(
        patients_immunizations.vaccineCode.coding.getField('code')
    ).alias('code')
)

# Checking patients who received the hepatitis B vaccine.
# The null-safe comparison yields False (not null) when `code` is null, so
# patients without any immunization record are flagged as not vaccinated.
# NOTE(review): only the code value is compared, not the coding system — confirm
# that no other code system in the data uses the same value.
patients_immunizations = patients_immunizations.withColumn(
    'is_vaccinated',
    patients_immunizations.code.eqNullSafe(IMMUNIZATION_CODE)
)

patients_immunizations.show()

In [None]:
###### 3. SELECTING DIABETIC PATIENTS ######

# Left-join the filtered patients to their Condition resources
conditions = resource_data['Condition']
condition_columns = conditions.select('subject', 'code')
subject_is_patient = filtered_patients.id == extract_subject_id(conditions)
patients_conditions = filtered_patients.join(
    condition_columns, subject_is_patient, 'left_outer'
)

# One row per (patient id, condition coding); explode_outer still emits a row
# (with a null coding) when there is nothing to explode
exploded_codings = functions.explode_outer(
    conditions.code.getField('coding')
).alias('codings')
patients_conditions = patients_conditions.select(
    filtered_patients.id, exploded_codings
)

# Flag each coding using Pathling's `subsumes` UDF against the diabetes concept
condition_coding = Coding(system='http://snomed.info/sct', code=CONDITION_CODE)
diabetes_flag = subsumes(condition_coding, patients_conditions.codings)
patients_conditions = patients_conditions.withColumn('has_diabetes', diabetes_flag)

patients_conditions.show()

In [None]:
###### 4. JOINING THE DIFFERENT DATA SOURCES ######

# `patients_immunizations` has one row per (patient, vaccine code) and
# `patients_conditions` one row per (patient, condition coding). Grouping those
# rows directly would count a patient in BOTH the True and the False cell as
# soon as they have, e.g., a Hep B shot and any other vaccine. So each flag is
# first collapsed to one row per patient: True if any of their rows is True.
# (max over booleans orders False < True and ignores nulls.)
vaccinated_by_patient = patients_immunizations.groupBy('id').agg(
    functions.max('is_vaccinated').alias('is_vaccinated')
)
diabetes_by_patient = patients_conditions.groupBy('id').agg(
    functions.max('has_diabetes').alias('has_diabetes')
)

# Joining vaccine and diabetes flags to identify unvaccinated high risk patients.
# Starting from the filtered cohort keeps patients that have no immunization
# and/or no condition rows; a missing or null flag means "no matching record",
# which is treated as False.
two_by_two = (
    filtered_patients.select('id').distinct()
    .join(vaccinated_by_patient, 'id', 'left_outer')
    .join(diabetes_by_patient, 'id', 'left_outer')
    .withColumn(
        'is_vaccinated',
        functions.coalesce(functions.col('is_vaccinated'), functions.lit(False))
    )
    .withColumn(
        'has_diabetes',
        functions.coalesce(functions.col('has_diabetes'), functions.lit(False))
    )
)

# Aggregating data to get a 2x2 table of diabetes (Y/N) and vaccination (Y/N);
# every patient now falls into exactly one cell.
aggregate = two_by_two.groupBy('is_vaccinated', 'has_diabetes').agg(
    functions.countDistinct('id')
)

# Display the results
aggregate.show()

In [None]:
# Optional: collect the aggregate to pandas and log it to the run as a
# wandb Table so it can be viewed in the W&B UI
aggregate_pdf = aggregate.toPandas()
table = wandb.Table(dataframe=aggregate_pdf)
run.log({"hep_b_vaccination_in_diabetics": table})

In [None]:
# Write the results to Minio in Parquet format using Spark
# NOTE(review): the storage format and destination are decided inside
# `utils.save_artifact`, which is not visible here — confirm there. The call
# passes the aggregate table, the W&B project name, the artifact name and the
# active run.
save_artifact(aggregate, PROJECT_NAME, ARTIFACT_NAME, run)

In [None]:
# Mark the W&B run opened by `wandb.init(...)` in the setup cell as finished.
run.finish()