In [1]:
#expected synthea csv files:

#patients.csv
#allergies.csv
#careplans.csv
#conditions.csv
#devices.csv
#encounters.csv
#imaging_studies.csv
#immunizations.csv
#medications.csv
#observations.csv
#procedures.csv

In [1]:
import os

from synthea_functions import *

In [2]:
#input directory holding the synthea csv output (passed to load_synthea)
#keep the trailing slash: later cells build file paths from these strings
in_path = "sample_data/"

#output directory for the cohort csv files
#placeholder value: replace with a real directory before running the write cells
out_path = "PATH_TO/my_data/"

In [3]:
#read the synthea csv tables from in_path
#load_synthea standardises headers and removes unwanted fields
#one dataframe is returned per table, in the order unpacked below
(
    patient_df,
    allergy_df,
    careplan_df,
    conditions_df,
    device_df,
    encounter_df,
    imaging_study_df,
    immunization_df,
    medication_df,
    observation_df,
    procedure_df,
) = load_synthea(in_path)

In [4]:
#specify case criteria filters
#each filter is a set of patient ids; pass one or more of these sets to generate_cohort
#example filters:

#filter1: presence of Pulmonary emphysema or Chronic obstructive bronchitis (COPD)
conditions = {"Pulmonary emphysema (disorder)", "Chronic obstructive bronchitis (disorder)"}
has_copd = conditions_df["DESCRIPTION"].isin(conditions)
patient_copd = set(conditions_df.loc[has_copd, "PATIENT"])

#filter2: female
is_female = patient_df["GENDER"] == "F"
patient_female = set(patient_df.loc[is_female, "PATIENT"])

#filter3: record of covid-19 immunization
#regex=False: match "SARS-COV-2" as a literal substring
#na=False: a missing DESCRIPTION counts as no match (an NA in the mask would make the row selection raise)
has_covid_jab = immunization_df["DESCRIPTION"].str.contains("SARS-COV-2", regex=False, na=False)
patient_covid_jab = set(immunization_df.loc[has_covid_jab, "PATIENT"])

In [5]:
#print the docstring of generate_cohort (signature and how filters are combined)
help(generate_cohort)

Help on function generate_cohort in module synthea_functions:

generate_cohort(patient_df, *patient_sets)
    return list of patient ids for cohort matching filters
    filters are inclusive: patients must meet all filter criteria
    exclusive cohort can be generated by running multiple filters separately and joining cohorts

    Args:
        patient_df: synthea patient dataframe
        patient_sets: at least one filter patient subset



In [6]:
#find patients matching criteria: a patient must be in every filter set passed in
#add further filter sets (e.g. patient_female, patient_covid_jab) as extra arguments
case_patients = generate_cohort(patient_df, patient_copd)

Total patients meeting criteria: 6
Total patients not meeting criteria: 107


In [7]:
#print the docstring of generate_cohort_sample (signature and argument meanings)
help(generate_cohort_sample)

Help on function generate_cohort_sample in module synthea_functions:

generate_cohort_sample(case_N, control_N, case_patients, patient_df)
    return sample of case and control patients

    Args:
        case_N: number of patients in case group
        control_N: number of patients in control
        case_patients: all patients in filter cohort
        patient_df: synthea patient dataframe



In [8]:
#sample 5 case and 10 control patients; the result is a list of patient ids
cohort = generate_cohort_sample(
    case_N=5,
    control_N=10,
    case_patients=case_patients,
    patient_df=patient_df,
)
print(f"N = {len(cohort)}")
cohort

N = 15


['f14616e7-ce9c-c83a-25f5-43d2aa2d419e',
 '6610278c-a2bc-88c3-3520-75ee83ec2dba',
 'e2aaf7c5-0111-09d3-635d-415b47698451',
 'a7027663-636a-f19d-ee80-5f489fc3f852',
 '85f833d7-8a9e-2c5c-9d99-583b7512598b',
 'cfc40ec8-a833-bc50-fb63-670a07b24f1f',
 'd1a6f11a-eb7b-1b69-3dcb-c6b14a785c1b',
 '65c9521d-f1c5-cd67-5fc3-c973d700d5d8',
 '1db976ed-12be-c7cd-7b51-466774a2ca90',
 'ce3ed294-0ab4-e15c-9fc1-a08e6b3fa419',
 '6294053e-da84-702d-4456-f34524d6813b',
 '46254d13-07eb-7e05-7f6f-b62ddabff2ea',
 '58d8dbb2-d3e2-1052-0507-a58ce50093df',
 'c90b29fd-e413-c728-54c9-3dfa2f8c9393',
 '7e3edb32-9c04-6cd2-beb3-125042e22d1f']

In [9]:
#restrict every table to cohort members
#filter_cohort_data also adds the case flag and the age at event field
#NOTE: the input dataframes are overwritten by their filtered versions;
#re-run the load cell first if the unfiltered tables are needed again
(
    patient_df,
    allergy_df,
    careplan_df,
    conditions_df,
    device_df,
    encounter_df,
    imaging_study_df,
    immunization_df,
    medication_df,
    observation_df,
    procedure_df,
) = filter_cohort_data(
    cohort,
    case_patients,
    patient_df,
    allergy_df, careplan_df, conditions_df, device_df, encounter_df,
    imaging_study_df, immunization_df, medication_df, observation_df, procedure_df,
)

In [10]:
#NOTE!! the join currently drops duplicated patient records, keeping only the first instance
#(duplicates are removed per patient, not across all data)
#this retains the maximum number of unique codes while minimising total size
#it is appropriate for testing integration but does not allow temporal-based queries:
#recurrent findings or medications cannot be queried, and a headache aged 10 might be
#detected as a diagnostic factor of a brain tumour 50 years later

#join all tables except patient_df; the source table of each row is given in CLASS
#DESCRIPTION is a finding in every class but observation, where DESCRIPTION is the measure
#and the patient measurement is given by VALUE/UNITS/TYPE
ehr_df = join_synthea_tables(
    allergy_df, careplan_df, conditions_df, device_df, encounter_df,
    imaging_study_df, immunization_df, medication_df, observation_df, procedure_df,
)

In [11]:
#preview the first rows of the cohort patient table
patient_df.head()

Unnamed: 0,PATIENT,BIRTHDATE,DEAD,AGE,FIRST,LAST,MARITAL,RACE,GENDER,CASE
0,1db976ed-12be-c7cd-7b51-466774a2ca90,1973-10-13,0,51,Delpha270,Rohan584,M,white,F,0
1,46254d13-07eb-7e05-7f6f-b62ddabff2ea,1976-06-14,1,2,Nora0,Grady603,,asian,F,0
2,e2aaf7c5-0111-09d3-635d-415b47698451,1955-07-16,1,63,Marlo857,Denesik803,M,white,F,1
3,c90b29fd-e413-c728-54c9-3dfa2f8c9393,1999-07-23,0,25,Romeo514,Stroman228,,white,M,0
4,ce3ed294-0ab4-e15c-9fc1-a08e6b3fa419,1980-08-23,0,44,Devin82,O'Reilly797,M,white,F,0


In [12]:
#preview the first 15 rows of the joined ehr table
ehr_df.head(15)

Unnamed: 0,CLASS,PATIENT,DATE,AGE,SYSTEM,DESCRIPTION,CODE,VALUE,UNITS,TYPE
0,allergy,1db976ed-12be-c7cd-7b51-466774a2ca90,1974-07-25,0,SNOMED-CT,Allergy to substance (finding),419199007,,,
1,allergy,1db976ed-12be-c7cd-7b51-466774a2ca90,1974-07-25,0,RxNorm,Lisinopril,29046,,,
2,allergy,1db976ed-12be-c7cd-7b51-466774a2ca90,1974-07-25,0,SNOMED-CT,Wheat (substance),412071004,,,
3,allergy,e2aaf7c5-0111-09d3-635d-415b47698451,1956-04-17,0,SNOMED-CT,Allergy to substance (finding),419199007,,,
4,allergy,e2aaf7c5-0111-09d3-635d-415b47698451,1956-04-17,0,RxNorm,Lisinopril,29046,,,
5,allergy,e2aaf7c5-0111-09d3-635d-415b47698451,1956-04-17,0,SNOMED-CT,Shellfish (substance),735029006,,,
0,careplan,1db976ed-12be-c7cd-7b51-466774a2ca90,1974-07-09,0,SNOMED-CT,Self-care interventions (procedure),384758001,,,
1,careplan,1db976ed-12be-c7cd-7b51-466774a2ca90,2016-04-16,42,SNOMED-CT,Routine antenatal care,134435003,,,
2,careplan,1db976ed-12be-c7cd-7b51-466774a2ca90,2021-11-06,48,SNOMED-CT,Lifestyle education regarding hypertension,443402002,,,
3,careplan,e2aaf7c5-0111-09d3-635d-415b47698451,1956-04-06,0,SNOMED-CT,Self-care interventions (procedure),384758001,,,


In [None]:
#write joined cohort files
#to_csv does not create missing directories, so make sure out_path exists first
#(an empty out_path means the current directory)
os.makedirs(out_path or ".", exist_ok=True)

#os.path.join gives the right path whether or not out_path ends with a slash
patient_df.to_csv(os.path.join(out_path, "patients.csv"), index=False)
ehr_df.to_csv(os.path.join(out_path, "ehr_records.csv"), index=False)

In [None]:
#write all cohort files
#to_csv does not create missing directories, so make sure out_path exists first
#(an empty out_path means the current directory)
os.makedirs(out_path or ".", exist_ok=True)

#output file name -> cohort table; patients.csv is also written by the joined-files cell
cohort_tables = {
    "patients.csv": patient_df,
    "allergies.csv": allergy_df,
    "careplans.csv": careplan_df,
    "conditions.csv": conditions_df,
    "devices.csv": device_df,
    "encounters.csv": encounter_df,
    "imaging_studies.csv": imaging_study_df,
    "immunizations.csv": immunization_df,
    "medications.csv": medication_df,
    "observations.csv": observation_df,
    "procedures.csv": procedure_df,
}

#os.path.join gives the right path whether or not out_path ends with a slash
for file_name, table_df in cohort_tables.items():
    table_df.to_csv(os.path.join(out_path, file_name), index=False)