In [2]:
#expected synthea csv files:

#patients.csv
#allergies.csv
#careplans.csv
#conditions.csv
#devices.csv
#encounters.csv
#imaging_studies.csv
#immunizations.csv
#medications.csv
#observations.csv
#procedures.csv

In [1]:
import os

from synthea_functions import *

In [3]:
#directory containing the synthea csv output; passed to load_synthea
in_path = "sample_data/"

#directory the cohort csv files are written to
#NOTE(review): "PATH_TO" looks like a placeholder - set a real location before running the write cells
out_path = "PATH_TO/my_data/"

In [4]:
#read the synthea tables from in_path
#load_synthea also standardises headers and removes unwanted fields
#one dataframe is returned per source table, in the order unpacked below
(
    patient_df,
    allergy_df,
    careplan_df,
    conditions_df,
    device_df,
    encounter_df,
    imaging_study_df,
    immunization_df,
    medication_df,
    observation_df,
    procedure_df,
) = load_synthea(in_path)

In [5]:
#specify case criteria filters
#each filter is a set of PATIENT ids that can be passed to generate_cohort
#example filters:

#filter1: presence of Pulmonary emphysema or Chronic obstructive bronchitis (COPD)
conditions = {"Pulmonary emphysema (disorder)", "Chronic obstructive bronchitis (disorder)"}
has_copd = conditions_df["DESCRIPTION"].isin(conditions)
patient_copd = set(conditions_df.loc[has_copd, "PATIENT"].unique())

#filter2: female
is_female = patient_df["GENDER"] == "F"
patient_female = set(patient_df.loc[is_female, "PATIENT"])

#filter3: record of covid-19 immunization
#na=False: a missing DESCRIPTION counts as "no match"; without it the mask holds NaN
#and boolean indexing raises "Cannot mask with non-boolean array containing NA / NaN values"
#regex=False: the vaccine name is matched as a literal substring, not as a pattern
has_covid_jab = immunization_df["DESCRIPTION"].str.contains("SARS-COV-2", na=False, regex=False)
patient_covid_jab = set(immunization_df.loc[has_covid_jab, "PATIENT"].unique())

In [6]:
#show the docstring of generate_cohort
help(generate_cohort)

Help on function generate_cohort in module synthea_functions:

generate_cohort(patient_df, *patient_sets)
    return list of patient ids for cohort matching filters
    filters are inclusive: patients must meet all filter criteria
    exclusive cohort can be generated by running multiple filters separately and joining cohorts

    Args:
        patient_df: synthea patient dataframe
        patient_sets: at least one filter patient subset



In [7]:
#find patients matching criteria
#every argument after patient_df is a filter patient set; patients must meet all of them
case_patients = generate_cohort(patient_df, patient_copd)

Total patients meeting criteria: 6
Total patients not meeting criteria: 107


In [8]:
#show the docstring of generate_cohort_sample
help(generate_cohort_sample)

Help on function generate_cohort_sample in module synthea_functions:

generate_cohort_sample(case_N, control_N, case_patients, patient_df)
    return sample of case and control patients

    Args:
        case_N: number of patients in case group
        control_N: number of patients in control
        case_patients: all patients in filter cohort
        patient_df: synthea patient dataframe



In [9]:
#generate cohort as list of patient ids
#sample 5 patients for the case group and 10 for the control group
cohort = generate_cohort_sample(
    case_N=5,
    control_N=10,
    case_patients=case_patients,
    patient_df=patient_df,
)
print(f"N = {len(cohort)}")
cohort

N = 15


['a7027663-636a-f19d-ee80-5f489fc3f852',
 '6610278c-a2bc-88c3-3520-75ee83ec2dba',
 'e2aaf7c5-0111-09d3-635d-415b47698451',
 '85f833d7-8a9e-2c5c-9d99-583b7512598b',
 '94eee837-11f5-e5c3-375c-a9cbdf142d93',
 '7f099503-de5a-7861-34cc-9c26226513aa',
 'f9eb008f-7c7b-a8e1-712f-dc839dcd8d88',
 '04bf2325-37b7-d5c9-ae57-e9798e45f5e2',
 'df491e5e-4862-ddc4-55c7-8e6b1affe322',
 '2564e5fd-0052-3626-43d0-f76e85ec3403',
 '46254d13-07eb-7e05-7f6f-b62ddabff2ea',
 'f53300e1-9b12-9072-2cbc-4185feae639b',
 'b8db284c-762f-091e-0151-7635ab4c3904',
 'e4c4bc26-c452-e161-87a9-71ca5c9ba061',
 'babc54fc-b614-0d38-4f67-b561d7e30fef']

In [10]:
#restrict every table to the cohort
#filter_cohort_data also adds the case flag and the age at event field
#NOTE: the returned tables are bound to the same names as the unfiltered inputs
(
    patient_df,
    allergy_df,
    careplan_df,
    conditions_df,
    device_df,
    encounter_df,
    imaging_study_df,
    immunization_df,
    medication_df,
    observation_df,
    procedure_df,
) = filter_cohort_data(
    cohort,
    case_patients,
    patient_df,
    allergy_df,
    careplan_df,
    conditions_df,
    device_df,
    encounter_df,
    imaging_study_df,
    immunization_df,
    medication_df,
    observation_df,
    procedure_df,
)

In [11]:
#NOTE!! the join currently drops any duplicated patient records, keeping only the first instance
#(duplicates are resolved per patient, not across all data)
#this retains the maximum number of unique codes while minimising total size
#this is appropriate for testing integration but does not allow temporal-based queries,
#so recurrent findings or medications cannot be queried, and a headache at age 10
#might be detected as a diagnostic factor of a brain tumour 50 years later

#join tables (all except patient_df), with the source table indicated in CLASS
#DESCRIPTION represents a finding in every class except observation, where DESCRIPTION
#is the measure and the patient measurement is given by VALUE/UNITS/TYPE
ehr_df = join_synthea_tables(
    allergy_df,
    careplan_df,
    conditions_df,
    device_df,
    encounter_df,
    imaging_study_df,
    immunization_df,
    medication_df,
    observation_df,
    procedure_df,
)

In [12]:
#preview the first rows of the cohort patient table
patient_df.head()

Unnamed: 0,PATIENT,BIRTHDATE,DEAD,AGE,FIRST,LAST,MARITAL,RACE,GENDER,CASE
0,04bf2325-37b7-d5c9-ae57-e9798e45f5e2,1996-08-19,0,28,Royce974,Simonis280,S,white,F,0
1,46254d13-07eb-7e05-7f6f-b62ddabff2ea,1976-06-14,1,2,Nora0,Grady603,,asian,F,0
2,df491e5e-4862-ddc4-55c7-8e6b1affe322,1976-06-14,0,48,Crista774,Shanahan202,S,asian,F,0
3,e2aaf7c5-0111-09d3-635d-415b47698451,1955-07-16,1,63,Marlo857,Denesik803,M,white,F,1
4,6610278c-a2bc-88c3-3520-75ee83ec2dba,1973-02-19,0,51,Desmond566,Raynor401,S,black,M,1


In [19]:
#preview the first 15 rows of the joined table
ehr_df.head(15)

Unnamed: 0,CLASS,PATIENT,DATE,AGE,SYSTEM,DESCRIPTION,CODE,VALUE,UNITS,TYPE
0,allergy,e2aaf7c5-0111-09d3-635d-415b47698451,1956-04-17,0,SNOMED-CT,Allergy to substance (finding),419199007,,,
1,allergy,e2aaf7c5-0111-09d3-635d-415b47698451,1956-04-17,0,RxNorm,Lisinopril,29046,,,
2,allergy,e2aaf7c5-0111-09d3-635d-415b47698451,1956-04-17,0,SNOMED-CT,Shellfish (substance),735029006,,,
3,allergy,e4c4bc26-c452-e161-87a9-71ca5c9ba061,1994-06-03,0,SNOMED-CT,Allergy to substance (finding),419199007,,,
4,allergy,e4c4bc26-c452-e161-87a9-71ca5c9ba061,1994-06-03,0,RxNorm,Lisinopril,29046,,,
5,allergy,e4c4bc26-c452-e161-87a9-71ca5c9ba061,1994-06-03,0,SNOMED-CT,Wheat (substance),412071004,,,
0,careplan,04bf2325-37b7-d5c9-ae57-e9798e45f5e2,2018-07-28,21,SNOMED-CT,Head injury rehabilitation,47387005,,,
1,careplan,04bf2325-37b7-d5c9-ae57-e9798e45f5e2,2019-08-26,23,SNOMED-CT,Wound care,225358003,,,
2,careplan,df491e5e-4862-ddc4-55c7-8e6b1affe322,2004-08-23,28,SNOMED-CT,Lifestyle education regarding hypertension,443402002,,,
3,careplan,df491e5e-4862-ddc4-55c7-8e6b1affe322,2015-05-10,38,SNOMED-CT,Respiratory therapy,53950000,,,


In [None]:
#write joined cohort files
#create the output directory first: to_csv raises if the target directory does not exist
#an empty out_path means the current directory, as it did with plain string concatenation
os.makedirs(out_path or ".", exist_ok=True)
patient_df.to_csv(os.path.join(out_path, "patients.csv"), index=False)
ehr_df.to_csv(os.path.join(out_path, "ehr_records.csv"), index=False)

In [31]:
#write all cohort files
#create the output directory first: to_csv raises if the target directory does not exist
#an empty out_path means the current directory, as it did with plain string concatenation
os.makedirs(out_path or ".", exist_ok=True)

#output file name -> cohort table; file names match the expected synthea csv names
cohort_tables = {
    "patients.csv": patient_df,
    "allergies.csv": allergy_df,
    "careplans.csv": careplan_df,
    "conditions.csv": conditions_df,
    "devices.csv": device_df,
    "encounters.csv": encounter_df,
    "imaging_studies.csv": imaging_study_df,
    "immunizations.csv": immunization_df,
    "medications.csv": medication_df,
    "observations.csv": observation_df,
    "procedures.csv": procedure_df,
}
for file_name, table in cohort_tables.items():
    table.to_csv(os.path.join(out_path, file_name), index=False)