In [1]:
import pandas as pd
import glob

In [2]:
# Collect every JSON file in the dataset folder. glob returns paths in
# filesystem-dependent order, so sort them to make the row order of the
# combined frame (and everything derived from it) reproducible across runs.
file_paths = sorted(glob.glob("./health_data/healthcare-datasets/*.json"))
# One DataFrame per file, stacked with a fresh 0..n-1 index.
dfs = [pd.read_json(file_path) for file_path in file_paths]
combined_df = pd.concat(dfs, ignore_index=True)
# Flatten the nested dicts in the `entry` column into dotted column names
# such as 'resource.resourceType'.
entries = pd.json_normalize(combined_df['entry'])

In [3]:
# Distinct resource types present in the flattened entries, in order of first appearance.
pd.unique(entries['resource.resourceType'])

array(['Patient', 'Encounter', 'Condition', 'Observation',
       'MedicationRequest', 'Immunization', 'Procedure', 'CarePlan',
       'AllergyIntolerance', 'DiagnosticReport'], dtype=object)

In [4]:
# Keep only the Condition rows and renumber them 0..n-1. The reset is chained
# (rather than done with inplace=True on the filtered frame) so the cell never
# mutates a slice of `entries` and gives the same result every time it is re-run.
condition_data = (
    entries[entries['resource.resourceType'] == 'Condition']
    .reset_index(drop=True)
)
condition_data.head()

Unnamed: 0,fullUrl,resource.id,resource.text.status,resource.text.div,resource.extension,resource.identifier,resource.name,resource.telecom,resource.gender,resource.birthDate,...,resource.deceasedDateTime,resource.valueCodeableConcept.coding,resource.dispenseRequest.numberOfRepeatsAllowed,resource.dispenseRequest.quantity.value,resource.dispenseRequest.quantity.unit,resource.dispenseRequest.expectedSupplyDuration.value,resource.dispenseRequest.expectedSupplyDuration.unit,resource.dispenseRequest.expectedSupplyDuration.system,resource.dispenseRequest.expectedSupplyDuration.code,resource.multipleBirthInteger
0,urn:uuid:dff431d5-79b9-40c0-91b6-e3032cf93a20,dff431d5-79b9-40c0-91b6-e3032cf93a20,,,,,,,,,...,,,,,,,,,,
1,urn:uuid:be9865e1-76d5-4e23-b94d-7e3da3c0e762,be9865e1-76d5-4e23-b94d-7e3da3c0e762,,,,,,,,,...,,,,,,,,,,
2,urn:uuid:3f257d68-ea56-42b5-b926-74165e287c62,3f257d68-ea56-42b5-b926-74165e287c62,,,,,,,,,...,,,,,,,,,,
3,urn:uuid:bda57d2c-732a-4f5b-a0bc-f9b49ca45cb0,bda57d2c-732a-4f5b-a0bc-f9b49ca45cb0,,,,,,,,,...,,,,,,,,,,
4,urn:uuid:41b412e2-fc7f-4f0d-8a31-cf793c1426e5,41b412e2-fc7f-4f0d-8a31-cf793c1426e5,,,,,,,,,...,,,,,,,,,,


In [5]:
# The identifier of each Condition row (the flattened 'resource.id' column).
condition_id = condition_data.loc[:, 'resource.id']
condition_id

0      dff431d5-79b9-40c0-91b6-e3032cf93a20
1      be9865e1-76d5-4e23-b94d-7e3da3c0e762
2      3f257d68-ea56-42b5-b926-74165e287c62
3      bda57d2c-732a-4f5b-a0bc-f9b49ca45cb0
4      41b412e2-fc7f-4f0d-8a31-cf793c1426e5
                       ...                 
146    ff5133e8-6d61-46f4-aa79-446533960eac
147    e827d0e4-a1d9-49e3-9edf-038319e25ed3
148    fa532314-d31b-461c-b216-d24c99c6d27c
149    c3652015-baae-41ca-b044-84c2d9e252df
150    3a043586-6429-49e3-8bea-1db4ba5caed9
Name: resource.id, Length: 151, dtype: object

In [8]:
# Optional column: DataFrame.get falls back to None (its default) instead of
# raising when 'resource.clinicalStatus' is absent.
condition_clinical_status = condition_data.get('resource.clinicalStatus')
condition_clinical_status

0      active
1      active
2      active
3      active
4      active
        ...  
146    active
147    active
148    active
149    active
150    active
Name: resource.clinicalStatus, Length: 151, dtype: object

In [9]:
# Optional column: None when 'resource.verificationStatus' is not present.
condition_verification_status = condition_data.get('resource.verificationStatus')
condition_verification_status

0      confirmed
1      confirmed
2      confirmed
3      confirmed
4      confirmed
         ...    
146    confirmed
147    confirmed
148    confirmed
149    confirmed
150    confirmed
Name: resource.verificationStatus, Length: 151, dtype: object

In [11]:
# Each cell of 'resource.code.coding' holds a list of coding dicts (or is
# missing). Take the `display` text of the FIRST coding per row so the result
# keeps exactly one value per Condition and stays aligned with
# condition_data's index. Exploding the lists instead would produce one row
# per coding and shift/duplicate rows whenever a Condition carries several
# codings, and would fail on missing cells.
condition_name_raw = condition_data.get('resource.code.coding')
if condition_name_raw is not None:
    condition_name = condition_name_raw.map(
        lambda codings: (
            codings[0].get('display')
            if isinstance(codings, list) and codings and isinstance(codings[0], dict)
            else None
        )
    ).rename('display')
else:
    # Column absent altogether: there are no names to extract.
    condition_name = None
condition_name


0       Streptococcal sore throat (disorder)
1         Acute viral pharyngitis (disorder)
2                 Viral sinusitis (disorder)
3                Acute bronchitis (disorder)
4                           Normal pregnancy
                       ...                  
146                             Hypertension
147    Concussion with loss of consciousness
148               Viral sinusitis (disorder)
149                      Fracture of forearm
150               Viral sinusitis (disorder)
Name: display, Length: 151, dtype: object

In [12]:
# Optional column: onset timestamps as stored in the source, or None if the column is absent.
condition_onset_date_time = condition_data.get('resource.onsetDateTime')
condition_onset_date_time

0      2010-07-31T13:21:25-04:00
1      2010-09-13T01:03:45-04:00
2      2011-10-24T04:05:30-04:00
3      2015-01-05T19:01:24-05:00
4      2015-05-08T20:38:47-04:00
                 ...            
146    1997-06-15T17:23:53-04:00
147    2009-12-24T08:58:07-05:00
148    2012-02-17T18:52:24-05:00
149    2014-02-07T22:43:10-05:00
150    2014-02-05T10:51:00-05:00
Name: resource.onsetDateTime, Length: 151, dtype: object

In [13]:
# Optional column: abatement timestamps (NaN where a row has none), or None
# if the column is absent.
condition_abatement_date_time = condition_data.get('resource.abatementDateTime')
condition_abatement_date_time


0      2010-08-13T01:03:45-04:00
1      2010-09-25T17:29:09-04:00
2      2011-11-12T01:05:15-05:00
3      2015-01-28T12:23:23-05:00
4      2015-12-04T20:38:47-05:00
                 ...            
146                          NaN
147    2010-03-06T03:41:56-05:00
148    2012-03-05T10:51:00-05:00
149    2014-05-18T12:43:32-04:00
150    2014-03-02T21:47:33-05:00
Name: resource.abatementDateTime, Length: 151, dtype: object

In [14]:
# Subject references carry a 'urn:uuid:' marker; remove that literal text to
# keep the bare patient id. The pattern has no regex metacharacters, so plain
# (regex=False) matching gives the same result as the default call.
condition_patient_id = (
    condition_data['resource.subject.reference']
    .str.replace('urn:uuid:', '', regex=False)
)
condition_patient_id

0      65d12976-9588-4cfa-a795-216302a2ece9
1      65d12976-9588-4cfa-a795-216302a2ece9
2      65d12976-9588-4cfa-a795-216302a2ece9
3      65d12976-9588-4cfa-a795-216302a2ece9
4      65d12976-9588-4cfa-a795-216302a2ece9
                       ...                 
146    a5399e95-981c-4a78-8019-873662fc7901
147    a5399e95-981c-4a78-8019-873662fc7901
148    a5399e95-981c-4a78-8019-873662fc7901
149    a5399e95-981c-4a78-8019-873662fc7901
150    a5399e95-981c-4a78-8019-873662fc7901
Name: resource.subject.reference, Length: 151, dtype: object

In [15]:
# Context references carry a 'urn:uuid:' marker; remove that literal text to
# keep the bare encounter id. The pattern has no regex metacharacters, so plain
# (regex=False) matching gives the same result as the default call.
condition_encounter_id = (
    condition_data['resource.context.reference']
    .str.replace('urn:uuid:', '', regex=False)
)
condition_encounter_id

0      a3e3bd43-ef3c-48fd-9851-5302d1aa84c6
1      7e969f44-f034-48c1-82c9-0fcf4b24e20c
2      8c679a80-a381-4fa8-af8a-31cb44d65b7b
3      ddc2fb3c-6ea7-4b66-931b-0a0b4bfd3a75
4      80fd8fc5-4770-4e35-b31c-4d7b1be19c2f
                       ...                 
146    14e39e12-688a-4a2d-942d-d761cff4123e
147    14e39e12-688a-4a2d-942d-d761cff4123e
148    cb364aed-12ed-4c24-94b4-56e829635d72
149    7f88e6e7-b838-497a-86d6-aaf35b8cdf29
150    7f88e6e7-b838-497a-86d6-aaf35b8cdf29
Name: resource.context.reference, Length: 151, dtype: object

In [16]:
# Assemble the extracted pieces into one table. The pairs are listed in the
# order the columns should appear; Series are aligned on their shared index.
result_df = pd.DataFrame(dict([
    ('Condition ID', condition_id),
    ('Clinical Status', condition_clinical_status),
    ('Verification Status', condition_verification_status),
    ('Condition Name', condition_name),
    ('Patient ID', condition_patient_id),
    ('Encounter ID', condition_encounter_id),
    ('Onset Date Time', condition_onset_date_time),
    ('Abatement Date Time', condition_abatement_date_time),
]))
result_df.head()

Unnamed: 0,Condition ID,Clinical Status,Verification Status,Condition Name,Patient ID,Encounter ID,Onset Date Time,Abatement Date Time
0,dff431d5-79b9-40c0-91b6-e3032cf93a20,active,confirmed,Streptococcal sore throat (disorder),65d12976-9588-4cfa-a795-216302a2ece9,a3e3bd43-ef3c-48fd-9851-5302d1aa84c6,2010-07-31T13:21:25-04:00,2010-08-13T01:03:45-04:00
1,be9865e1-76d5-4e23-b94d-7e3da3c0e762,active,confirmed,Acute viral pharyngitis (disorder),65d12976-9588-4cfa-a795-216302a2ece9,7e969f44-f034-48c1-82c9-0fcf4b24e20c,2010-09-13T01:03:45-04:00,2010-09-25T17:29:09-04:00
2,3f257d68-ea56-42b5-b926-74165e287c62,active,confirmed,Viral sinusitis (disorder),65d12976-9588-4cfa-a795-216302a2ece9,8c679a80-a381-4fa8-af8a-31cb44d65b7b,2011-10-24T04:05:30-04:00,2011-11-12T01:05:15-05:00
3,bda57d2c-732a-4f5b-a0bc-f9b49ca45cb0,active,confirmed,Acute bronchitis (disorder),65d12976-9588-4cfa-a795-216302a2ece9,ddc2fb3c-6ea7-4b66-931b-0a0b4bfd3a75,2015-01-05T19:01:24-05:00,2015-01-28T12:23:23-05:00
4,41b412e2-fc7f-4f0d-8a31-cf793c1426e5,active,confirmed,Normal pregnancy,65d12976-9588-4cfa-a795-216302a2ece9,80fd8fc5-4770-4e35-b31c-4d7b1be19c2f,2015-05-08T20:38:47-04:00,2015-12-04T20:38:47-05:00


## Condition Data Extraction

In [17]:
import glob
import os

import pandas as pd

In [18]:
def _first_coding_display(codings):
    """Return the ``display`` text of the first entry of a coding list.

    Returns None when ``codings`` is not a non-empty list whose first entry
    is a dict (for example a missing/NaN cell), or when that entry has no
    ``display`` key.
    """
    if isinstance(codings, list) and codings and isinstance(codings[0], dict):
        return codings[0].get('display')
    return None


def extract_condition_data(root_directory_path, output_path='./output_csv/condition_data.csv'):
    """Extract Condition records from a folder of JSON files into a CSV file.

    Every ``*.json`` file directly inside ``root_directory_path`` is read with
    ``pd.read_json``, the ``entry`` column of the combined frame is flattened
    with ``pd.json_normalize``, and the rows whose ``resource.resourceType``
    is ``'Condition'`` are written out with these columns: Condition ID,
    Clinical Status, Verification Status, Condition Name, Patient ID,
    Encounter ID, Onset Date Time, Abatement Date Time.

    Parameters
    ----------
    root_directory_path : str
        Directory holding the JSON files. A trailing path separator is
        accepted but not required.
    output_path : str, optional
        Destination CSV file. Missing parent directories are created.
        Defaults to ``'./output_csv/condition_data.csv'``.

    Returns
    -------
    None
        The result is written to ``output_path``.

    Raises
    ------
    ValueError
        If no ``*.json`` file is found in ``root_directory_path``.
    KeyError
        If the flattened data lacks ``resource.id``,
        ``resource.subject.reference`` or ``resource.context.reference``
        (these are read with plain indexing; the other fields are optional).
    """
    # os.path.join makes the pattern correct with or without a trailing
    # separator; sorting makes the output row order independent of the
    # filesystem's listing order.
    file_paths = sorted(glob.glob(os.path.join(root_directory_path, '*.json')))
    if not file_paths:
        # pd.concat([]) would raise a ValueError with an unhelpful message.
        raise ValueError(f"No .json files found in {root_directory_path!r}")

    dfs = [pd.read_json(file_path) for file_path in file_paths]
    combined_df = pd.concat(dfs, ignore_index=True)
    entries = pd.json_normalize(combined_df['entry'])

    # Chained reset (no inplace=True) so the filtered slice is never mutated.
    condition_data = (
        entries[entries['resource.resourceType'] == 'Condition']
        .reset_index(drop=True)
    )

    condition_id = condition_data['resource.id']
    condition_clinical_status = condition_data.get('resource.clinicalStatus')
    condition_verification_status = condition_data.get('resource.verificationStatus')

    # One name per Condition row: use the first coding's display text.
    # Exploding the coding lists would yield one row per coding and break the
    # row alignment with the other columns when a Condition has several.
    condition_name_raw = condition_data.get('resource.code.coding')
    if condition_name_raw is not None:
        condition_name = condition_name_raw.map(_first_coding_display)
    else:
        condition_name = None

    condition_onset_date_time = condition_data.get('resource.onsetDateTime')
    condition_abatement_date_time = condition_data.get('resource.abatementDateTime')

    # Remove the literal 'urn:uuid:' marker so only the bare ids remain.
    condition_patient_id = condition_data['resource.subject.reference'].str.replace(
        'urn:uuid:', '', regex=False
    )
    condition_encounter_id = condition_data['resource.context.reference'].str.replace(
        'urn:uuid:', '', regex=False
    )

    result_df = pd.DataFrame({
        'Condition ID': condition_id,
        'Clinical Status': condition_clinical_status,
        'Verification Status': condition_verification_status,
        'Condition Name': condition_name,
        'Patient ID': condition_patient_id,
        'Encounter ID': condition_encounter_id,
        'Onset Date Time': condition_onset_date_time,
        'Abatement Date Time': condition_abatement_date_time,
    })

    # to_csv does not create directories; make sure the target folder exists.
    output_directory = os.path.dirname(output_path)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    result_df.to_csv(output_path, index=False)

In [19]:
# Run the extraction over the dataset folder; the function writes the CSV as a side effect.
extract_condition_data(root_directory_path='./health_data/healthcare-datasets/')

## Process Condition Data

In [20]:
import pandas as pd
# Load the CSV written by the extraction step.
condition = pd.read_csv(filepath_or_buffer='./output_csv/condition_data.csv')

In [21]:
# Preview the first five rows.
condition.head(n=5)

Unnamed: 0,Condition ID,Clinical Status,Verification Status,Condition Name,Patient ID,Encounter ID,Onset Date Time,Abatement Date Time
0,dff431d5-79b9-40c0-91b6-e3032cf93a20,active,confirmed,Streptococcal sore throat (disorder),65d12976-9588-4cfa-a795-216302a2ece9,a3e3bd43-ef3c-48fd-9851-5302d1aa84c6,2010-07-31T13:21:25-04:00,2010-08-13T01:03:45-04:00
1,be9865e1-76d5-4e23-b94d-7e3da3c0e762,active,confirmed,Acute viral pharyngitis (disorder),65d12976-9588-4cfa-a795-216302a2ece9,7e969f44-f034-48c1-82c9-0fcf4b24e20c,2010-09-13T01:03:45-04:00,2010-09-25T17:29:09-04:00
2,3f257d68-ea56-42b5-b926-74165e287c62,active,confirmed,Viral sinusitis (disorder),65d12976-9588-4cfa-a795-216302a2ece9,8c679a80-a381-4fa8-af8a-31cb44d65b7b,2011-10-24T04:05:30-04:00,2011-11-12T01:05:15-05:00
3,bda57d2c-732a-4f5b-a0bc-f9b49ca45cb0,active,confirmed,Acute bronchitis (disorder),65d12976-9588-4cfa-a795-216302a2ece9,ddc2fb3c-6ea7-4b66-931b-0a0b4bfd3a75,2015-01-05T19:01:24-05:00,2015-01-28T12:23:23-05:00
4,41b412e2-fc7f-4f0d-8a31-cf793c1426e5,active,confirmed,Normal pregnancy,65d12976-9588-4cfa-a795-216302a2ece9,80fd8fc5-4770-4e35-b31c-4d7b1be19c2f,2015-05-08T20:38:47-04:00,2015-12-04T20:38:47-05:00


In [22]:
# Summary statistics using pandas' default column selection (spelled out explicitly).
condition.describe(percentiles=None, include=None, exclude=None)

Unnamed: 0,Condition ID,Clinical Status,Verification Status,Condition Name,Patient ID,Encounter ID,Onset Date Time,Abatement Date Time
count,151,151,151,151,151,151,151,91
unique,151,1,1,48,34,106,148,89
top,dff431d5-79b9-40c0-91b6-e3032cf93a20,active,confirmed,Viral sinusitis (disorder),aa3973d9-b64f-4a36-8cb6-f2719080b52f,fee0f09f-2c53-46e9-a50f-fe69d3238996,2011-06-15T04:14:23-04:00,2013-06-18T19:56:52-04:00
freq,1,151,151,22,10,7,2,2


In [26]:
# Number of distinct Condition IDs recorded for each patient.
(
    condition
    .groupby('Patient ID')['Condition ID']
    .nunique()
)

Patient ID
00d869e2-6793-4ebf-9340-38bf18d223c4     2
117335bb-33ff-4897-ac27-af4b3f11135f     5
1694897b-912e-401f-a135-aa59857a401e     4
1bd855fa-45c4-4c23-a608-fdeeb3bc2d8b     4
21e19329-b572-414b-9cd1-ecf96af4f568     3
2db81051-c02a-4c72-b17d-feb8c49cded7     2
2f3f9a7e-8142-40c8-8c4a-b58bc815b48d     6
32c72cdb-66bf-4301-895e-16da54ffd72d     3
3a5357d9-13f3-42de-8fdc-14a1a6cae148     3
509474a0-baf3-48f2-a595-67b6e2b34c87     4
5265a2b0-1bbc-4993-9042-b8d86ba65f10     2
65d12976-9588-4cfa-a795-216302a2ece9     5
65dfe191-c7ef-4148-8f12-49f832525d4c     2
7384d82c-2d1e-4595-99e4-f0ae962dddf1     9
80bb41d3-ebb4-42e6-b087-6ee89b0463a9     1
83044f55-e41f-4489-a0b8-f69bd6c7b1d4     7
8b47143f-2d3b-4425-a872-569af4ec0818     2
95ad0405-ec9e-4464-a708-750296dc1548     1
99937389-654f-4a48-8e74-ac81cf260e0f     6
9bf6df85-12db-49d1-b754-b5adcdbb6115     4
a0dcd747-1d40-40fd-8e42-1f9395f034ca     8
a195633e-d36d-4abe-92bb-02994e9cb348     5
a5399e95-981c-4a78-8019-873662fc7901     5


In [31]:
# Widen text columns so the lists of ids below are not cut off too early.
pd.set_option('display.max_colwidth', 90)

# Per patient: the list of distinct Condition IDs, plus how many there are.
# The count Series is indexed by 'Patient ID', so assign() aligns it with the
# grouped frame row by row.
test = (
    condition
    .groupby('Patient ID')
    .agg({'Condition ID': lambda ids: list(ids.unique())})
    .assign(**{
        'condition count': condition.groupby('Patient ID')['Condition ID'].nunique(),
    })
)
test

Unnamed: 0_level_0,Condition ID,condition count
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1
00d869e2-6793-4ebf-9340-38bf18d223c4,"[ac19c6a8-3fef-4d1f-bf34-dd04b49bff08, 3933d233-e42b-446f-b6d5-2124f7d650e6]",2
117335bb-33ff-4897-ac27-af4b3f11135f,"[55b30910-470a-4d6c-b56d-aa699cc9156b, 1079f9a4-52a7-4817-9e32-bad6de7ce87d, 85d0c325-...",5
1694897b-912e-401f-a135-aa59857a401e,"[6434e601-c857-42ce-8e31-44cecaaf339d, 0224be83-6997-4803-add1-56f04219d779, 663b9e49-...",4
1bd855fa-45c4-4c23-a608-fdeeb3bc2d8b,"[2af9dbad-2c1c-470d-8bec-ad650890b1d3, 1cac2d6a-34c2-4ff5-ad0a-a50e5ea39752, a552ee4e-...",4
21e19329-b572-414b-9cd1-ecf96af4f568,"[441115a2-353d-444a-820e-30c124efc0b7, 16d98a1d-ba9d-450e-b7ce-889218ac9b93, ec5aec6e-...",3
2db81051-c02a-4c72-b17d-feb8c49cded7,"[a3b890c1-948a-4123-841d-cbe76443ecba, 4e0c6c9c-44ee-4d17-89b1-d9e9e00a87bd]",2
2f3f9a7e-8142-40c8-8c4a-b58bc815b48d,"[1e194f7f-32c6-49b5-bc9b-318abf331d4a, 4925052d-c697-4537-9152-a16a8061e8bb, f1afb4ed-...",6
32c72cdb-66bf-4301-895e-16da54ffd72d,"[4c9209c1-9145-49d8-9dbc-87dce876ef64, 5e7c8a39-5a49-4c2a-b306-745d6d445df2, 02200242-...",3
3a5357d9-13f3-42de-8fdc-14a1a6cae148,"[08def035-2c6d-4947-991f-0c09538d16b4, 5d2d33e7-36c5-46c6-8241-e653a78486b3, 855eadd7-...",3
509474a0-baf3-48f2-a595-67b6e2b34c87,"[61e4b744-f0c6-467d-bcaf-2a1cf1b5c134, 03fe68de-4165-4514-8f38-91b9b03d9d26, 2114bfc8-...",4
