In [1]:
import pandas as pd
import glob

In [2]:
# Collect every JSON file in the dataset directory. The paths are sorted so the
# row order of `entries` is the same on every run (glob returns matches in
# arbitrary order).
file_paths = sorted(glob.glob("./health_data/healthcare-datasets/*.json"))
dfs = [pd.read_json(file_path) for file_path in file_paths]
combined_df = pd.concat(dfs, ignore_index=True)
# Flatten each 'entry' record into one row with dotted column names
# (e.g. 'resource.resourceType').
entries = pd.json_normalize(combined_df['entry'])

In [3]:
# List the distinct resource types present across all loaded entries.
entries['resource.resourceType'].unique()

array(['Patient', 'Encounter', 'Condition', 'Observation', 'Immunization',
       'CarePlan', 'Procedure', 'DiagnosticReport', 'MedicationRequest',
       'AllergyIntolerance'], dtype=object)

In [4]:
# Keep only the Observation rows and renumber them from 0; chaining
# reset_index builds the renumbered frame in a single expression.
is_observation = entries['resource.resourceType'] == 'Observation'
observation_data = entries[is_observation].reset_index(drop=True)
observation_data.head()

Unnamed: 0,fullUrl,resource.id,resource.text.status,resource.text.div,resource.extension,resource.identifier,resource.name,resource.telecom,resource.gender,resource.birthDate,...,resource.dispenseRequest.numberOfRepeatsAllowed,resource.dispenseRequest.quantity.value,resource.dispenseRequest.quantity.unit,resource.dispenseRequest.expectedSupplyDuration.value,resource.dispenseRequest.expectedSupplyDuration.unit,resource.dispenseRequest.expectedSupplyDuration.system,resource.dispenseRequest.expectedSupplyDuration.code,resource.criticality,resource.assertedDate,resource.multipleBirthInteger
0,urn:uuid:1f954180-ec49-455a-95aa-3b7414ffeab6,1f954180-ec49-455a-95aa-3b7414ffeab6,,,,,,,,,...,,,,,,,,,,
1,urn:uuid:556966e3-42a8-4359-838a-8771dd2a2a23,556966e3-42a8-4359-838a-8771dd2a2a23,,,,,,,,,...,,,,,,,,,,
2,urn:uuid:5e1608af-47b2-4990-af09-50d3ca7a62b9,5e1608af-47b2-4990-af09-50d3ca7a62b9,,,,,,,,,...,,,,,,,,,,
3,urn:uuid:7b141a9c-6c37-4d06-8eb8-4c61c5ca4bf0,7b141a9c-6c37-4d06-8eb8-4c61c5ca4bf0,,,,,,,,,...,,,,,,,,,,
4,urn:uuid:cac21874-494f-49f6-8074-1d517a62a7d3,cac21874-494f-49f6-8074-1d517a62a7d3,,,,,,,,,...,,,,,,,,,,


In [7]:
# Observation identifiers; .get falls back to None (its default) when the
# column is absent instead of raising KeyError.
observation_id = observation_data.get('resource.id')
observation_id

0       1f954180-ec49-455a-95aa-3b7414ffeab6
1       556966e3-42a8-4359-838a-8771dd2a2a23
2       5e1608af-47b2-4990-af09-50d3ca7a62b9
3       7b141a9c-6c37-4d06-8eb8-4c61c5ca4bf0
4       cac21874-494f-49f6-8074-1d517a62a7d3
                        ...                 
1305    e4e56824-be5d-40f5-800e-052841b16eea
1306    b7b2e34e-b5a2-4879-9305-e20d9df9d6f5
1307    ed9fbc19-242d-4044-8900-1e43f62dd116
1308    1102d6e4-724c-4593-bbc4-81416860431f
1309    6643ee4f-9fa9-430f-a456-a2e39ebc7bbd
Name: resource.id, Length: 1310, dtype: object

In [8]:
# Status of each observation; None when the column does not exist.
observation_status = observation_data.get('resource.status')
observation_status

0       final
1       final
2       final
3       final
4       final
        ...  
1305    final
1306    final
1307    final
1308    final
1309    final
Name: resource.status, Length: 1310, dtype: object

In [9]:
# Each 'resource.code.coding' cell holds a list of coding dicts.
observation_type_raw = observation_data.get('resource.code.coding')
if observation_type_raw is not None:
    display(observation_type_raw)
    # Use the display text of the first coding of each observation. This keeps
    # exactly one value per observation, aligned with observation_data's index;
    # exploding the lists would add extra rows whenever an observation carries
    # more than one coding.
    observation_type = observation_type_raw.apply(
        lambda codings: codings[0].get('display')
        if isinstance(codings, list) and codings else None
    ).rename('display')
    display(observation_type)
else:
    observation_type = None

0       [{'system': 'http://loinc.org', 'code': '8302-...
1       [{'system': 'http://loinc.org', 'code': '29463...
2       [{'system': 'http://loinc.org', 'code': '39156...
3       [{'system': 'http://loinc.org', 'code': '55284...
4       [{'system': 'http://loinc.org', 'code': '8302-...
                              ...                        
1305    [{'system': 'http://loinc.org', 'code': '2571-...
1306    [{'system': 'http://loinc.org', 'code': '18262...
1307    [{'system': 'http://loinc.org', 'code': '2085-...
1308    [{'system': 'http://loinc.org', 'code': '14959...
1309    [{'system': 'http://loinc.org', 'code': '33914...
Name: resource.code.coding, Length: 1310, dtype: object

0                                Body Height
1                                Body Weight
2                            Body Mass Index
3                             Blood Pressure
4                                Body Height
                        ...                 
1305                           Triglycerides
1306     Low Density Lipoprotein Cholesterol
1307    High Density Lipoprotein Cholesterol
1308             Microalbumin Creatine Ratio
1309    Estimated Glomerular Filtration Rate
Name: display, Length: 1310, dtype: object

In [10]:
# Inspect the flattened column names available on the observation rows.
observation_data.columns

Index(['fullUrl', 'resource.id', 'resource.text.status', 'resource.text.div',
       'resource.extension', 'resource.identifier', 'resource.name',
       'resource.telecom', 'resource.gender', 'resource.birthDate',
       'resource.address', 'resource.maritalStatus.coding',
       'resource.multipleBirthBoolean', 'resource.photo',
       'resource.resourceType', 'resource.status', 'resource.class.code',
       'resource.type', 'resource.patient.reference', 'resource.period.start',
       'resource.period.end', 'resource.clinicalStatus',
       'resource.verificationStatus', 'resource.code.coding',
       'resource.subject.reference', 'resource.context.reference',
       'resource.onsetDateTime', 'resource.abatementDateTime',
       'resource.encounter.reference', 'resource.effectiveDateTime',
       'resource.valueQuantity.value', 'resource.valueQuantity.unit',
       'resource.valueQuantity.system', 'resource.valueQuantity.code',
       'resource.component', 'resource.date', 'resource

In [11]:
# Look the columns up first and only transform them once we know they exist;
# calling .apply directly on the .get result fails when the column is missing.
observation_value = observation_data.get('resource.valueQuantity.value')
observation_unit = observation_data.get('resource.valueQuantity.unit')
if observation_value is not None:
    # Round every measurement to one decimal place.
    observation_value = observation_value.apply(lambda x: round(x, 1))
    display(len(observation_value))
    value_text = observation_value.astype('str')
    if observation_unit is not None:
        display(len(observation_unit))
        # "<value> <unit>"; rows without a unit come out as NaN.
        observation_value_unit = value_text.str.cat(observation_unit, sep=' ')
    else:
        # No unit column at all: keep the bare number, NaN where no value.
        observation_value_unit = value_text.where(observation_value.notna())
    display(observation_value_unit)
else:
    observation_value_unit = None

1310

1310

0                   158.9 cm
1                    98.0 kg
2                 38.8 kg/m2
3                        NaN
4                   158.9 cm
                ...         
1305             100.0 mg/dL
1306              91.0 mg/dL
1307              75.0 mg/dL
1308               12.0 mg/g
1309    0.0 mL/min/{1.73_m2}
Name: resource.valueQuantity.value, Length: 1310, dtype: object

In [12]:
# Number of observations that ended up without a formatted value.
int(observation_value_unit.isna().sum())

163

In [13]:
# Timestamp string of each observation; None when the column does not exist.
observation_date_time = observation_data.get('resource.effectiveDateTime')
observation_date_time

0       2010-05-12T10:33:51-04:00
1       2010-05-12T10:33:51-04:00
2       2010-05-12T10:33:51-04:00
3       2010-05-12T10:33:51-04:00
4       2011-07-02T14:56:51-04:00
                  ...            
1305    2017-01-25T15:04:04-05:00
1306    2017-01-25T15:04:04-05:00
1307    2017-01-25T15:04:04-05:00
1308    2017-01-25T15:04:04-05:00
1309    2017-01-25T15:04:04-05:00
Name: resource.effectiveDateTime, Length: 1310, dtype: object

In [14]:
# Drop the literal 'urn:uuid:' text so the reference columns hold bare IDs.
observation_encounter_id = observation_data['resource.encounter.reference'].str.replace(
    'urn:uuid:', '', regex=False
)
observation_patient_id = observation_data['resource.subject.reference'].str.replace(
    'urn:uuid:', '', regex=False
)

result

In [15]:
# Assemble the extracted series into one table, one row per observation.
# The series share observation_data's index, so pandas aligns them row by row.
result_df = pd.DataFrame({
    'Observation ID':observation_id,
    'Status':observation_status,
    'Patient ID':observation_patient_id,
    'Encounter ID':observation_encounter_id,
    'Observation Date Time':observation_date_time,
    'Observation Type':observation_type,
    'Observation Value':observation_value_unit
})
result_df.head()

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value
0,1f954180-ec49-455a-95aa-3b7414ffeab6,final,80bb41d3-ebb4-42e6-b087-6ee89b0463a9,2c722ad1-5a59-4f6a-90cc-c55f86bbd9c6,2010-05-12T10:33:51-04:00,Body Height,158.9 cm
1,556966e3-42a8-4359-838a-8771dd2a2a23,final,80bb41d3-ebb4-42e6-b087-6ee89b0463a9,2c722ad1-5a59-4f6a-90cc-c55f86bbd9c6,2010-05-12T10:33:51-04:00,Body Weight,98.0 kg
2,5e1608af-47b2-4990-af09-50d3ca7a62b9,final,80bb41d3-ebb4-42e6-b087-6ee89b0463a9,2c722ad1-5a59-4f6a-90cc-c55f86bbd9c6,2010-05-12T10:33:51-04:00,Body Mass Index,38.8 kg/m2
3,7b141a9c-6c37-4d06-8eb8-4c61c5ca4bf0,final,80bb41d3-ebb4-42e6-b087-6ee89b0463a9,2c722ad1-5a59-4f6a-90cc-c55f86bbd9c6,2010-05-12T10:33:51-04:00,Blood Pressure,
4,cac21874-494f-49f6-8074-1d517a62a7d3,final,80bb41d3-ebb4-42e6-b087-6ee89b0463a9,f5b01614-f569-4e9e-b482-cc0231a7c20a,2011-07-02T14:56:51-04:00,Body Height,158.9 cm


In [16]:
# Check column dtypes and non-null counts of the assembled table.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Observation ID         1310 non-null   object
 1   Status                 1310 non-null   object
 2   Patient ID             1310 non-null   object
 3   Encounter ID           1310 non-null   object
 4   Observation Date Time  1310 non-null   object
 5   Observation Type       1310 non-null   object
 6   Observation Value      1147 non-null   object
dtypes: object(7)
memory usage: 71.8+ KB


In [39]:
# Distinct encounters referenced by observations (a missing value counts too).
result_df['Encounter ID'].nunique(dropna=False)

189

In [46]:
# Summary statistics (count / unique / top / freq) for every column.
result_df.describe()

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value
count,1310,1310,1310,1310,1310,1310,1147
unique,1310,1,34,189,189,31,459
top,1f954180-ec49-455a-95aa-3b7414ffeab6,final,aa3973d9-b64f-4a36-8cb6-f2719080b52f,5b6a51a4-5c13-42b0-b885-8936c9cc36f3,2017-01-25T15:04:04-05:00,Body Height,1.0 mg/dL
freq,1,1310,181,20,20,159,45


diagnostic data

In [17]:
# Load the diagnostic report table from CSV.
diagnostic = pd.read_csv('./output_csv/diagnostic_data.csv')

In [18]:
# Peek at rows 20-29 (positional slice).
diagnostic.iloc[20:30]

Unnamed: 0,Diagnostic Report ID,Status,Diagnostic Name,Patient ID,Encounter ID,Diagnostic Date Time,Performer,result_reference,result_display
20,18cb1577-3300-47b2-8d57-38f9cd8b950b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,2012-10-02T21:50:16-04:00,Hospital Lab,urn:uuid:ad0ecd5a-ca4c-41ef-a8a9-0d89218e1a10,Glucose
21,18cb1577-3300-47b2-8d57-38f9cd8b950b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,2012-10-02T21:50:16-04:00,Hospital Lab,urn:uuid:516fecff-8eea-411e-8fcd-0ce54d58902e,Urea Nitrogen
22,18cb1577-3300-47b2-8d57-38f9cd8b950b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,2012-10-02T21:50:16-04:00,Hospital Lab,urn:uuid:b2b275d5-5156-420c-a4a5-ade863beb47c,Creatinine
23,18cb1577-3300-47b2-8d57-38f9cd8b950b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,2012-10-02T21:50:16-04:00,Hospital Lab,urn:uuid:5b67e72a-6e74-4396-a98b-423ec1066e72,Calcium
24,18cb1577-3300-47b2-8d57-38f9cd8b950b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,2012-10-02T21:50:16-04:00,Hospital Lab,urn:uuid:6b190b59-defc-437b-ac58-83df01d52049,Sodium
25,18cb1577-3300-47b2-8d57-38f9cd8b950b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,2012-10-02T21:50:16-04:00,Hospital Lab,urn:uuid:10b8d1d7-aedb-4511-8636-34b4b16add1a,Potassium
26,18cb1577-3300-47b2-8d57-38f9cd8b950b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,2012-10-02T21:50:16-04:00,Hospital Lab,urn:uuid:7fc0d3fa-58cc-4cfc-803c-25d0f857ff93,Chloride
27,18cb1577-3300-47b2-8d57-38f9cd8b950b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,2012-10-02T21:50:16-04:00,Hospital Lab,urn:uuid:910c806a-5969-46a6-a770-45ef85b20021,Carbon Dioxide
28,d79b10af-0929-46a2-a880-494d4affbba8,final,Lipid Panel,d76cce33-3dec-4874-b352-fa81812517c7,98bfac77-82aa-4ffa-a63c-06f13e722550,2013-09-11T10:15:45-04:00,Hospital Lab,urn:uuid:d510ea01-7a5c-403a-8bb0-a5bccb975a93,Total Cholesterol
29,d79b10af-0929-46a2-a880-494d4affbba8,final,Lipid Panel,d76cce33-3dec-4874-b352-fa81812517c7,98bfac77-82aa-4ffa-a63c-06f13e722550,2013-09-11T10:15:45-04:00,Hospital Lab,urn:uuid:c4443e99-c58f-444e-b541-0f2353ed9b7b,Triglycerides


In [19]:
# Check column dtypes and non-null counts of the diagnostic table.
diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524 entries, 0 to 523
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Diagnostic Report ID  524 non-null    object
 1   Status                524 non-null    object
 2   Diagnostic Name       524 non-null    object
 3   Patient ID            524 non-null    object
 4   Encounter ID          524 non-null    object
 5   Diagnostic Date Time  524 non-null    object
 6   Performer             524 non-null    object
 7   result_reference      524 non-null    object
 8   result_display        524 non-null    object
dtypes: object(9)
memory usage: 37.0+ KB


In [47]:
# Set the row display limit to 70, then count diagnostic rows per encounter.
pd.options.display.max_rows = 70
diagnostic['Encounter ID'].value_counts()

Encounter ID
b8edeb00-4f36-47b5-9b99-12a05ad57691    12
638fee81-9b8f-4896-8536-407e34db67c0    12
d53f9c2a-250a-49a9-9f8d-321ca30034bb    12
4aaea16e-0460-42f8-82de-aeaeef1b8004    12
c721f18e-4f6d-4e1d-a496-9de28fade1cf    12
4fe3ed75-2a61-4e14-bbbd-ab33b27b7059    12
84a4b571-3b59-4d89-a849-6f01a3c08a6a    12
1facf162-1cb8-4ee1-87ee-c2d68146094a    12
5b6a51a4-5c13-42b0-b885-8936c9cc36f3    12
ba8b290e-98ba-4c40-90bd-a0a500b339b8    12
01e1e451-d7c5-4876-b4e7-d499eaada844    12
9c6ee96e-445f-42b9-a711-d8acb2788bc3    12
1696a51a-fbb7-4a7a-9004-bb2ea5094087    12
0a4e4fe4-13e3-486d-b090-03093669f7f3    12
d8294703-8020-4f30-b8b5-01b76624d85c    12
fee0f09f-2c53-46e9-a50f-fe69d3238996    12
a14ca3d2-03e0-4d41-8d1a-77cba28bced7    12
8a7938da-4885-4dea-a9d1-d13912f06671    12
aa76d5b2-8587-4dae-b5f9-487ac3a89308    12
46eac620-9d19-4380-a6be-097412e2e698    12
5bfc7867-783b-47b1-aa65-d41de5d09ca5     8
0e41d495-029e-4338-8173-25e62f81a199     8
597eec22-90bc-42a0-b345-de76a95874f2     

In [23]:
# Distinct encounters that have at least one diagnostic row.
diagnostic['Encounter ID'].nunique(dropna=False)

69

In [27]:
# Only the two key columns are needed to link reports to encounters.
diagnostic_to_merge = diagnostic.loc[:, ['Encounter ID', 'Diagnostic Report ID']]

In [28]:
# Confirm the two-column subset has the expected row count and no nulls.
diagnostic_to_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524 entries, 0 to 523
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Encounter ID          524 non-null    object
 1   Diagnostic Report ID  524 non-null    object
dtypes: object(2)
memory usage: 8.3+ KB


In [29]:
# The same (encounter, report) pair repeats once per diagnostic row.
diagnostic_to_merge.head(10)

Unnamed: 0,Encounter ID,Diagnostic Report ID
0,e23fb9ad-56f4-4e34-8073-6c89277f03f8,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0
1,e23fb9ad-56f4-4e34-8073-6c89277f03f8,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0
2,e23fb9ad-56f4-4e34-8073-6c89277f03f8,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0
3,e23fb9ad-56f4-4e34-8073-6c89277f03f8,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0
4,0e41d495-029e-4338-8173-25e62f81a199,f74a3d92-b549-4a12-9e10-6a6d2204b59b
5,0e41d495-029e-4338-8173-25e62f81a199,f74a3d92-b549-4a12-9e10-6a6d2204b59b
6,0e41d495-029e-4338-8173-25e62f81a199,f74a3d92-b549-4a12-9e10-6a6d2204b59b
7,0e41d495-029e-4338-8173-25e62f81a199,f74a3d92-b549-4a12-9e10-6a6d2204b59b
8,0e41d495-029e-4338-8173-25e62f81a199,f74a3d92-b549-4a12-9e10-6a6d2204b59b
9,0e41d495-029e-4338-8173-25e62f81a199,f74a3d92-b549-4a12-9e10-6a6d2204b59b


In [32]:
# Collapse to one row per diagnostic report (the first occurrence is kept).
diagnostic_to_merge_drop = diagnostic_to_merge.drop_duplicates(
    subset='Diagnostic Report ID', keep='first'
)

In [33]:
# Row count after de-duplication = number of distinct diagnostic reports.
diagnostic_to_merge_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, 0 to 520
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Encounter ID          89 non-null     object
 1   Diagnostic Report ID  89 non-null     object
dtypes: object(2)
memory usage: 2.1+ KB


In [34]:
# Inspect the de-duplicated pairs; the original row labels are retained.
diagnostic_to_merge_drop.head(10)

Unnamed: 0,Encounter ID,Diagnostic Report ID
0,e23fb9ad-56f4-4e34-8073-6c89277f03f8,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0
4,0e41d495-029e-4338-8173-25e62f81a199,f74a3d92-b549-4a12-9e10-6a6d2204b59b
12,b2a27b96-82b2-4755-b5be-ec6aff5b8028,99940d72-22d0-4829-9e9e-3d89fa1510e0
20,1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd,18cb1577-3300-47b2-8d57-38f9cd8b950b
28,98bfac77-82aa-4ffa-a63c-06f13e722550,d79b10af-0929-46a2-a880-494d4affbba8
32,61bc1b33-2041-4d0e-850e-96701198a43d,c8bfc1b4-79fa-4a8e-9e0c-1d515e7bd7e0
36,fee0f09f-2c53-46e9-a50f-fe69d3238996,a4a4c2b0-f387-49fb-b7a0-e709a71fe1db
44,fee0f09f-2c53-46e9-a50f-fe69d3238996,9b475182-b259-41ac-be38-4a1023d9151c
48,d8294703-8020-4f30-b8b5-01b76624d85c,bbaf34b0-2859-4076-84d3-4fa787db0422
56,d8294703-8020-4f30-b8b5-01b76624d85c,c8093736-8ff1-46e3-9418-1c08955ff63e


In [36]:
# Distinct encounters remaining after de-duplicating by report.
diagnostic_to_merge_drop['Encounter ID'].nunique(dropna=False)

69

In [48]:
# Number of distinct diagnostic reports attached to each encounter.
diagnostic_to_merge_drop['Encounter ID'].value_counts()

Encounter ID
d53f9c2a-250a-49a9-9f8d-321ca30034bb    2
638fee81-9b8f-4896-8536-407e34db67c0    2
a14ca3d2-03e0-4d41-8d1a-77cba28bced7    2
4aaea16e-0460-42f8-82de-aeaeef1b8004    2
1facf162-1cb8-4ee1-87ee-c2d68146094a    2
b8edeb00-4f36-47b5-9b99-12a05ad57691    2
4fe3ed75-2a61-4e14-bbbd-ab33b27b7059    2
aa76d5b2-8587-4dae-b5f9-487ac3a89308    2
46eac620-9d19-4380-a6be-097412e2e698    2
84a4b571-3b59-4d89-a849-6f01a3c08a6a    2
5b6a51a4-5c13-42b0-b885-8936c9cc36f3    2
ba8b290e-98ba-4c40-90bd-a0a500b339b8    2
01e1e451-d7c5-4876-b4e7-d499eaada844    2
9c6ee96e-445f-42b9-a711-d8acb2788bc3    2
1696a51a-fbb7-4a7a-9004-bb2ea5094087    2
0a4e4fe4-13e3-486d-b090-03093669f7f3    2
d8294703-8020-4f30-b8b5-01b76624d85c    2
fee0f09f-2c53-46e9-a50f-fe69d3238996    2
8a7938da-4885-4dea-a9d1-d13912f06671    2
c721f18e-4f6d-4e1d-a496-9de28fade1cf    2
75ed8fbf-07d4-4738-8e08-03eec90ae63a    1
fcf50027-830e-4dd9-b62e-9e736d927e7d    1
511492b9-f42d-45bf-ad7c-19409c0e4122    1
156d9280-10ac-4286-b3

One encounter can have multiple diagnostic reports.

Each observation is linked to one encounter, which in turn can be linked to multiple diagnostic reports.

In [49]:
# Drill into one encounter that has more than one diagnostic report.
diagnostic.loc[diagnostic['Encounter ID'].eq('d53f9c2a-250a-49a9-9f8d-321ca30034bb')]

Unnamed: 0,Diagnostic Report ID,Status,Diagnostic Name,Patient ID,Encounter ID,Diagnostic Date Time,Performer,result_reference,result_display
169,60a3adfa-b06c-44bd-9285-c181b8c3d88e,final,Basic Metabolic Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:d1137901-93eb-4965-bf0d-7a6a62f8e138,Glucose
170,60a3adfa-b06c-44bd-9285-c181b8c3d88e,final,Basic Metabolic Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:663ab79d-7d37-4569-aac3-cf4c83e8a925,Urea Nitrogen
171,60a3adfa-b06c-44bd-9285-c181b8c3d88e,final,Basic Metabolic Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:741001b9-fa98-4cd1-963e-9bea1092bec1,Creatinine
172,60a3adfa-b06c-44bd-9285-c181b8c3d88e,final,Basic Metabolic Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:79a049f2-09ec-4bbe-90ab-b9dafe540202,Calcium
173,60a3adfa-b06c-44bd-9285-c181b8c3d88e,final,Basic Metabolic Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:4b31fec0-29a3-44d9-b24d-3a684c9ccbef,Sodium
174,60a3adfa-b06c-44bd-9285-c181b8c3d88e,final,Basic Metabolic Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:d5643bea-fec0-454c-9330-e2dd671762d4,Potassium
175,60a3adfa-b06c-44bd-9285-c181b8c3d88e,final,Basic Metabolic Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:e7093ab9-1f93-4085-8862-0139639a172e,Chloride
176,60a3adfa-b06c-44bd-9285-c181b8c3d88e,final,Basic Metabolic Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:538f3066-f95d-41ad-8060-37cec0c2a744,Carbon Dioxide
177,723f1bf8-e0c7-486f-b972-2f01d150b5c4,final,Lipid Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:ce9c8708-0988-4512-9e8d-c05b26c19eeb,Total Cholesterol
178,723f1bf8-e0c7-486f-b972-2f01d150b5c4,final,Lipid Panel,3a5357d9-13f3-42de-8fdc-14a1a6cae148,d53f9c2a-250a-49a9-9f8d-321ca30034bb,2011-06-02T20:58:47-04:00,Hospital Lab,urn:uuid:d25b5cce-494b-4278-bca2-f0c32cdc5585,Triglycerides


Explore how joining observations to diagnostic reports on Encounter ID behaves under this one-to-many relationship.

In [37]:
# Left join: every observation is kept and gains the report ID(s) of its
# encounter; an encounter with several reports yields several rows.
test = result_df.merge(diagnostic_to_merge_drop, on='Encounter ID', how='left')

In [38]:
# Compare the merged row count against the observation table's row count.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1693 entries, 0 to 1692
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Observation ID         1693 non-null   object
 1   Status                 1693 non-null   object
 2   Patient ID             1693 non-null   object
 3   Encounter ID           1693 non-null   object
 4   Observation Date Time  1693 non-null   object
 5   Observation Type       1693 non-null   object
 6   Observation Value      1510 non-null   object
 7   Diagnostic Report ID   1290 non-null   object
dtypes: object(8)
memory usage: 105.9+ KB


In [76]:
# Summary statistics of the merged table (count / unique / top / freq).
test.describe()

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value,Diagnostic Report ID
count,1693,1693,1693,1693,1693,1693,1510,1290
unique,1310,1,34,189,189,31,459,89
top,7b321269-d21a-4181-bfb6-a03287c7d798,final,aa3973d9-b64f-4a36-8cb6-f2719080b52f,9c6ee96e-445f-42b9-a711-d8acb2788bc3,2013-11-15T14:33:03-05:00,Body Weight,1.0 mg/dL,7e510599-af63-4cba-bf6b-3799c65c7854
freq,2,1693,361,40,40,179,65,20


In [107]:
# Observations whose encounter has no diagnostic report.
test.loc[test['Diagnostic Report ID'].isna()]

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value,Diagnostic Report ID
0,166ab0d5-66bc-40fc-922b-2896cca3b0d9,final,65d12976-9588-4cfa-a795-216302a2ece9,a3e3bd43-ef3c-48fd-9851-5302d1aa84c6,2010-07-31T13:21:25-04:00,Oral temperature,37.3 Cel,
1,77bda9d9-e213-471f-9dec-9b5b03ff38b8,final,65d12976-9588-4cfa-a795-216302a2ece9,7e969f44-f034-48c1-82c9-0fcf4b24e20c,2010-09-15T10:46:26-04:00,Oral temperature,37.0 Cel,
2,322e6431-9a54-4e5f-9904-9ad4d5a3253b,final,65d12976-9588-4cfa-a795-216302a2ece9,a5e0fd67-d09f-4861-a58f-5726d81faafc,2010-11-14T10:21:49-05:00,Body Height,132.2 cm,
3,7d2b3ee3-1190-45fc-a876-df6b171c5c0b,final,65d12976-9588-4cfa-a795-216302a2ece9,a5e0fd67-d09f-4861-a58f-5726d81faafc,2010-11-14T10:21:49-05:00,Body Weight,36.1 kg,
4,3fd3dd3d-4382-4ae0-b1cc-7e5c9220da22,final,65d12976-9588-4cfa-a795-216302a2ece9,a5e0fd67-d09f-4861-a58f-5726d81faafc,2010-11-14T10:21:49-05:00,Body Mass Index,20.6 kg/m2,
...,...,...,...,...,...,...,...,...
1688,35dc154b-f576-4f61-aedc-1ce50bbea065,final,a5399e95-981c-4a78-8019-873662fc7901,9e802e2c-77ad-401f-97e9-bb91f646b9d7,2016-10-26T05:38:21-04:00,Body Height,180.1 cm,
1689,1e9a385e-e673-4dc0-9a35-24413d419e6a,final,a5399e95-981c-4a78-8019-873662fc7901,9e802e2c-77ad-401f-97e9-bb91f646b9d7,2016-10-26T05:38:21-04:00,Body Weight,85.6 kg,
1690,af37a792-b514-422b-b739-24034ff4a77a,final,a5399e95-981c-4a78-8019-873662fc7901,9e802e2c-77ad-401f-97e9-bb91f646b9d7,2016-10-26T05:38:21-04:00,Body Mass Index,26.4 kg/m2,
1691,9bb8bf87-aaef-417f-81e6-9060ec081be6,final,a5399e95-981c-4a78-8019-873662fc7901,9e802e2c-77ad-401f-97e9-bb91f646b9d7,2016-10-26T05:38:21-04:00,Blood Pressure,,


In [42]:
# Sanity check: no merged row should be missing its observation ID.
test.loc[test['Observation ID'].isna()]

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value,Diagnostic Report ID


In [51]:
# Rows that repeat an observation ID already seen earlier in the merged table.
test.loc[test['Observation ID'].duplicated(keep='first')]

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value,Diagnostic Report ID
37,95d2ce05-8d62-41eb-a655-52fcd5c71b56,final,bb357dc7-2f8a-444e-ba57-8291b6f18bf0,aa76d5b2-8587-4dae-b5f9-487ac3a89308,2010-07-26T02:41:32-04:00,Hemoglobin A1c/Hemoglobin.total in Blood,6.1 %,88e262b5-53fd-49a7-8057-a8a301757594
39,98944af2-78ce-41cb-84f0-19ea2b119a88,final,bb357dc7-2f8a-444e-ba57-8291b6f18bf0,aa76d5b2-8587-4dae-b5f9-487ac3a89308,2010-07-26T02:41:32-04:00,Body Height,152.9 cm,88e262b5-53fd-49a7-8057-a8a301757594
41,3505cda9-c446-48f6-8b53-309511a5d3bd,final,bb357dc7-2f8a-444e-ba57-8291b6f18bf0,aa76d5b2-8587-4dae-b5f9-487ac3a89308,2010-07-26T02:41:32-04:00,Body Weight,109.0 kg,88e262b5-53fd-49a7-8057-a8a301757594
43,4867ff78-ff2c-465b-b16b-42563729f9b3,final,bb357dc7-2f8a-444e-ba57-8291b6f18bf0,aa76d5b2-8587-4dae-b5f9-487ac3a89308,2010-07-26T02:41:32-04:00,Body Mass Index,46.6 kg/m2,88e262b5-53fd-49a7-8057-a8a301757594
45,f8673fda-cef1-4f76-a77f-953f7f45486a,final,bb357dc7-2f8a-444e-ba57-8291b6f18bf0,aa76d5b2-8587-4dae-b5f9-487ac3a89308,2010-07-26T02:41:32-04:00,Blood Pressure,,88e262b5-53fd-49a7-8057-a8a301757594
...,...,...,...,...,...,...,...,...
1684,e4e56824-be5d-40f5-800e-052841b16eea,final,aa3973d9-b64f-4a36-8cb6-f2719080b52f,5b6a51a4-5c13-42b0-b885-8936c9cc36f3,2017-01-25T15:04:04-05:00,Triglycerides,100.0 mg/dL,74bc393d-eb75-42f9-8132-45ccc3165b60
1686,b7b2e34e-b5a2-4879-9305-e20d9df9d6f5,final,aa3973d9-b64f-4a36-8cb6-f2719080b52f,5b6a51a4-5c13-42b0-b885-8936c9cc36f3,2017-01-25T15:04:04-05:00,Low Density Lipoprotein Cholesterol,91.0 mg/dL,74bc393d-eb75-42f9-8132-45ccc3165b60
1688,ed9fbc19-242d-4044-8900-1e43f62dd116,final,aa3973d9-b64f-4a36-8cb6-f2719080b52f,5b6a51a4-5c13-42b0-b885-8936c9cc36f3,2017-01-25T15:04:04-05:00,High Density Lipoprotein Cholesterol,75.0 mg/dL,74bc393d-eb75-42f9-8132-45ccc3165b60
1690,1102d6e4-724c-4593-bbc4-81416860431f,final,aa3973d9-b64f-4a36-8cb6-f2719080b52f,5b6a51a4-5c13-42b0-b885-8936c9cc36f3,2017-01-25T15:04:04-05:00,Microalbumin Creatine Ratio,12.0 mg/g,74bc393d-eb75-42f9-8132-45ccc3165b60


In [52]:
# Show every merged row for one duplicated observation.
test.loc[test['Observation ID'].eq('95d2ce05-8d62-41eb-a655-52fcd5c71b56')]

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value,Diagnostic Report ID
36,95d2ce05-8d62-41eb-a655-52fcd5c71b56,final,bb357dc7-2f8a-444e-ba57-8291b6f18bf0,aa76d5b2-8587-4dae-b5f9-487ac3a89308,2010-07-26T02:41:32-04:00,Hemoglobin A1c/Hemoglobin.total in Blood,6.1 %,fd75cd83-df55-45a0-9ff6-34ebf82e2864
37,95d2ce05-8d62-41eb-a655-52fcd5c71b56,final,bb357dc7-2f8a-444e-ba57-8291b6f18bf0,aa76d5b2-8587-4dae-b5f9-487ac3a89308,2010-07-26T02:41:32-04:00,Hemoglobin A1c/Hemoglobin.total in Blood,6.1 %,88e262b5-53fd-49a7-8057-a8a301757594


## Observation Data Extraction

In [1]:
import glob
import os

import pandas as pd

In [3]:
def _first_coding_display(codings):
    """Return the 'display' text of the first coding in a list of coding dicts, else None."""
    if isinstance(codings, list) and codings and isinstance(codings[0], dict):
        return codings[0].get('display')
    return None


def _strip_urn_prefix(references):
    """Remove the literal 'urn:uuid:' text from a Series of references; None passes through."""
    if references is None:
        return None
    return references.str.replace('urn:uuid:', '', regex=False)


def extract_observation_data(root_directory_path,
                             diagnostic_csv_path='./output_csv/diagnostic_data.csv',
                             output_csv_path='./output_csv/observatoin_data.csv'):
    """Extract Observation rows from JSON files and export them as a CSV.

    Every ``*.json`` file directly inside ``root_directory_path`` is read, its
    'entry' records are flattened, and the rows whose
    'resource.resourceType' is 'Observation' are reduced to the columns
    Observation ID, Status, Patient ID, Encounter ID, Observation Date Time,
    Observation Type and Observation Value. The table is then left-merged with
    the diagnostic report table on 'Encounter ID' to add 'Diagnostic Report ID'.

    NOTE(review): because the merge key is the encounter, an observation whose
    encounter has several diagnostic reports appears once per report. The
    diagnostic CSV shown in this notebook also has a 'result_reference'
    column; joining on it may give an exact observation-to-report link.
    Confirm which behaviour is intended.

    Parameters
    ----------
    root_directory_path : str
        Directory containing the JSON files (with or without a trailing
        path separator).
    diagnostic_csv_path : str, optional
        CSV with at least 'Encounter ID' and 'Diagnostic Report ID' columns.
    output_csv_path : str, optional
        Destination of the merged table. The default file name keeps its
        existing spelling because a later cell reads that exact path.

    Returns
    -------
    pandas.DataFrame
        The merged table that was written to ``output_csv_path``.

    Raises
    ------
    ValueError
        If no ``*.json`` file is found in ``root_directory_path``.
    """
    # os.path.join makes the pattern independent of a trailing separator; the
    # sort makes the output row order reproducible (glob order is arbitrary).
    file_paths = sorted(glob.glob(os.path.join(root_directory_path, '*.json')))
    if not file_paths:
        raise ValueError(f"No .json files found in {root_directory_path!r}")

    dfs = [pd.read_json(file_path) for file_path in file_paths]
    combined_df = pd.concat(dfs, ignore_index=True)
    entries = pd.json_normalize(combined_df['entry'].tolist())

    observation_data = (
        entries[entries['resource.resourceType'] == 'Observation']
        .reset_index(drop=True)
    )

    observation_id = observation_data.get('resource.id')
    observation_status = observation_data.get('resource.status')
    observation_date_time = observation_data.get('resource.effectiveDateTime')
    observation_encounter_id = _strip_urn_prefix(
        observation_data.get('resource.encounter.reference'))
    observation_patient_id = _strip_urn_prefix(
        observation_data.get('resource.subject.reference'))

    # Observation type: one label per observation (first coding), so the
    # series stays aligned with the other columns even with several codings.
    codings = observation_data.get('resource.code.coding')
    observation_type = None if codings is None else codings.apply(_first_coding_display)

    # Observation value: "<value rounded to 1 decimal> <unit>". The columns are
    # checked for existence before any method is called on them.
    values = observation_data.get('resource.valueQuantity.value')
    units = observation_data.get('resource.valueQuantity.unit')
    if values is None:
        observation_value_unit = None
    else:
        value_text = values.apply(lambda x: round(x, 1)).astype('str')
        if units is None:
            # No unit column: keep the bare number, NaN where there is no value.
            observation_value_unit = value_text.where(values.notna())
        else:
            # Rows without a unit come out as NaN.
            observation_value_unit = value_text.str.cat(units, sep=' ')

    result_df = pd.DataFrame({
        'Observation ID': observation_id,
        'Status': observation_status,
        'Patient ID': observation_patient_id,
        'Encounter ID': observation_encounter_id,
        'Observation Date Time': observation_date_time,
        'Observation Type': observation_type,
        'Observation Value': observation_value_unit,
    })

    # One row per diagnostic report, then attach report IDs by encounter.
    diagnostic = pd.read_csv(diagnostic_csv_path)
    diagnostic_to_merge = diagnostic[['Encounter ID', 'Diagnostic Report ID']]
    diagnostic_to_merge_drop = diagnostic_to_merge.drop_duplicates(subset=['Diagnostic Report ID'])
    final_df = pd.merge(result_df, diagnostic_to_merge_drop, how='left', on='Encounter ID')

    final_df.to_csv(output_csv_path, index=False)
    return final_df

In [4]:
# Run the extraction over the dataset directory; the function writes its
# merged result to a CSV file.
extract_observation_data('./health_data/healthcare-datasets/')

# Process Observation data

In [5]:
import pandas as pd
# Reload the CSV written by extract_observation_data for inspection.
observation = pd.read_csv('./output_csv/observatoin_data.csv')

In [6]:
# Preview the first rows of the exported observation table.
observation.head()

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value,Diagnostic Report ID
0,166ab0d5-66bc-40fc-922b-2896cca3b0d9,final,65d12976-9588-4cfa-a795-216302a2ece9,a3e3bd43-ef3c-48fd-9851-5302d1aa84c6,2010-07-31T13:21:25-04:00,Oral temperature,37.3 Cel,
1,77bda9d9-e213-471f-9dec-9b5b03ff38b8,final,65d12976-9588-4cfa-a795-216302a2ece9,7e969f44-f034-48c1-82c9-0fcf4b24e20c,2010-09-15T10:46:26-04:00,Oral temperature,37.0 Cel,
2,322e6431-9a54-4e5f-9904-9ad4d5a3253b,final,65d12976-9588-4cfa-a795-216302a2ece9,a5e0fd67-d09f-4861-a58f-5726d81faafc,2010-11-14T10:21:49-05:00,Body Height,132.2 cm,
3,7d2b3ee3-1190-45fc-a876-df6b171c5c0b,final,65d12976-9588-4cfa-a795-216302a2ece9,a5e0fd67-d09f-4861-a58f-5726d81faafc,2010-11-14T10:21:49-05:00,Body Weight,36.1 kg,
4,3fd3dd3d-4382-4ae0-b1cc-7e5c9220da22,final,65d12976-9588-4cfa-a795-216302a2ece9,a5e0fd67-d09f-4861-a58f-5726d81faafc,2010-11-14T10:21:49-05:00,Body Mass Index,20.6 kg/m2,


In [10]:
# Profile the observations that have no diagnostic report attached.
observation.loc[observation['Diagnostic Report ID'].isna()].describe()

Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value,Diagnostic Report ID
count,403,403,403,403,403,403,309,0.0
unique,403,1,24,120,120,15,184,0.0
top,166ab0d5-66bc-40fc-922b-2896cca3b0d9,final,2f3f9a7e-8142-40c8-8c4a-b58bc815b48d,8980d9a8-66ee-4163-8910-35ba6584b556,2016-06-03T10:03:58-04:00,Body Height,38.0 Cel,
freq,1,403,39,6,6,94,10,


In [11]:
# Profile the observations that are linked to a diagnostic report.
observation.loc[observation['Diagnostic Report ID'].notna()].describe()


Unnamed: 0,Observation ID,Status,Patient ID,Encounter ID,Observation Date Time,Observation Type,Observation Value,Diagnostic Report ID
count,1290,1290,1290,1290,1290,1290,1201,1290
unique,907,1,23,69,69,22,313,89
top,9d9b8fe9-f080-435e-a36c-2c12735b21bc,final,aa3973d9-b64f-4a36-8cb6-f2719080b52f,b8edeb00-4f36-47b5-9b99-12a05ad57691,2014-09-11T11:29:25-04:00,Hemoglobin A1c/Hemoglobin.total in Blood,1.0 mg/dL,7e510599-af63-4cba-bf6b-3799c65c7854
freq,2,1290,360,40,40,130,65,20
