In [1]:
import pandas as pd
import glob

In [2]:
# Read every JSON bundle in the dataset folder, stack them into one frame, and
# flatten each bundle entry into a row (nested keys become dotted column names
# such as 'resource.resourceType').
file_paths = glob.glob("./health_data/healthcare-datasets/*.json")
dfs = [pd.read_json(file_path) for file_path in file_paths]
combined_df = pd.concat(dfs, ignore_index=True)
entries = pd.json_normalize(combined_df['entry'])
    

In [3]:
entries['resource.resourceType'].unique()

array(['Patient', 'Encounter', 'Condition', 'Observation',
       'MedicationRequest', 'Immunization', 'Procedure', 'CarePlan',
       'AllergyIntolerance', 'DiagnosticReport'], dtype=object)

In [4]:
# Keep only the DiagnosticReport entries and renumber the rows from 0 so the
# columns extracted in later cells share a clean positional index.
is_diagnostic_report = entries['resource.resourceType'] == 'DiagnosticReport'
diagnostic_data = entries[is_diagnostic_report].reset_index(drop=True)


In [5]:

# Report status column; .get() yields None instead of raising when the column is absent.
diagnostic_status = diagnostic_data.get('resource.status',None)
diagnostic_status

0     final
1     final
2     final
3     final
4     final
      ...  
84    final
85    final
86    final
87    final
88    final
Name: resource.status, Length: 89, dtype: object

In [6]:
# Each 'resource.code.coding' cell holds a list of coding dicts; explode the lists
# and normalise the dicts so the human-readable 'display' text can be read off.
diagnostic_name_raw = diagnostic_data.get('resource.code.coding', None)
if diagnostic_name_raw is not None:
    diagnostic_name_intermediate = pd.json_normalize(diagnostic_name_raw.explode())
    diagnostic_name = diagnostic_name_intermediate.get('display', None)
    display(diagnostic_name)
else:
    # Always bind the name so a re-run never picks up a stale value from an
    # earlier execution (and later cells never hit a NameError).
    diagnostic_name = None

0                                           Lipid Panel
1                                 Basic Metabolic Panel
2                                 Basic Metabolic Panel
3                                 Basic Metabolic Panel
4                                           Lipid Panel
                            ...                        
84                                Basic Metabolic Panel
85                                Basic Metabolic Panel
86    U.S. standard certificate of death - 2003 revi...
87                                          Lipid Panel
88                                          Lipid Panel
Name: display, Length: 89, dtype: object

In [7]:
# Effective date/time of each report, kept as the raw string from the source
# (the output below shows dtype object, i.e. not parsed into datetimes).
diagnostic_date_time = diagnostic_data.get('resource.effectiveDateTime',None)
diagnostic_date_time

0     2014-03-19T16:56:31-04:00
1     2010-10-30T16:48:25-04:00
2     2011-11-18T07:54:42-05:00
3     2012-10-02T21:50:16-04:00
4     2013-09-11T10:15:45-04:00
                ...            
84    2015-06-29T20:52:05-04:00
85    2016-07-22T20:20:06-04:00
86    1986-06-24T04:50:26-04:00
87    2010-08-17T16:46:17-04:00
88    2014-02-20T22:44:54-05:00
Name: resource.effectiveDateTime, Length: 89, dtype: object

In [17]:
# 'resource.performer' cells are lists of dicts; explode + normalise them and
# keep the 'display' text. The result is None when the column does not exist.
# (The misspelled name `diagnostic_perfomer` is kept so other cells that may
# reference it keep working.)
diagnostic_performer_raw = diagnostic_data.get('resource.performer')
diagnostic_perfomer = (
    None
    if diagnostic_performer_raw is None
    else pd.json_normalize(diagnostic_performer_raw.explode()).get('display')
)
diagnostic_perfomer


0     Hospital Lab
1     Hospital Lab
2     Hospital Lab
3     Hospital Lab
4     Hospital Lab
          ...     
84    Hospital Lab
85    Hospital Lab
86    Hospital Lab
87    Hospital Lab
88    Hospital Lab
Name: display, Length: 89, dtype: object

In [20]:
# Encounter reference with the 'urn:uuid:' text removed, leaving the bare id.
# Direct indexing (unlike the .get() cells) raises KeyError if the column is absent.
diagnostic_encounter_id = diagnostic_data['resource.encounter.reference'].str.replace('urn:uuid:','')
diagnostic_encounter_id

0     e23fb9ad-56f4-4e34-8073-6c89277f03f8
1     0e41d495-029e-4338-8173-25e62f81a199
2     b2a27b96-82b2-4755-b5be-ec6aff5b8028
3     1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd
4     98bfac77-82aa-4ffa-a63c-06f13e722550
                      ...                 
84    4a7757cc-7803-4513-93cb-1923d571534d
85    a4431145-6284-4220-ad71-00593251099d
86    dc68d226-b62b-45b8-baab-a1cc6d26427f
87    b7bb6bd2-9c67-44c5-9595-43a0844262fb
88    d90f6271-c1ce-4d06-831e-060142761020
Name: resource.encounter.reference, Length: 89, dtype: object

In [21]:
# Subject (patient) reference with the 'urn:uuid:' text removed, leaving the bare id.
diagnostic_patient_id = diagnostic_data['resource.subject.reference'].str.replace('urn:uuid:','')
diagnostic_patient_id

0     a195633e-d36d-4abe-92bb-02994e9cb348
1     d0ed1873-9ac6-4c98-813e-86a041ce78c5
2     d0ed1873-9ac6-4c98-813e-86a041ce78c5
3     d0ed1873-9ac6-4c98-813e-86a041ce78c5
4     d76cce33-3dec-4874-b352-fa81812517c7
                      ...                 
84    bb357dc7-2f8a-444e-ba57-8291b6f18bf0
85    bb357dc7-2f8a-444e-ba57-8291b6f18bf0
86    1bd855fa-45c4-4c23-a608-fdeeb3bc2d8b
87    a5399e95-981c-4a78-8019-873662fc7901
88    a5399e95-981c-4a78-8019-873662fc7901
Name: resource.subject.reference, Length: 89, dtype: object

In [34]:
# The report's own id, taken as-is from 'resource.id'.
diagnostic_id = diagnostic_data['resource.id']
diagnostic_id

0     a07a9a48-de2f-4b5a-896e-c44d56eeb3c0
1     f74a3d92-b549-4a12-9e10-6a6d2204b59b
2     99940d72-22d0-4829-9e9e-3d89fa1510e0
3     18cb1577-3300-47b2-8d57-38f9cd8b950b
4     d79b10af-0929-46a2-a880-494d4affbba8
                      ...                 
84    4441a96f-8d8f-413f-9e9d-35c42a4068a5
85    ba5b3f7b-2fb9-46b5-b4f0-56258b7fcf6f
86    2b638fda-a7aa-4c83-940c-0fc1b0275993
87    9422835c-c46a-4479-a851-ac09d1f23b08
88    4376e02a-1e53-4d33-b127-1731d53b1dfb
Name: resource.id, Length: 89, dtype: object

In [32]:
# Raw result column: each cell is a list of dicts (the output below shows a
# 'reference' key in each); None if the column is absent.
diagnostic_result = diagnostic_data.get('resource.result',None)
diagnostic_result

0     [{'reference': 'urn:uuid:616bd45e-f48a-4935-be...
1     [{'reference': 'urn:uuid:9d0430d9-d252-4eb7-b9...
2     [{'reference': 'urn:uuid:71e8b639-ada6-4a74-9f...
3     [{'reference': 'urn:uuid:ad0ecd5a-ca4c-41ef-a8...
4     [{'reference': 'urn:uuid:d510ea01-7a5c-403a-8b...
                            ...                        
84    [{'reference': 'urn:uuid:5d1035d8-4cc3-4477-8d...
85    [{'reference': 'urn:uuid:3eeb2fc0-7e1a-4916-89...
86    [{'reference': 'urn:uuid:c757f1fb-b069-46d4-87...
87    [{'reference': 'urn:uuid:d5a8ba27-a56a-4d24-94...
88    [{'reference': 'urn:uuid:8e653cef-54fa-4b8c-ad...
Name: resource.result, Length: 89, dtype: object

Extracting only the reference part of each result.

In [31]:
# For every report, reduce its list of result dicts to just their 'reference' strings.
# NOTE(review): assumes every cell is a list of dicts that each have a 'reference'
# key; a missing (NaN) cell or a dict without that key would raise here - confirm.
diagnostic_result_reference_lst = diagnostic_result.apply(lambda lst: [d['reference'] for d in lst])

## Diagnostic Data Extraction

In [42]:
import pandas as pd
import glob

In [125]:
def extract_diagnostic_data(root_directory_path, output_path='./output_csv/diagnostic_data.csv'):
    """Flatten the DiagnosticReport entries of JSON bundles into a CSV file.

    Every ``*.json`` file under ``root_directory_path`` is read, its ``entry``
    list is flattened, and the entries whose ``resource.resourceType`` is
    ``'DiagnosticReport'`` are kept. The output has one row per
    (report, result) pair; a report without results still yields one row with
    empty ``result_reference`` / ``result_display``.

    Parameters
    ----------
    root_directory_path : str or path-like
        Directory holding the JSON bundles (with or without a trailing
        separator). If it is not an existing directory it is used as a plain
        filename prefix, i.e. ``f"{root_directory_path}*.json"``.
    output_path : str, optional
        Where the CSV is written. Missing parent directories are created.

    Returns
    -------
    pandas.DataFrame
        The frame that was written, with columns ``Diagnostic Report ID``,
        ``Status``, ``Diagnostic Name``, ``Patient ID``, ``Encounter ID``,
        ``Diagnostic Date Time``, ``Performer``, ``result_reference`` and
        ``result_display``.

    Raises
    ------
    ValueError
        If no JSON file matches.
    KeyError
        If the data has no ``resource.resourceType`` or ``resource.id`` column.
    """
    import os  # local import: the notebook's import cell only provides pandas and glob

    def _first_display(items):
        """Return the first non-null 'display' in a list of dicts, else None."""
        if isinstance(items, list):
            for item in items:
                if isinstance(item, dict) and item.get('display') is not None:
                    return item['display']
        return None

    def _strip_urn(column):
        """Remove the 'urn:uuid:' text from string cells; pass None/NaN through."""
        if column is None:
            return None
        return column.map(
            lambda ref: ref.replace('urn:uuid:', '') if isinstance(ref, str) else ref
        )

    def _dict_field(record, key):
        """Read `key` from a result dict; None for NaN/missing results."""
        return record.get(key) if isinstance(record, dict) else None

    root = os.fspath(root_directory_path)
    # Accept the directory with or without a trailing separator, while still
    # honouring the old "prefix + *.json" behaviour for non-directory inputs.
    if os.path.isdir(root):
        pattern = os.path.join(root, '*.json')
    else:
        pattern = f"{root}*.json"
    # Sorted so the row order of the output does not depend on filesystem order.
    file_paths = sorted(glob.glob(pattern))
    if not file_paths:
        raise ValueError(f"No JSON files found matching {pattern!r}")

    bundles = pd.concat([pd.read_json(path) for path in file_paths], ignore_index=True)
    entries = pd.json_normalize(bundles['entry'].tolist())

    is_report = entries['resource.resourceType'] == 'DiagnosticReport'
    diagnostic_data = entries[is_report].reset_index(drop=True)

    # The name and performer are picked per row (first 'display' of the list)
    # instead of explode + json_normalize: that combination produces one row per
    # list element, so a report with several codings/performers would shift
    # every later value onto the wrong report.
    coding = diagnostic_data.get('resource.code.coding')
    performer = diagnostic_data.get('resource.performer')

    result_df = pd.DataFrame({
        'Diagnostic Report ID': diagnostic_data['resource.id'],
        'Status': diagnostic_data.get('resource.status'),
        'Diagnostic Name': None if coding is None else coding.map(_first_display),
        'Patient ID': _strip_urn(diagnostic_data.get('resource.subject.reference')),
        'Encounter ID': _strip_urn(diagnostic_data.get('resource.encounter.reference')),
        'Diagnostic Date Time': diagnostic_data.get('resource.effectiveDateTime'),
        'Performer': None if performer is None else performer.map(_first_display),
        'Result': diagnostic_data.get('resource.result'),
    })

    # One row per result; reports without results keep a single row with NaN.
    result_df = result_df.explode('Result', ignore_index=True)
    results = result_df.pop('Result')
    result_df['result_reference'] = [_dict_field(item, 'reference') for item in results]
    result_df['result_display'] = [_dict_field(item, 'display') for item in results]

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    result_df.to_csv(output_path, index=False)
    return result_df

In [126]:
extract_diagnostic_data('./health_data/healthcare-datasets/')

# Process Diagnostic Data

In [1]:
import pandas as pd
# Load the diagnostic CSV from ./output_csv (the same path the extraction step writes to).
diagnostic = pd.read_csv('./output_csv/diagnostic_data.csv')

In [3]:
diagnostic.head()

Unnamed: 0,Diagnostic Report ID,Status,Diagnostic Name,Patient ID,Encounter ID,Diagnostic Date Time,Performer,result_reference,result_display
0,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:616bd45e-f48a-4935-be31-61f3d2c91a58,Total Cholesterol
1,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:d8d0648f-7c89-4ab1-a877-bf88e3ee69a8,Triglycerides
2,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:76c1a941-d92a-4aee-ad0b-11e09939069e,Low Density Lipoprotein Cholesterol
3,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:f745b0d8-0458-411d-966f-97a8fbf09398,High Density Lipoprotein Cholesterol
4,f74a3d92-b549-4a12-9e10-6a6d2204b59b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,0e41d495-029e-4338-8173-25e62f81a199,2010-10-30T16:48:25-04:00,Hospital Lab,urn:uuid:9d0430d9-d252-4eb7-b9e6-3229bfbc477d,Glucose


In [129]:
diagnostic.describe()

Unnamed: 0,Diagnostic Report ID,Status,Diagnostic Name,Patient ID,Encounter ID,Diagnostic Date Time,Performer,result_reference,result_display
count,524,524,524,524,524,524,524,524,524
unique,89,1,3,23,69,69,1,524,13
top,31f7ba53-5e87-44f3-bf78-c908b69ed740,final,Basic Metabolic Panel,aa3973d9-b64f-4a36-8cb6-f2719080b52f,b8edeb00-4f36-47b5-9b99-12a05ad57691,2014-09-11T11:29:25-04:00,Hospital Lab,urn:uuid:616bd45e-f48a-4935-be31-61f3d2c91a58,Glucose
freq,8,524,360,108,12,12,524,1,45


In [130]:
# Widen the column display so long id/reference strings are not truncated.
# Note: pd.set_option changes the setting for the whole session, not just this cell.
pd.set_option('display.max_colwidth',400)
# First four rows of the report id next to its result reference and display text.
diagnostic[['Diagnostic Report ID','result_reference','result_display']][:4]

Unnamed: 0,Diagnostic Report ID,result_reference,result_display
0,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,urn:uuid:616bd45e-f48a-4935-be31-61f3d2c91a58,Total Cholesterol
1,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,urn:uuid:d8d0648f-7c89-4ab1-a877-bf88e3ee69a8,Triglycerides
2,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,urn:uuid:76c1a941-d92a-4aee-ad0b-11e09939069e,Low Density Lipoprotein Cholesterol
3,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,urn:uuid:f745b0d8-0458-411d-966f-97a8fbf09398,High Density Lipoprotein Cholesterol


In [4]:
diagnostic['Encounter ID'].unique()

array(['e23fb9ad-56f4-4e34-8073-6c89277f03f8',
       '0e41d495-029e-4338-8173-25e62f81a199',
       'b2a27b96-82b2-4755-b5be-ec6aff5b8028',
       '1f6fc603-26d8-4dd2-b8c5-3fb1bbdb12dd',
       '98bfac77-82aa-4ffa-a63c-06f13e722550',
       '61bc1b33-2041-4d0e-850e-96701198a43d',
       'fee0f09f-2c53-46e9-a50f-fe69d3238996',
       'd8294703-8020-4f30-b8b5-01b76624d85c',
       '0a4e4fe4-13e3-486d-b090-03093669f7f3',
       '1696a51a-fbb7-4a7a-9004-bb2ea5094087',
       '9c6ee96e-445f-42b9-a711-d8acb2788bc3',
       '01e1e451-d7c5-4876-b4e7-d499eaada844',
       '638fee81-9b8f-4896-8536-407e34db67c0',
       'ba8b290e-98ba-4c40-90bd-a0a500b339b8',
       '5b6a51a4-5c13-42b0-b885-8936c9cc36f3',
       'f5b01614-f569-4e9e-b482-cc0231a7c20a',
       'ae0ab80e-1d3c-4c78-9ff5-1864cb415c0e',
       '353bd039-42e0-4750-981c-50d594b67a4d',
       '5948265c-890d-4b41-9b89-61b195cfa4c5',
       'e474dde0-32b2-404c-8622-bc008af313e4',
       '6151bdfd-192a-457e-9db2-95f9ef8f636f',
       'd53f9

In [5]:
diagnostic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524 entries, 0 to 523
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Diagnostic Report ID  524 non-null    object
 1   Status                524 non-null    object
 2   Diagnostic Name       524 non-null    object
 3   Patient ID            524 non-null    object
 4   Encounter ID          524 non-null    object
 5   Diagnostic Date Time  524 non-null    object
 6   Performer             524 non-null    object
 7   result_reference      524 non-null    object
 8   result_display        524 non-null    object
dtypes: object(9)
memory usage: 37.0+ KB


In [6]:
# Rows whose 'Diagnostic Report ID' already appeared in an earlier row
# (duplicated() leaves the first occurrence of each id unmarked).
diagnostic[diagnostic.duplicated(subset=['Diagnostic Report ID'])]

Unnamed: 0,Diagnostic Report ID,Status,Diagnostic Name,Patient ID,Encounter ID,Diagnostic Date Time,Performer,result_reference,result_display
1,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:d8d0648f-7c89-4ab1-a877-bf88e3ee69a8,Triglycerides
2,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:76c1a941-d92a-4aee-ad0b-11e09939069e,Low Density Lipoprotein Cholesterol
3,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:f745b0d8-0458-411d-966f-97a8fbf09398,High Density Lipoprotein Cholesterol
5,f74a3d92-b549-4a12-9e10-6a6d2204b59b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,0e41d495-029e-4338-8173-25e62f81a199,2010-10-30T16:48:25-04:00,Hospital Lab,urn:uuid:bbcc29fd-f9ba-493b-93d3-ea447e5a4767,Urea Nitrogen
6,f74a3d92-b549-4a12-9e10-6a6d2204b59b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,0e41d495-029e-4338-8173-25e62f81a199,2010-10-30T16:48:25-04:00,Hospital Lab,urn:uuid:142376f3-b7fc-4fb4-b242-65b0d6e5be73,Creatinine
...,...,...,...,...,...,...,...,...,...
518,9422835c-c46a-4479-a851-ac09d1f23b08,final,Lipid Panel,a5399e95-981c-4a78-8019-873662fc7901,b7bb6bd2-9c67-44c5-9595-43a0844262fb,2010-08-17T16:46:17-04:00,Hospital Lab,urn:uuid:9844d323-287c-4457-a25f-aa958bbb3f53,Low Density Lipoprotein Cholesterol
519,9422835c-c46a-4479-a851-ac09d1f23b08,final,Lipid Panel,a5399e95-981c-4a78-8019-873662fc7901,b7bb6bd2-9c67-44c5-9595-43a0844262fb,2010-08-17T16:46:17-04:00,Hospital Lab,urn:uuid:4d170e5a-3ed0-466b-9d0e-b628f2a39bc6,High Density Lipoprotein Cholesterol
521,4376e02a-1e53-4d33-b127-1731d53b1dfb,final,Lipid Panel,a5399e95-981c-4a78-8019-873662fc7901,d90f6271-c1ce-4d06-831e-060142761020,2014-02-20T22:44:54-05:00,Hospital Lab,urn:uuid:8db05e1e-3cff-449c-a1a4-54779020fb80,Triglycerides
522,4376e02a-1e53-4d33-b127-1731d53b1dfb,final,Lipid Panel,a5399e95-981c-4a78-8019-873662fc7901,d90f6271-c1ce-4d06-831e-060142761020,2014-02-20T22:44:54-05:00,Hospital Lab,urn:uuid:3dcc9ef2-fc12-4b09-8654-b80daae05948,Low Density Lipoprotein Cholesterol


In [8]:
# Summary of the frame reduced to the first row per report id.
# The result is not assigned, so `diagnostic` itself is left unchanged.
diagnostic.drop_duplicates(subset=['Diagnostic Report ID']).info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, 0 to 520
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Diagnostic Report ID  89 non-null     object
 1   Status                89 non-null     object
 2   Diagnostic Name       89 non-null     object
 3   Patient ID            89 non-null     object
 4   Encounter ID          89 non-null     object
 5   Diagnostic Date Time  89 non-null     object
 6   Performer             89 non-null     object
 7   result_reference      89 non-null     object
 8   result_display        89 non-null     object
dtypes: object(9)
memory usage: 7.0+ KB


In [3]:
# Number of rows with a missing 'Diagnostic Report ID'.
len(diagnostic[diagnostic['Diagnostic Report ID'].isna()])

0

In [4]:
diagnostic.head()

Unnamed: 0,Diagnostic Report ID,Status,Diagnostic Name,Patient ID,Encounter ID,Diagnostic Date Time,Performer,result_reference,result_display
0,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:616bd45e-f48a-4935-be31-61f3d2c91a58,Total Cholesterol
1,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:d8d0648f-7c89-4ab1-a877-bf88e3ee69a8,Triglycerides
2,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:76c1a941-d92a-4aee-ad0b-11e09939069e,Low Density Lipoprotein Cholesterol
3,a07a9a48-de2f-4b5a-896e-c44d56eeb3c0,final,Lipid Panel,a195633e-d36d-4abe-92bb-02994e9cb348,e23fb9ad-56f4-4e34-8073-6c89277f03f8,2014-03-19T16:56:31-04:00,Hospital Lab,urn:uuid:f745b0d8-0458-411d-966f-97a8fbf09398,High Density Lipoprotein Cholesterol
4,f74a3d92-b549-4a12-9e10-6a6d2204b59b,final,Basic Metabolic Panel,d0ed1873-9ac6-4c98-813e-86a041ce78c5,0e41d495-029e-4338-8173-25e62f81a199,2010-10-30T16:48:25-04:00,Hospital Lab,urn:uuid:9d0430d9-d252-4eb7-b9e6-3229bfbc477d,Glucose
