In [2]:
import pandas as  pd
import glob

In [3]:
file_paths = glob.glob(f"./health_data/healthcare-datasets/*.json")
dfs = [pd.read_json(file_path) for file_path in file_paths]
combined_df = pd.concat(dfs, ignore_index=True)
entries = pd.json_normalize(combined_df['entry'])

In [4]:
entries['resource.resourceType'].unique()

array(['Patient', 'Encounter', 'Condition', 'Observation',
       'MedicationRequest', 'Immunization', 'Procedure', 'CarePlan',
       'AllergyIntolerance', 'DiagnosticReport'], dtype=object)

In [7]:
care_plan_data = entries[entries['resource.resourceType'] == 'CarePlan']
care_plan_data.reset_index(drop=True,inplace=True)
care_plan_data.columns

Index(['fullUrl', 'resource.id', 'resource.text.status', 'resource.text.div',
       'resource.extension', 'resource.identifier', 'resource.name',
       'resource.telecom', 'resource.gender', 'resource.birthDate',
       'resource.address', 'resource.multipleBirthBoolean', 'resource.photo',
       'resource.resourceType', 'resource.status', 'resource.class.code',
       'resource.type', 'resource.patient.reference', 'resource.period.start',
       'resource.period.end', 'resource.reason.coding',
       'resource.clinicalStatus', 'resource.verificationStatus',
       'resource.code.coding', 'resource.subject.reference',
       'resource.context.reference', 'resource.onsetDateTime',
       'resource.abatementDateTime', 'resource.encounter.reference',
       'resource.effectiveDateTime', 'resource.valueQuantity.value',
       'resource.valueQuantity.unit', 'resource.valueQuantity.system',
       'resource.valueQuantity.code', 'resource.stage.coding',
       'resource.medicationCodeableCo

## Normalization!!!

In [25]:
care_plan_category = care_plan_data.get('resource.category',None)
if not care_plan_category is None:
    care_plan_code = pd.json_normalize(care_plan_category,[0,'coding']).get('code',None)
else:
    care_plan_code = None
care_plan_code

0            53950000
1           134435003
2     326051000000105
3     326051000000105
4     872781000000100
           ...       
73          412776001
74          182964004
75          385691007
76           47387005
77          385691007
Name: code, Length: 78, dtype: object

In [26]:
if not care_plan_category is None:
    care_plan_name = pd.json_normalize(care_plan_category,[0,'coding']).get('display',None)
else:
    care_plan_name = None
care_plan_name

0                                   Respiratory therapy
1                                Routine antenatal care
2                                             Self care
3                                             Self care
4                                  Musculoskeletal care
                            ...                        
73    Chronic obstructive pulmonary disease clinical...
74                                        Terminal care
75                                        Fracture care
76                           Head injury rehabilitation
77                                        Fracture care
Name: display, Length: 78, dtype: object

In [24]:
care_plan_status = care_plan_data.get('resource.status',None)
care_plan_status

0     completed
1     completed
2        active
3        active
4     completed
        ...    
73       active
74       active
75    completed
76    completed
77    completed
Name: resource.status, Length: 78, dtype: object

In [27]:
care_plan_start_date = care_plan_data.get('resource.period.start',None)
care_plan_start_date

0     2015-01-07
1     2015-05-08
2     1976-11-24
3     1976-12-07
4     2016-09-11
         ...    
73    1956-09-30
74    1985-10-29
75    2013-04-21
76    2009-12-24
77    2014-02-07
Name: resource.period.start, Length: 78, dtype: object

In [28]:
care_plan_end_date = care_plan_data.get('resource.period.end',None)
care_plan_end_date

0     2015-11-01
1     2015-12-04
2            NaN
3            NaN
4     2016-10-11
         ...    
73           NaN
74           NaN
75    2013-08-17
76    2010-03-06
77    2014-05-18
Name: resource.period.end, Length: 78, dtype: object

In [29]:
care_plan_patient_id = care_plan_data['resource.subject.reference'].str.replace('urn:uuid:','')
care_plan_patient_id

0     65d12976-9588-4cfa-a795-216302a2ece9
1     65d12976-9588-4cfa-a795-216302a2ece9
2     a195633e-d36d-4abe-92bb-02994e9cb348
3     a195633e-d36d-4abe-92bb-02994e9cb348
4     a195633e-d36d-4abe-92bb-02994e9cb348
                      ...                 
73    1bd855fa-45c4-4c23-a608-fdeeb3bc2d8b
74    1bd855fa-45c4-4c23-a608-fdeeb3bc2d8b
75    00d869e2-6793-4ebf-9340-38bf18d223c4
76    a5399e95-981c-4a78-8019-873662fc7901
77    a5399e95-981c-4a78-8019-873662fc7901
Name: resource.subject.reference, Length: 78, dtype: object

In [30]:
care_plan_encounter_id = care_plan_data['resource.context.reference'].str.replace('urn:uuid:','')
care_plan_encounter_id

0     ddc2fb3c-6ea7-4b66-931b-0a0b4bfd3a75
1     80fd8fc5-4770-4e35-b31c-4d7b1be19c2f
2     fe054716-3d1d-4652-8564-01a86d58cd87
3     fe054716-3d1d-4652-8564-01a86d58cd87
4     9b133606-85fd-4199-adf4-2fdf90027fed
                      ...                 
73    dc68d226-b62b-45b8-baab-a1cc6d26427f
74    dc68d226-b62b-45b8-baab-a1cc6d26427f
75    8625df2e-3c5d-4dc2-83c7-714a126d407e
76    14e39e12-688a-4a2d-942d-d761cff4123e
77    7f88e6e7-b838-497a-86d6-aaf35b8cdf29
Name: resource.context.reference, Length: 78, dtype: object

In [31]:
result_df = pd.DataFrame({
'Care Plan Code':care_plan_code,
'Status':care_plan_status,
'Care Plan Name':care_plan_name,
'Patient ID':care_plan_patient_id,
'Encounter ID':care_plan_encounter_id,
'Care Plan Start Date':care_plan_start_date,
'Care Plan End Date':care_plan_end_date
})
result_df.describe()

Unnamed: 0,Care Plan Code,Status,Care Plan Name,Patient ID,Encounter ID,Care Plan Start Date,Care Plan End Date
count,78,78,78,78,78,78,40
unique,16,2,16,26,52,78,40
top,134435003,completed,Routine antenatal care,7384d82c-2d1e-4595-99e4-f0ae962dddf1,f73f194d-1df1-42f9-8fdf-325cc87b2348,2015-01-07,2015-11-01
freq,14,40,14,6,6,1,1


In [33]:
result_df.groupby('Patient ID').agg({
    'Care Plan Code':lambda x: list(x.unique())
})

Unnamed: 0_level_0,Care Plan Code
Patient ID,Unnamed: 1_level_1
00d869e2-6793-4ebf-9340-38bf18d223c4,[385691007]
117335bb-33ff-4897-ac27-af4b3f11135f,"[225358003, 91251008]"
1bd855fa-45c4-4c23-a608-fdeeb3bc2d8b,"[412776001, 182964004]"
21e19329-b572-414b-9cd1-ecf96af4f568,"[326051000000105, 412776001]"
32c72cdb-66bf-4301-895e-16da54ffd72d,"[869761000000107, 225358003]"
3a5357d9-13f3-42de-8fdc-14a1a6cae148,"[326051000000105, 698360004, 386257007]"
509474a0-baf3-48f2-a595-67b6e2b34c87,[53950000]
5265a2b0-1bbc-4993-9042-b8d86ba65f10,"[134435003, 698360004]"
65d12976-9588-4cfa-a795-216302a2ece9,"[53950000, 134435003]"
7384d82c-2d1e-4595-99e4-f0ae962dddf1,"[711282006, 326051000000105, 699728000, 385691..."


In [36]:
result_df.groupby(['Care Plan Code','Care Plan Name']).agg({
    'Patient ID':lambda x: list(x.unique())
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Patient ID
Care Plan Code,Care Plan Name,Unnamed: 2_level_1
134435003,Routine antenatal care,"[65d12976-9588-4cfa-a795-216302a2ece9, 80bb41d..."
182964004,Terminal care,[1bd855fa-45c4-4c23-a608-fdeeb3bc2d8b]
225358003,Wound care,"[d0ed1873-9ac6-4c98-813e-86a041ce78c5, 117335b..."
326051000000105,Self care,"[a195633e-d36d-4abe-92bb-02994e9cb348, 3a5357d..."
385691007,Fracture care,"[bd0acd69-580e-4896-9974-2ad9d041ca0b, 7384d82..."
386257007,Demential management,[3a5357d9-13f3-42de-8fdc-14a1a6cae148]
412776001,Chronic obstructive pulmonary disease clinical management plan,"[f10fd607-58ce-4be6-b1bd-a98898c8e0e5, cb13a48..."
47387005,Head injury rehabilitation,[a5399e95-981c-4a78-8019-873662fc7901]
53950000,Respiratory therapy,"[65d12976-9588-4cfa-a795-216302a2ece9, d0ed187..."
698358001,Angina self management plan,"[d0ed1873-9ac6-4c98-813e-86a041ce78c5, aa3973d..."


## Care Plan Data Extraction

In [37]:
import pandas as pd
import glob
def extract_care_plan_data(root_directory_path):
    file_paths = glob.glob(f"{root_directory_path}*.json")
    dfs = [pd.read_json(file_path) for file_path in file_paths]
    combined_df = pd.concat(dfs, ignore_index=True)
    entries = pd.json_normalize(combined_df['entry'])

    care_plan_data = entries[entries['resource.resourceType'] == 'CarePlan']
    care_plan_data.reset_index(drop=True,inplace=True)

    care_plan_category = care_plan_data.get('resource.category',None)
    if not care_plan_category is None:
        care_plan_code = pd.json_normalize(care_plan_category,[0,'coding']).get('code',None)
    else:
        care_plan_code = None
    if not care_plan_category is None:
        care_plan_name = pd.json_normalize(care_plan_category,[0,'coding']).get('display',None)
    else:
        care_plan_name = None
    care_plan_status = care_plan_data.get('resource.status',None)
    care_plan_start_date = care_plan_data.get('resource.period.start',None)
    care_plan_end_date = care_plan_data.get('resource.period.end',None)
    care_plan_patient_id = care_plan_data['resource.subject.reference'].str.replace('urn:uuid:','')
    care_plan_encounter_id = care_plan_data['resource.context.reference'].str.replace('urn:uuid:','')

    result_df = pd.DataFrame({
    'Care Plan Code':care_plan_code,
    'Status':care_plan_status,
    'Care Plan Name':care_plan_name,
    'Patient ID':care_plan_patient_id,
    'Encounter ID':care_plan_encounter_id,
    'Care Plan Start Date':care_plan_start_date,
    'Care Plan End Date':care_plan_end_date
    })

    result_df.to_csv('./output_csv/careplan_data.csv',index=False)


In [38]:
extract_care_plan_data('./health_data/healthcare-datasets/')

## Process care plan data

In [39]:
import pandas as pd
care_plan = pd.read_csv('./output_csv/careplan_data.csv')

In [40]:
care_plan.head()

Unnamed: 0,Care Plan Code,Status,Care Plan Name,Patient ID,Encounter ID,Care Plan Start Date,Care Plan End Date
0,53950000,completed,Respiratory therapy,65d12976-9588-4cfa-a795-216302a2ece9,ddc2fb3c-6ea7-4b66-931b-0a0b4bfd3a75,2015-01-07,2015-11-01
1,134435003,completed,Routine antenatal care,65d12976-9588-4cfa-a795-216302a2ece9,80fd8fc5-4770-4e35-b31c-4d7b1be19c2f,2015-05-08,2015-12-04
2,326051000000105,active,Self care,a195633e-d36d-4abe-92bb-02994e9cb348,fe054716-3d1d-4652-8564-01a86d58cd87,1976-11-24,
3,326051000000105,active,Self care,a195633e-d36d-4abe-92bb-02994e9cb348,fe054716-3d1d-4652-8564-01a86d58cd87,1976-12-07,
4,872781000000100,completed,Musculoskeletal care,a195633e-d36d-4abe-92bb-02994e9cb348,9b133606-85fd-4199-adf4-2fdf90027fed,2016-09-11,2016-10-11


In [43]:
care_plan.describe(include='all')

Unnamed: 0,Care Plan Code,Status,Care Plan Name,Patient ID,Encounter ID,Care Plan Start Date,Care Plan End Date
count,78.0,78,78,78,78,78,40
unique,,2,16,26,52,78,40
top,,completed,Routine antenatal care,7384d82c-2d1e-4595-99e4-f0ae962dddf1,f73f194d-1df1-42f9-8fdf-325cc87b2348,2015-01-07,2015-11-01
freq,,40,14,6,6,1,1
mean,106070700000000.0,,,,,,
std,233594300000000.0,,,,,,
min,47387000.0,,,,,,
25%,134435000.0,,,,,,
50%,305524500.0,,,,,,
75%,699386000.0,,,,,,


In [44]:
care_plan.groupby('Patient ID').agg({
    'Care Plan Code':lambda x: list(x.unique())
})

Unnamed: 0_level_0,Care Plan Code
Patient ID,Unnamed: 1_level_1
00d869e2-6793-4ebf-9340-38bf18d223c4,[385691007]
117335bb-33ff-4897-ac27-af4b3f11135f,"[225358003, 91251008]"
1bd855fa-45c4-4c23-a608-fdeeb3bc2d8b,"[412776001, 182964004]"
21e19329-b572-414b-9cd1-ecf96af4f568,"[326051000000105, 412776001]"
32c72cdb-66bf-4301-895e-16da54ffd72d,"[869761000000107, 225358003]"
3a5357d9-13f3-42de-8fdc-14a1a6cae148,"[326051000000105, 698360004, 386257007]"
509474a0-baf3-48f2-a595-67b6e2b34c87,[53950000]
5265a2b0-1bbc-4993-9042-b8d86ba65f10,"[134435003, 698360004]"
65d12976-9588-4cfa-a795-216302a2ece9,"[53950000, 134435003]"
7384d82c-2d1e-4595-99e4-f0ae962dddf1,"[711282006, 326051000000105, 699728000, 385691..."
