# Patient Data Cleaning
# 01_patient_data_cleaning

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 05/09/2025   | Prathik | Created   | Created to perform alternative preprocessing and data understanding for patient data| 
| 17/09/2025   | Prathik | New   | Completed cleaning for patient data and pushed to main branch in repo |
| 27/09/2025 | Adrienne | Update | Added code used to further clean the Patient data |

# Import All Necessary Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
import os
import json
from collections import OrderedDict
import pickle

# Preprocess JSON

In [None]:
# Read the raw Patient NDJSON file (lines=True: one JSON record per line).
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform-dependent default encoding of open().
with open('data/raw/Patient.ndjson', 'r', encoding='utf-8') as f:
    patient_df = pd.read_json(StringIO(f.read()), lines=True)

In [None]:
def flatten_json(nested_json, prefix=''):
    """
    Recursively flattens a nested JSON object or dictionary into a single level.

    Args:
        nested_json: Mapping to flatten (any object exposing ``.items()``).
        prefix: String prepended to every key produced at this level. The
            recursion uses it to carry the underscore-joined path of the
            parent keys; callers normally leave it at the default.

    Returns:
        OrderedDict, a flattened version of the input, where keys represent
        the nested structure and values are the corresponding leaf data.

    Notes:
        - Nested dictionaries are flattened such that keys from deeper levels
          in the hierarchy are concatenated with underscores
        - A non-empty list made up only of dictionaries is expanded item by
          item, with the item's index number appended to the key
        - Every other list (empty, scalars only, or a mix of dictionaries and
          other values) is stored under its own key as a JSON-encoded string
    """
    out = OrderedDict()
    for key, value in nested_json.items():
        path = prefix + key
        if isinstance(value, dict):
            # Recursively flatten nested dictionaries
            out.update(flatten_json(value, path + '_'))
        elif isinstance(value, list):
            # Inspect every element, not just the first one: a list that
            # starts with a dict but also holds non-dict items cannot be
            # expanded key by key and would otherwise fail on `.items()`.
            if value and all(isinstance(item, dict) for item in value):
                for i, item in enumerate(value):
                    out.update(flatten_json(item, path + '_' + str(i) + '_'))
            else:
                # Empty, scalar or mixed lists are serialized into a JSON string
                out[path] = json.dumps(value)
        else:
            # Base case: key-value pair where value is not a list or dict
            out[path] = value
    return out

def process_dataframe(df):
    """
    Builds a new pandas DataFrame in which every dict- or list-valued cell of
    `df` has been expanded into flat columns.

    Notes:
        - Rows are visited one at a time; each cell holding a dictionary or a
          list is passed to `flatten_json` under its column name
        - Cells holding anything else are copied across under the original
          column name
        - The result mixes the original scalar columns with the columns
          produced by flattening
        - Returns a new pandas DataFrame assembled from the per-row records
    """
    records = []
    for _, series in df.iterrows():
        record = {}
        for name, cell in series.items():
            if not isinstance(cell, (dict, list)):
                # Scalar cell: carry it over untouched
                record[name] = cell
                continue
            # Nested cell: expand it, using the column name as the root key
            record.update(flatten_json({name: cell}))
        records.append(record)
    return pd.DataFrame(records)



In [None]:
# Expand the nested dict/list columns of patient_df into flat columns
flat_patient_df = process_dataframe(df=patient_df)

## Patient Data

__Columns__

- `address` - Address information of patient
- `birthDate` - Birth date of patient
- `deceasedBoolean` - Boolean flag marking if patient is deceased
- `extension` - FHIR extension entries (US Core structure-definition URLs with their coded values) (dropped)
- `gender` - Gender of patient
- `id` - Patient ID
- `identifier` - Includes Patient and Medicare numbers for patient, and indicates active plans
- `meta` - Date when record was last updated
- `name` - Name of patient (family and given names)
- `resourceType` - Identifier for data type (Patient for this data)
- `deceasedDateTime` - Date and time at which the patient died (present only if the patient is deceased, otherwise NaN)

# Print Unflattened Patient Dataframe

In [4]:
# Display the raw (unflattened) Patient records.
# NOTE(review): this path differs from 'data/raw/Patient.ndjson', which is used
# to load patient_df earlier in the notebook — confirm which location is current.
pd.read_json("data/Patient.ndjson", lines=True)

Unnamed: 0,address,birthDate,deceasedBoolean,extension,gender,id,identifier,meta,name,resourceType,deceasedDateTime
0,[{'state': '22'}],1953-10-12,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,female,-10000000009392,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:02.459+00:00...,"[{'family': 'Mohr916', 'given': ['Mina319'], '...",Patient,
1,"[{'postalCode': '01420', 'state': '22'}]",1946-03-01,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,female,-10000000000852,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:00.067+00:00...,"[{'family': 'Kshlerin58', 'given': ['Leda374']...",Patient,
2,"[{'postalCode': '02188', 'state': '22'}]",1947-08-31,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,male,-10000000009579,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:02.729+00:00...,"[{'family': 'Morissette863', 'given': ['Jeromy...",Patient,
3,"[{'postalCode': '01602', 'state': '22'}]",1952-01-13,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,male,-10000000003977,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:00.256+00:00...,"[{'family': 'Vanegas191', 'given': ['Sergio619...",Patient,
4,[{'state': '22'}],1943-10-05,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,male,-10000000001995,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:00.113+00:00...,"[{'family': 'Aufderhar910', 'given': ['Fernand...",Patient,
...,...,...,...,...,...,...,...,...,...,...,...
4995,[{'state': '22'}],1955-04-21,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,male,-10000000003689,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:00.242+00:00...,"[{'family': 'Gutkowski940', 'given': ['Henry76...",Patient,
4996,[{'state': '22'}],1954-06-20,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,female,-10000000005779,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:00.536+00:00...,"[{'family': 'Mohr916', 'given': ['Marisa391'],...",Patient,
4997,"[{'postalCode': '02115', 'state': '22'}]",1945-09-02,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,female,-10000000004518,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:00.361+00:00...,"[{'family': 'Rau926', 'given': ['Chan58'], 'us...",Patient,
4998,[{'state': '22'}],1941-11-03,0.0,[{'url': 'http://hl7.org/fhir/us/core/Structur...,male,-10000000008835,[{'system': 'https://bluebutton.cms.gov/resour...,{'lastUpdated': '2021-08-17T17:43:01.961+00:00...,"[{'family': 'Mraz590', 'given': ['Emmanuel930'...",Patient,


# Print Flattened Patient Dataframe

In [3]:
# Render the whole flattened frame as one text table, then print it
full_table_text = flat_patient_df.to_string()
print(full_table_text)

     address_0_state   birthDate  deceasedBoolean                                              extension_0_url extension_0_valueCode                                      extension_1_url extension_1_valueCoding_code extension_1_valueCoding_display                       extension_1_valueCoding_system extension_2_extension_0_url extension_2_extension_0_valueCoding_code extension_2_extension_0_valueCoding_display           extension_2_extension_0_valueCoding_system extension_2_extension_1_url extension_2_extension_1_valueString                                               extension_2_url                                          extension_3_url extension_3_valueDate  gender              id                                     identifier_0_system identifier_0_type_coding_0_code identifier_0_type_coding_0_display              identifier_0_type_coding_0_system identifier_0_value             identifier_1_system identifier_1_type_coding_0_code identifier_1_type_coding_0_display                  

In [None]:
# Derive the patient-level columns from the flattened identifier/name fields

# Medicare number: value of the second identifier entry, copied as-is
medicare_numbers = flat_patient_df['identifier_1_value']
flat_patient_df['patient_medicare_number'] = medicare_numbers

# Patient number: value of the first identifier entry with hyphens removed
raw_patient_numbers = flat_patient_df['identifier_0_value']
flat_patient_df['patient_number'] = raw_patient_numbers.str.replace(r'[-]', '', regex=True)

# First name: strip spaces, square brackets and double quotes from the given-name field
raw_given_names = flat_patient_df['name_0_given']
flat_patient_df['patient_first_name'] = raw_given_names.str.replace(r'[ \[ \]"]', '', regex=True)

# Last name: family name copied as-is
family_names = flat_patient_df['name_0_family']
flat_patient_df['patient_last_name'] = family_names

2

In [None]:
# Save the cleaned patient frame to pickle
output_path = "../data/clean/patient.pkl"
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
flat_patient_df.to_pickle(output_path)