# Patient Data Cleaning

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 05/09/2025   | Prathik | Created   | Created to perform alternative preprocessing and data understanding for patient data| 
| 17/09/2025   | Prathik | New   | Completed cleaning for patient data and pushed to main branch in repo |

# Import All Necessary Libraries

In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
import os
import json
from collections import OrderedDict
import pickle

# Preprocess JSON

In [23]:
# Read the newline-delimited JSON export (one Patient record per line)
path = "../data/raw"
patient_file = f"{path}/Patient.ndjson"
patient = pd.read_json(patient_file, lines=True)

In [9]:
# Show the column index of the raw `patient` frame before any flattening
print(f"keys: {patient.keys()}")

keys: Index(['address', 'birthDate', 'deceasedBoolean', 'extension', 'gender', 'id',
       'identifier', 'meta', 'name', 'resourceType', 'deceasedDateTime'],
      dtype='object')


In [12]:
# Peek at the nested `identifier` value stored under index label 0
patient["identifier"][0]

[{'system': 'https://bluebutton.cms.gov/resources/variables/bene_id',
  'type': {'coding': [{'code': 'MB',
     'display': 'Member Number',
     'system': 'http://terminology.hl7.org/CodeSystem/v2-0203'}]},
  'value': '-10000000009392'},
 {'system': 'http://hl7.org/fhir/sid/us-mbi',
  'type': {'coding': [{'code': 'MC',
     'display': "Patient's Medicare number",
     'extension': [{'url': 'https://bluebutton.cms.gov/resources/codesystem/identifier-currency',
       'valueCoding': {'code': 'current',
        'display': 'Current',
        'system': 'https://bluebutton.cms.gov/resources/codesystem/identifier-currency'}}],
     'system': 'http://terminology.hl7.org/CodeSystem/v2-0203'}]},
  'value': '1S00E00FR92'}]

In [19]:
# with open('data/raw/Patient.ndjson', 'r') as f:
#     patient_df = pd.read_json(StringIO(f.read()), lines=True)

def flatten_json(nested_json, prefix=''):
    """
    Recursively flatten a nested JSON object (dict) into a single-level mapping.

    Args:
        nested_json: Dictionary to flatten. Values may be scalars, dictionaries
            or lists.
        prefix: String prepended to every generated key. The recursion uses it
            to carry the underscore-joined path of the parent keys.

    Returns:
        OrderedDict whose keys encode the nested path and whose values are the
        corresponding leaf values, in the order they were encountered.

    Notes:
        - Keys from deeper levels are concatenated with underscores.
        - A non-empty list whose elements are ALL dictionaries is expanded
          element by element, with the element index inserted into the key
          (e.g. ``identifier_0_value``).
        - Any other list (empty, scalars only, or a mix of dictionaries and
          non-dictionaries) is stored as a JSON string via ``json.dumps``.
        - An empty dictionary contributes no keys to the result.
        - If two paths produce the same flattened key, the later one wins.
    """
    out = OrderedDict()
    for key, value in nested_json.items():
        full_key = prefix + key
        if isinstance(value, dict):
            # Recursively flatten nested dictionaries
            out.update(flatten_json(value, full_key + '_'))
        elif isinstance(value, list):
            # Check every element, not just the first: a mixed list such as
            # [{"a": 1}, "x"] cannot be flattened element-wise because the
            # non-dict items have no .items()
            if value and all(isinstance(item, dict) for item in value):
                for i, item in enumerate(value):
                    out.update(flatten_json(item, full_key + '_' + str(i) + '_'))
            else:
                # Empty, scalar-only and mixed lists are kept as one JSON string
                out[full_key] = json.dumps(value)
        else:
            # Base case: value is neither a list nor a dict
            out[full_key] = value
    return out

def process_dataframe(df):
    """
    Build a new DataFrame in which every dict/list cell of ``df`` is flattened
    into separate columns.

    Args:
        df: pandas DataFrame whose cells may hold dictionaries or lists
            (for example a frame loaded from nested JSON).

    Returns:
        A new pandas DataFrame with one row per input row and a fresh default
        index. Cells that held a dict or list are replaced by the columns that
        ``flatten_json`` derives from them (prefixed with the original column
        name); all other cells are copied unchanged under their original
        column name.

    Notes:
        - Rows are iterated with ``itertuples`` rather than ``iterrows``:
          ``iterrows`` packs each row into a single Series and therefore does
          not preserve dtypes across columns (an integer column next to a
          float column would come back as float).
        - Rows that lack a flattened key get a missing value in that column.
    """
    column_names = list(df.columns)
    flattened_data = []
    for row_values in df.itertuples(index=False, name=None):
        flattened_row = {}
        for column, value in zip(column_names, row_values):
            if isinstance(value, (dict, list)):
                # Flatten any dictionary or list, keyed under the column name
                flattened_row.update(flatten_json({column: value}))
            else:
                # Keep non-nested values unchanged
                flattened_row[column] = value
        flattened_data.append(flattened_row)
    # Build the frame once from the list of records
    return pd.DataFrame(flattened_data)

# Flatten patient_df
#flat_patient_df = process_dataframe(patient_df)



In [24]:
# Expand the nested columns of the raw `patient` frame into flat columns
flat_patient_df = patient.pipe(process_dataframe)

In [21]:
# Number of rows in the flattened frame
flat_patient_df.shape[0]

5000

In [22]:
# Derive readable patient columns from the flattened identifier/name fields.
# NOTE(review): assumes identifier entry 0 is the member number and entry 1 the
# Medicare number for every record (true for the first record) - confirm,
# since some records also carry an identifier_2 entry.
flat_patient_df = flat_patient_df.assign(
    patient_medicare_number=flat_patient_df['identifier_1_value'],
    # strip the minus sign(s) from the member number
    patient_number=flat_patient_df['identifier_0_value'].str.replace(r'[-]', '', regex=True),
    # given names were serialized as a JSON list string by flatten_json;
    # remove the spaces, brackets and quotes around them
    patient_first_name=flat_patient_df['name_0_given'].str.replace(r'[ \[ \]"]', '', regex=True),
    patient_last_name=flat_patient_df['name_0_family'],
)

In [6]:
# List every column of the flattened frame, one per line
for col in flat_patient_df.columns:
    print(col)
# Compare against the raw frame's column count instead of a hard-coded number,
# so the message stays correct if the input file changes
print(f"\noriginal json file had {len(patient.columns)} columns and now the dataset contains {len(flat_patient_df.columns)}")

address_0_state
birthDate
deceasedBoolean
extension_0_url
extension_0_valueCode
extension_1_url
extension_1_valueCoding_code
extension_1_valueCoding_display
extension_1_valueCoding_system
extension_2_extension_0_url
extension_2_extension_0_valueCoding_code
extension_2_extension_0_valueCoding_display
extension_2_extension_0_valueCoding_system
extension_2_extension_1_url
extension_2_extension_1_valueString
extension_2_url
extension_3_url
extension_3_valueDate
gender
id
identifier_0_system
identifier_0_type_coding_0_code
identifier_0_type_coding_0_display
identifier_0_type_coding_0_system
identifier_0_value
identifier_1_system
identifier_1_type_coding_0_code
identifier_1_type_coding_0_display
identifier_1_type_coding_0_extension_0_url
identifier_1_type_coding_0_extension_0_valueCoding_code
identifier_1_type_coding_0_extension_0_valueCoding_display
identifier_1_type_coding_0_extension_0_valueCoding_system
identifier_1_type_coding_0_system
identifier_1_value
meta_lastUpdated
meta_profile
na

## Patient Data

__Columns__

- `address` - Address information of patient
- `birthDate` - Birth date of patient
- `deceasedBoolean` - Boolean flag marking if patient is deceased
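- `extension` - Additional coded patient attributes rather than a patient ID (the flattened columns include a value code, a race coding such as White/Black/Hispanic, and a date value); marked as dropped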
- `gender` - Gender of patient
- `id` - Patient ID
- `identifier` - Includes Patient and Medicare numbers for patient, and indicates active plans
- `meta` - Record metadata: the date the record was last updated (`lastUpdated`) and the record's profile (`profile`)
- `name` - Name of the patient (given and family names)
- `resourceType` - Identifier for data type (Patient for this data)
- `deceasedDateTime` - Date and time at which the patient died (only populated if the patient is deceased; otherwise NaN)

# Print Unflattened Patient Dataframe

In [4]:
#pd.read_json(f"data/Patient.ndjson", lines=True)

# Print Flattened Patient Dataframe

In [15]:
#print(flat_patient_df.to_string())

In [5]:
# Count the distinct non-null codes in the first extension entry
flat_patient_df['extension_0_valueCode'].nunique(dropna=True)

2

In [7]:
# Preview the first five rows of the flattened dataset
flat_patient_df.head(n=5)

Unnamed: 0,address_0_state,birthDate,deceasedBoolean,extension_0_url,extension_0_valueCode,extension_1_url,extension_1_valueCoding_code,extension_1_valueCoding_display,extension_1_valueCoding_system,extension_2_extension_0_url,...,identifier_2_type_coding_0_extension_0_url,identifier_2_type_coding_0_extension_0_valueCoding_code,identifier_2_type_coding_0_extension_0_valueCoding_display,identifier_2_type_coding_0_extension_0_valueCoding_system,identifier_2_type_coding_0_system,identifier_2_value,patient_medicare_number,patient_number,patient_first_name,patient_last_name
0,22,1953-10-12,0.0,http://hl7.org/fhir/us/core/StructureDefinitio...,248152002,https://bluebutton.cms.gov/resources/variables...,1,White,https://bluebutton.cms.gov/resources/variables...,ombCategory,...,,,,,,,1S00E00FR92,10000000009392,Mina319,Mohr916
1,22,1946-03-01,0.0,http://hl7.org/fhir/us/core/StructureDefinitio...,248152002,https://bluebutton.cms.gov/resources/variables...,1,White,https://bluebutton.cms.gov/resources/variables...,ombCategory,...,,,,,,,1S00E00AK52,10000000000852,Leda374,Kshlerin58
2,22,1947-08-31,0.0,http://hl7.org/fhir/us/core/StructureDefinitio...,248153007,https://bluebutton.cms.gov/resources/variables...,2,Black,https://bluebutton.cms.gov/resources/variables...,ombCategory,...,,,,,,,1S00E00FU79,10000000009579,Jeromy156,Morissette863
3,22,1952-01-13,0.0,http://hl7.org/fhir/us/core/StructureDefinitio...,248153007,https://bluebutton.cms.gov/resources/variables...,5,Hispanic,https://bluebutton.cms.gov/resources/variables...,ombCategory,...,,,,,,,1S00E00CY77,10000000003977,Sergio619,Vanegas191
4,22,1943-10-05,0.0,http://hl7.org/fhir/us/core/StructureDefinitio...,248153007,https://bluebutton.cms.gov/resources/variables...,1,White,https://bluebutton.cms.gov/resources/variables...,ombCategory,...,,,,,,,1S00E00AY95,10000000001995,Fernando603,Aufderhar910


In [25]:
# Save the cleaned frame as a pickle.
# Create the output directory first: to_pickle does not create missing
# folders, so a fresh checkout without data/clean would fail here.
clean_dir = "../data/clean"
os.makedirs(clean_dir, exist_ok=True)
# NOTE: pickle files should only ever be loaded from trusted sources.
flat_patient_df.to_pickle(f"{clean_dir}/patient.pkl")