# Cleaning Claim Response Data
# 01_claim_response_cleaning

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 18/09/2025   | Adrienne | Created   | Created to flatten data | 
|    | |   | |

# Content

* [Introduction](#introduction)

# Preprocess JSON

## Claims

__Columns__

- `contained` - birthDate, extension, gender, id, identifier, name, resourceType, id, identifier, resourceType

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
import json_lines
import copy

In [25]:
# Folder holding the raw NDJSON exports, relative to this notebook.
path = "../data/raw"
# Only the first 10 records are read here; remove `nrows` to load the whole file.
claim_response = pd.read_json(
    f"{path}/ClaimResponse.ndjson",
    lines=True,
    nrows=10,
)

The claims file has very complex nesting.  The code below prints out one row of data, so the structure can easily be seen.

In [None]:
# Walk the first record column by column and print its nested structure.
# Distinct loop names are used at every level so an inner loop never rebinds
# the column name / column Series of the outer loop.
for column_name, column in claim_response.head(1).items():
    print(f"key: {column_name}")
    for item in column:
        if isinstance(item, dict):
            # dictionary cell: print the dictionary's own key/value pairs
            # (iterate `item`, not the enclosing Series)
            for key, value in item.items():
                print(f"\tkey: ({key}) value ({value})\n")
        elif isinstance(item, list):
            # list cell: print each element, expanding dictionaries one level
            for element in item:
                if isinstance(element, dict):
                    for key, value in element.items():
                        if isinstance(value, list):
                            for entry in value:
                                print(f"\t\tkey: {key} list: ({entry})\n")
                        else:
                            print(f"\t\tkey: ({key}) value ({value})\n")
                else:
                    print(f"{element}\n")
        else:
            # scalar cell
            print(f"\tvalue: {item}\n")


key: contained
		key: (birthDate) value (1944-05-25)

		key: extension list: ({'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-sex', 'valueCode': '248152002'})

		key: (gender) value (female)

		key: (id) value (patient)

		key: identifier list: ({'system': 'http://hl7.org/fhir/sid/us-mbi', 'type': {'coding': [{'code': 'MC', 'display': "Patient's Medicare Number", 'system': 'http://terminology.hl7.org/CodeSystem/v2-0203'}]}, 'value': '1S00E00JK17'})

		key: name list: ({'family': 'Wiza601', 'given': ['Patrina117'], 'text': 'Patrina117 Wiza601 ([max 10 chars of first], [max 15 chars of last])'})

		key: (resourceType) value (Patient)

key: created
	value: 2025-09-03T22:20:31+00:00

key: extension
		key: (url) value (https://bluebutton.cms.gov/resources/variables/fiss/curr-status)

		key: (valueCoding) value ({'code': 'A', 'system': 'https://bluebutton.cms.gov/resources/variables/fiss/curr-status'})

		key: (url) value (https://bluebutton.cms.gov/resources/variables/fiss/

### Function to Flatten a Column

In [12]:
def flatten_col(df, col, *, index_prefix=False):
    """Unnest one level of a nested column into new, prefixed columns.

    Behaviour depends on what the cells of ``df[col]`` contain:

    * dictionaries - every key becomes a column named ``<col>_<key>`` and the
      original column is dropped.
    * lists - for every list position, if all items found at that position
      are dictionaries they are expanded into ``<col>_<key>`` columns;
      otherwise a message is printed and that position is skipped.  The
      original column is kept.
    * anything else - ``df`` is returned unchanged.

    Lists may differ in length and cells may be missing: rows that have no
    item at a given position receive NaN in the columns created for it.

    Args:
        df: DataFrame containing ``col``.
        col: Name of the column to unnest.
        index_prefix: When True, columns created from a list column are named
            ``<col>_<position>_<key>``.  With the default (False) the position
            is omitted, so several list items sharing a key produce columns
            with the same name.

    Returns:
        A DataFrame with the new columns appended.  Progress information is
        printed as a side effect.
    """
    values = df[col]
    kinds = values.apply(type)

    # if the column contains a dictionary, create columns from the key-value pairs
    if kinds.eq(dict).any():
        print(f"dict {col}")
        temp = values.apply(pd.Series).add_prefix(f"{col}_")
        print(temp.columns)
        # original column is dropped
        return pd.concat([df, temp], axis=1).drop(col, axis=1)

    is_list = kinds.eq(list)
    if not is_list.any():
        return df

    # if the column is a list, then the list needs to be broken into columns
    print(f"list {col}")
    list_values = values[is_list]
    lengths = list_values.apply(len)
    # iterate up to the longest list so no item is skipped when lengths differ
    for i in range(int(lengths.max())):
        # only rows whose list actually has an item at position i
        items = list_values[lengths > i].apply(lambda x, i=i: x[i])

        if items.apply(type).eq(dict).all():
            prefix = f"{col}_{i}_" if index_prefix else f"{col}_"
            temp = items.apply(pd.Series).add_prefix(prefix)
            if len(items) < len(df):
                # rows without an item at this position get NaN
                temp = temp.reindex(df.index)
            print(temp.columns)
            df = pd.concat([df, temp], axis=1)
        else:
            print('list contains mixed elements.  leaving alone for now')
    return df
            

The code below unnests two levels using the function above; specific code is then written to unnest other columns.  Some columns have not been fully unnested because they do not contain data we are interested in using, so we skip over those columns/fields. 

In [26]:
# Run every top-level column through flatten_col, in the original order.
columns_to_flatten = (
    'contained',
    'created',
    'extension',
    'id',
    'identifier',
    'insurer',
    'meta',
    'outcome',
    'patient',
    'request',
    'resourceType',
    'status',
    'type',
    'use',
)
for column_to_flatten in columns_to_flatten:
    claim_response = flatten_col(claim_response, column_to_flatten)

list contained
Index(['contained_birthDate', 'contained_extension', 'contained_gender',
       'contained_id', 'contained_identifier', 'contained_name',
       'contained_resourceType'],
      dtype='object')
list extension
Index(['extension_url', 'extension_valueCoding'], dtype='object')
Index(['extension_url', 'extension_valueDate'], dtype='object')
Index(['extension_url', 'extension_valueDate'], dtype='object')
list identifier
Index(['identifier_system', 'identifier_type', 'identifier_value'], dtype='object')
dict insurer
Index(['insurer_identifier'], dtype='object')
dict meta
Index(['meta_lastUpdated'], dtype='object')
dict patient
Index(['patient_reference'], dtype='object')
dict request
Index(['request_reference'], dtype='object')
dict type
Index(['type_coding'], dtype='object')


### Specific Column Processing

In [27]:
# Pull scalar values out of the still-nested patient columns.
# Medicare number: 'value' of the first entry in each identifier list.
identifier_lists = pd.DataFrame(claim_response['contained_identifier']).iloc[:, 0]
claim_response['contained_identifer_patient_medicare_number'] = identifier_lists.apply(
    lambda identifiers: identifiers[0]['value']
)

# Name: family name and first given name of the first name entry
# (the text version of the name would need some extra processing).
name_lists = claim_response['contained_name']
claim_response['contained_name_family'] = name_lists.apply(
    lambda names: names[0]['family']
)
claim_response['contained_name_given'] = name_lists.apply(
    lambda names: names[0]['given'][0]
)

# TODO item_revenue supportingInfo

In [28]:
# Report the column names after unnesting and how many there are now.
unnested_columns = list(claim_response.columns)
print(f" list of columns in unnested dataset {unnested_columns}")
print(f" original json file had 14 columns and now the dataset contains {len(unnested_columns)}")

 list of columns in unnested dataset ['contained', 'created', 'extension', 'id', 'identifier', 'outcome', 'resourceType', 'status', 'use', 'contained_birthDate', 'contained_extension', 'contained_gender', 'contained_id', 'contained_identifier', 'contained_name', 'contained_resourceType', 'extension_url', 'extension_valueCoding', 'extension_url', 'extension_valueDate', 'extension_url', 'extension_valueDate', 'identifier_system', 'identifier_type', 'identifier_value', 'insurer_identifier', 'meta_lastUpdated', 'patient_reference', 'request_reference', 'type_coding', 'contained_identifer_patient_medicare_number', 'contained_name_family', 'contained_name_given']
 original json file had 14 columns and now the dataset contains 33


In [29]:
# Preview the first rows of the flattened dataset (shown as the cell's output)
claim_response.head()

Unnamed: 0,contained,created,extension,id,identifier,outcome,resourceType,status,use,contained_birthDate,...,identifier_type,identifier_value,insurer_identifier,meta_lastUpdated,patient_reference,request_reference,type_coding,contained_identifer_patient_medicare_number,contained_name_family,contained_name_given
0,"[{'birthDate': '1944-05-25', 'extension': [{'u...",2025-09-03T22:20:31+00:00,[{'url': 'https://bluebutton.cms.gov/resources...,f-LTEwMDAwMDAzNTUxNzU5,[{'system': 'https://bluebutton.cms.gov/resour...,queued,ClaimResponse,active,claim,1944-05-25,...,"{'coding': [{'code': 'uc', 'display': 'Unique ...",-100125087,{'value': 'CMS'},2023-05-11T21:17:37.364+00:00,#patient,Claim/f-LTEwMDAwMDAzNTUxNzU5,"[{'code': 'institutional', 'display': 'Institu...",1S00E00JK17,Wiza601,Patrina117
1,"[{'birthDate': '1944-05-25', 'extension': [{'u...",2025-09-03T22:20:31+00:00,[{'url': 'https://bluebutton.cms.gov/resources...,f-LTEwMDAwMDAzNTUxNzY0,[{'system': 'https://bluebutton.cms.gov/resour...,queued,ClaimResponse,active,claim,1944-05-25,...,"{'coding': [{'code': 'uc', 'display': 'Unique ...",-100125090,{'value': 'CMS'},2023-05-11T21:17:36.876+00:00,#patient,Claim/f-LTEwMDAwMDAzNTUxNzY0,"[{'code': 'institutional', 'display': 'Institu...",1S00E00JK17,Wiza601,Patrina117
2,"[{'birthDate': '1944-05-25', 'extension': [{'u...",2025-09-03T22:20:31+00:00,[{'url': 'https://bluebutton.cms.gov/resources...,f-LTEwMDAwMDAzNTUxNzY4,[{'system': 'https://bluebutton.cms.gov/resour...,queued,ClaimResponse,active,claim,1944-05-25,...,"{'coding': [{'code': 'uc', 'display': 'Unique ...",-100125092,{'value': 'CMS'},2023-05-11T21:17:37.098+00:00,#patient,Claim/f-LTEwMDAwMDAzNTUxNzY4,"[{'code': 'institutional', 'display': 'Institu...",1S00E00JK17,Wiza601,Patrina117
3,"[{'birthDate': '1944-05-25', 'extension': [{'u...",2025-09-03T22:20:31+00:00,[{'url': 'https://bluebutton.cms.gov/resources...,f-LTEwMDAwMDAzNTUxNzc2,[{'system': 'https://bluebutton.cms.gov/resour...,queued,ClaimResponse,active,claim,1944-05-25,...,"{'coding': [{'code': 'uc', 'display': 'Unique ...",-100125096,{'value': 'CMS'},2023-05-11T21:17:37.145+00:00,#patient,Claim/f-LTEwMDAwMDAzNTUxNzc2,"[{'code': 'institutional', 'display': 'Institu...",1S00E00JK17,Wiza601,Patrina117
4,"[{'birthDate': '1944-05-25', 'extension': [{'u...",2025-09-03T22:20:31+00:00,[{'url': 'https://bluebutton.cms.gov/resources...,f-LTEwMDAwMDAzNTUxNzc4,[{'system': 'https://bluebutton.cms.gov/resour...,queued,ClaimResponse,active,claim,1944-05-25,...,"{'coding': [{'code': 'uc', 'display': 'Unique ...",-100125098,{'value': 'CMS'},2023-05-11T21:17:37.099+00:00,#patient,Claim/f-LTEwMDAwMDAzNTUxNzc4,"[{'code': 'institutional', 'display': 'Institu...",1S00E00JK17,Wiza601,Patrina117


In [30]:
# Persist the flattened frame as a pickle for later steps.
# NOTE(review): the frame was read with nrows=10, so this file holds only
# that sample - confirm before relying on it downstream.
clean_pickle_path = "../data/clean/claim_response.pkl"
claim_response.to_pickle(clean_pickle_path)