# Cleaning Coverage and EOB Data

# 01_coverage_eob_data_cleaning 

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 01/09/2025   | Martin | Created   | Created to perform alternative preprocessing and data understanding | 
| 03/09/2025   | Martin | New   | Completed cleaning for coverage branch |
| 04/09/2025   | Martin | New   | Started processing EOB. Missing transformation for some columns |
| 05/09/2025   | Martin | Updated   | Completed EOB processing. PR made to main. Added processed data to Gdrive |
| 24/09/2025   | Martin | Updated   | Additional processing for `supportingInfo` column on EOB |
| 26/09/2025   | Martin | Updated   | Reran for 2025-09-22 data. Compared between both files |
| 13/10/2025   | Martin | Updated   | Code cleanup |

# Content

* [Introduction](#introduction)
* [Preprocess JSON](#preprocess-json)
  * [Coverage](#coverage)
  * [Explanation of Benefits](#explanation-of-benefits)

# Introduction

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime

# Preprocess JSON

## Coverage

__Columns__

- `beneficiary` - References the Patient ID
- `class` - Medical coverage type containing group and plan info
- `extension` - Additional details from CMS Blue Button (dropped)
- `id` - Coverage record ID (e.g. `part-a--10000000008492`); overwritten with the integer Patient ID parsed from `beneficiary`
- `meta` - Date when the record was last updated
- `payor` - Issuer of the policy
- `relationship` - Beneficiary relationship to the subscriber ([refer here](https://hl7.org/fhir/R4/valueset-subscriber-relationship.html))
- `resourceType` - Identifier for data type (Coverage)
- `status` - Current status of the coverage (active | cancelled | draft | entered-in-error)
- `subscriberId` - ID assigned to the subscriber
- `type` - A code specifying the particular kind of Act that the Act-instance represents within its class ([refer here](https://terminology.hl7.org/6.5.0/ValueSet-v3-ActCode.html))

In [3]:
# Relative path of the directory that holds the raw ndjson exports
path = "../data/raw"
# lines=True: read the file as one JSON object per line (ndjson)
coverage = pd.read_json(f"{path}/Coverage.ndjson", lines=True)
# coverage_old = pd.read_json(f"../data/raw/Coverage.ndjson", lines=True)

In [4]:
# Processing functions
def process_beneficiary(item):
  """Return the patient id of a `beneficiary` cell as an integer.

  The `Patient/-` prefix is stripped from `item['reference']` and the
  remainder is converted with `int`.
  """
  reference = item['reference']
  patient_number = reference.replace('Patient/-', '')
  return int(patient_number)

def process_class(item):
  """Pull the group and plan values out of a `class` cell.

  The first entry of `item` supplies `coverageGroup`, the second supplies
  `coveragePlan`.
  """
  group_value = item[0]['value']
  plan_value = item[1]['value']
  return dict(coverageGroup=group_value, coveragePlan=plan_value)

In [5]:
# ========== Processing Coverage ==========
def coverage_preprocessing(coverage):
  """Flatten the nested columns of the raw Coverage frame.

  Args:
    coverage: DataFrame holding the raw Coverage records. It must contain the
      columns `beneficiary`, `class`, `extension`, `meta`, `payor`,
      `relationship` and `type`.

  Returns:
    A new DataFrame; the frame passed in is not modified. Compared with the input:
      - `id` is overwritten with the integer patient id parsed from `beneficiary`
      - `class` is replaced by `coverageGroup` and `coveragePlan`
      - `extension` is dropped
      - `meta` is replaced by `lastUpdated` (timezone-aware datetime)
      - `payor` and `relationship` are each reduced to a single value
      - `type` is replaced by `actCode`
  """
  # Copy first: the column assignment below would otherwise write into the caller's frame
  coverage = coverage.copy()

  # beneficiary
  coverage['id'] = coverage['beneficiary'].apply(process_beneficiary)

  # class
  # Reuse the frame's own index so the concat lines rows up even when the
  # index is not the default 0..n-1 range
  pclass = pd.DataFrame(
    [process_class(entry) for entry in coverage['class']],
    index=coverage.index
  )
  coverage = pd.concat([coverage, pclass], axis=1)
  coverage = coverage.drop('class', axis=1)

  # extension
  coverage = coverage.drop('extension', axis=1)

  # id is kept on purpose: it now holds the patient id written above

  # meta
  coverage['lastUpdated'] = coverage['meta'].apply(lambda x: datetime.strptime(x['lastUpdated'], '%Y-%m-%dT%H:%M:%S.%f%z'))
  coverage = coverage.drop('meta', axis=1)

  # Payor
  coverage['payor'] = coverage['payor'].apply(lambda x: x[0]['identifier']['value'])

  # Relationship
  coverage['relationship'] = coverage['relationship'].apply(lambda x: x['coding'][0]['code'])

  # Type
  coverage['actCode'] = coverage['type'].apply(lambda x: x['coding'][0]['code'])
  coverage = coverage.drop('type', axis=1)
  return coverage

In [6]:
# Preview the raw Coverage records before coverage_preprocessing is applied
coverage.head()

Unnamed: 0,beneficiary,class,extension,id,meta,payor,relationship,resourceType,status,subscriberId,type
0,{'reference': 'Patient/-10000000008492'},"[{'type': {'coding': [{'code': 'group', 'displ...",[{'url': 'https://bluebutton.cms.gov/resources...,part-a--10000000008492,{'lastUpdated': '2021-08-17T17:43:01.432+00:00...,[{'identifier': {'value': 'Centers for Medicar...,"{'coding': [{'code': 'self', 'display': 'Self'...",Coverage,active,1S00E00FF92,"{'coding': [{'code': 'SUBSIDIZ', 'system': 'ht..."
1,{'reference': 'Patient/-10000000008492'},"[{'type': {'coding': [{'code': 'group', 'displ...",[{'url': 'https://bluebutton.cms.gov/resources...,part-b--10000000008492,{'lastUpdated': '2021-08-17T17:43:01.432+00:00...,[{'identifier': {'value': 'Centers for Medicar...,"{'coding': [{'code': 'self', 'display': 'Self'...",Coverage,active,1S00E00FF92,"{'coding': [{'code': 'SUBSIDIZ', 'system': 'ht..."
2,{'reference': 'Patient/-10000000008492'},"[{'type': {'coding': [{'code': 'group', 'displ...",[{'url': 'https://bluebutton.cms.gov/resources...,part-c--10000000008492,{'lastUpdated': '2021-08-17T17:43:01.432+00:00...,[{'identifier': {'value': 'Centers for Medicar...,"{'coding': [{'code': 'self', 'display': 'Self'...",Coverage,active,1S00E00FF92,"{'coding': [{'code': 'SUBSIDIZ', 'system': 'ht..."
3,{'reference': 'Patient/-10000000008492'},"[{'type': {'coding': [{'code': 'group', 'displ...",[{'url': 'https://bluebutton.cms.gov/resources...,part-d--10000000008492,{'lastUpdated': '2021-08-17T17:43:01.432+00:00...,[{'identifier': {'value': 'Centers for Medicar...,"{'coding': [{'code': 'self', 'display': 'Self'...",Coverage,active,1S00E00FF92,"{'coding': [{'code': 'SUBSIDIZ', 'system': 'ht..."
4,{'reference': 'Patient/-10000000009461'},"[{'type': {'coding': [{'code': 'group', 'displ...",[{'url': 'https://bluebutton.cms.gov/resources...,part-a--10000000009461,{'lastUpdated': '2021-08-17T17:43:02.557+00:00...,[{'identifier': {'value': 'Centers for Medicar...,"{'coding': [{'code': 'self', 'display': 'Self'...",Coverage,active,1S00E00FT61,"{'coding': [{'code': 'SUBSIDIZ', 'system': 'ht..."


In [7]:
# NOTE: the result is bound back to `coverage`. Re-running this cell without
# re-running the load cell fails, because the raw `class` column is gone after one pass.
coverage = coverage_preprocessing(coverage)
# coverage_old = coverage_preprocessing(coverage_old)

In [None]:
# # Compare the set of IDs between old and new coverage files
# print(f"Number of unique Patient IDs in old coverage: {len(coverage_old['id'].unique())}")
# print(f"Number of unique Patient IDs in 2025-09-22 coverage: {len(coverage['id'].unique())}")
# print()
# print(f"Overlapping patient IDs: {len(set(coverage['id']).intersection(set(coverage_old['id'])))}")

Number of unique Patient IDs in old coverage: 2000
Number of unique Patient IDs in 2025-09-22 coverage: 2000

Overlapping patient IDs: 0


## Explanation of Benefits

Columns marked with ❓ are left in their original format

- `benefitBalance` - Series of benefits included in the insurance coverage and the amount covered
  - [category](https://www.hl7.org/fhir/2022Sep/valueset-ex-benefitcategory.html)
- `billablePeriod` - Start and end date of the billable period
  - Type of claim ([Refer Here](https://bluebutton.cms.gov/resources/variables/claim_query_cd/))
- ❓`careTeam` - Members of care team and their identifiers
- `contained` - Information on the provider's PRN and NPI codes
- `created` - Date the entry was created on
- ❓`diagnosis` - Pertinent diagnosis information (My assumption is that these are the illnesses that are covered under the policy that they have)
- ❓`extension` - Additional information
- `facility` - Serving facility, code + name
- `id` - EOB id. Split into claim subtype and the id
- `identifier` -  Claim IDs
- `insurance` - Contains details about the insured amount. Split into coveragePart and coverageId
- `insurer` - ID of the issuer
- ❓`item` - Product or service provided
- `meta` - Date when item was last updated
- `patient` - Connection by Patient ID
- `payment` - Payment made by the user
- `provider` - ID of provider
- `total` - Amount reimbursed from policy
- `type` - Claim and EOB types
  - [claim types](https://www.hl7.org/fhir/2022Sep/valueset-claim-type.html)
  - [EOB types](https://bluebutton.cms.gov/resources/codesystem/eob-type/)

In [8]:
# Relative path of the raw-data directory (same value as in the Coverage section)
path = "../data/raw"
# lines=True: read the file as one JSON object per line (ndjson)
eob = pd.read_json(f"{path}/ExplanationOfBenefit.ndjson", lines=True)
# eob_old = pd.read_json("../data/raw/ExplanationOfBenefit.ndjson", lines=True)

In [9]:
# Preview the raw ExplanationOfBenefit records before eob_preprocessing is applied
eob.head()

Unnamed: 0,benefitBalance,billablePeriod,careTeam,contained,created,diagnosis,extension,facility,id,identifier,...,provider,resourceType,status,subType,supportingInfo,total,type,use,disposition,procedure
0,"[{'category': {'coding': [{'code': '1', 'displ...","{'end': '2011-08-07', 'extension': [{'url': 'h...",[{'provider': {'identifier': {'type': {'coding...,"[{'active': True, 'id': 'provider-org', 'ident...",2025-09-03T22:07:58+00:00,[{'diagnosisCodeableConcept': {'coding': [{'co...,[{'url': 'https://bluebutton.cms.gov/resources...,{'extension': [{'url': 'https://bluebutton.cms...,inpatient--10000002646806,[{'system': 'https://bluebutton.cms.gov/resour...,...,{'reference': '#provider-org'},ExplanationOfBenefit,active,"{'coding': [{'code': 'inpatient', 'system': 'h...",[{'category': {'coding': [{'code': 'admissionp...,"[{'amount': {'currency': 'USD', 'value': 129.1...","{'coding': [{'code': '60', 'display': 'Inpatie...",claim,,
1,"[{'category': {'coding': [{'code': '1', 'displ...","{'end': '2020-12-05', 'extension': [{'url': 'h...",[{'provider': {'identifier': {'type': {'coding...,"[{'active': True, 'id': 'provider-org', 'ident...",2025-09-03T22:07:58+00:00,[{'diagnosisCodeableConcept': {'coding': [{'co...,[{'url': 'https://bluebutton.cms.gov/resources...,{'extension': [{'url': 'https://bluebutton.cms...,inpatient--10000002646833,[{'system': 'https://bluebutton.cms.gov/resour...,...,{'reference': '#provider-org'},ExplanationOfBenefit,active,"{'coding': [{'code': 'inpatient', 'system': 'h...",[{'category': {'coding': [{'code': 'admissionp...,"[{'amount': {'currency': 'USD', 'value': 134.4...","{'coding': [{'code': '60', 'display': 'Inpatie...",claim,,
2,"[{'category': {'coding': [{'code': '1', 'displ...","{'end': '1973-09-23', 'extension': [{'url': 'h...",[{'provider': {'identifier': {'type': {'coding...,"[{'active': True, 'id': 'provider-org', 'ident...",2025-09-03T22:07:58+00:00,,[{'url': 'https://bluebutton.cms.gov/resources...,{'extension': [{'url': 'https://bluebutton.cms...,outpatient--10000002646839,[{'system': 'https://bluebutton.cms.gov/resour...,...,{'reference': '#provider-org'},ExplanationOfBenefit,active,"{'coding': [{'code': 'outpatient', 'system': '...",[{'category': {'coding': [{'code': 'clmrecvdda...,"[{'amount': {'currency': 'USD', 'value': 218.0...","{'coding': [{'code': '40', 'display': 'Hospita...",claim,,
3,"[{'category': {'coding': [{'code': '1', 'displ...","{'end': '1978-10-08', 'extension': [{'url': 'h...",[{'provider': {'identifier': {'type': {'coding...,"[{'active': True, 'id': 'provider-org', 'ident...",2025-09-03T22:07:58+00:00,,[{'url': 'https://bluebutton.cms.gov/resources...,{'extension': [{'url': 'https://bluebutton.cms...,outpatient--10000002646843,[{'system': 'https://bluebutton.cms.gov/resour...,...,{'reference': '#provider-org'},ExplanationOfBenefit,active,"{'coding': [{'code': 'outpatient', 'system': '...",[{'category': {'coding': [{'code': 'clmrecvdda...,"[{'amount': {'currency': 'USD', 'value': 218.0...","{'coding': [{'code': '40', 'display': 'Hospita...",claim,,
4,"[{'category': {'coding': [{'code': '1', 'displ...","{'end': '1992-04-26', 'extension': [{'url': 'h...",[{'provider': {'identifier': {'type': {'coding...,"[{'active': True, 'id': 'provider-org', 'ident...",2025-09-03T22:07:58+00:00,[{'diagnosisCodeableConcept': {'coding': [{'co...,[{'url': 'https://bluebutton.cms.gov/resources...,{'extension': [{'url': 'https://bluebutton.cms...,outpatient--10000002646848,[{'system': 'https://bluebutton.cms.gov/resour...,...,{'reference': '#provider-org'},ExplanationOfBenefit,active,"{'coding': [{'code': 'outpatient', 'system': '...",[{'category': {'coding': [{'code': 'clmrecvdda...,"[{'amount': {'currency': 'USD', 'value': 218.0...","{'coding': [{'code': '40', 'display': 'Hospita...",claim,,


The functions below process individual columns. Most take the raw value of a single cell (a dictionary or list) and return the extracted fields; `process_supportingInfo` takes the whole column.

In [11]:
# Processing Functions
def process_benefitBalance(item):
  """Extract the category and financial lines of a `benefitBalance` cell.

  Only the first entry of `item` is read.

  Args:
    item: list of benefit-balance dicts, or a missing value (NaN / None).

  Returns:
    dict with
      - `benefitCategory`: code of the first category coding, NaN when missing
      - `benefitFinancials`: list with one element per financial line, each a
        dict (`type`, `usedMoneyCurrency`, `usedMoneyAmount`) or NaN when the
        line lacks one of those fields; NaN when the cell itself is missing
  """
  bb = {
    "benefitCategory": np.nan,
    "benefitFinancials": np.nan
  }
  # Missing cells show up as NaN (a float) or None; an empty list has nothing to read
  if item is None or isinstance(item, float) or len(item) == 0:
    return bb

  first = item[0]
  bb['benefitCategory'] = first['category']['coding'][0]['code']
  financials = []
  for fin in first['financial']:
    try:
      financials.append({
        'type': fin['type']['coding'][0]['display'],
        'usedMoneyCurrency': fin['usedMoney']['currency'],
        'usedMoneyAmount': fin['usedMoney']['value']
      })
    except (KeyError, IndexError, TypeError):
      # Best effort: a line without the expected fields is kept as a NaN
      # placeholder so positions still match the source list
      financials.append(np.nan)
  bb['benefitFinancials'] = financials
  return bb

def process_billablePeriod(item):
  """Split a `billablePeriod` cell into start date, end date and claim type.

  `ClaimType` is the code of the first extension's `valueCoding` when the cell
  has an `extension` key, and NaN otherwise.
  """
  has_extension = "extension" in item.keys()
  period = {
    'billablePeriodStart': item['start'],
    'billablePeriodEnd': item['end'],
  }
  if has_extension:
    period['ClaimType'] = item['extension'][0]['valueCoding']['code']
  else:
    period['ClaimType'] = np.nan
  return period

def process_contained(item):
  """Extract the active flag and identifier values of a `contained` cell.

  Only the first entry of `item` is read.

  Args:
    item: list of contained-resource dicts, or a missing value (NaN / None).

  Returns:
    dict with `active`, `PRNCode` (value of the first identifier) and `NPICode`
    (value of the second identifier, filled only when there are exactly two
    identifiers and the second one has a readable `value`). All three are NaN
    when the cell is missing or empty.
  """
  contained = {
    "active": np.nan,
    "PRNCode": np.nan,
    "NPICode": np.nan
  }
  # Explicit scalar checks: pd.isnull() on a list returns an array, and using
  # that array in an `if` raises for lists with more than one element
  if item is None or isinstance(item, float) or len(item) == 0:
    return contained

  first = item[0]
  identifiers = first['identifier']
  contained["active"] = first['active']
  contained["PRNCode"] = identifiers[0]['value']
  if len(identifiers) == 2:
    try:
      contained["NPICode"] = identifiers[1]['value']
    except (KeyError, TypeError):
      # Second identifier present but without a usable value: leave NPICode as NaN
      pass
  return contained

def process_facility(item):
  """Return the facility code and display name of a `facility` cell.

  Both values come from the `valueCoding` of the first extension. A float
  cell (NaN) yields NaN for both fields. The key spelling `faciltyType` is
  kept as-is because it is part of the returned structure.
  """
  if isinstance(item, float):
    return {'facilityId': np.nan, 'faciltyType': np.nan}

  facility_id = item['extension'][0]['valueCoding']['code']
  facility_type = item['extension'][0]['valueCoding']['display']
  return {'facilityId': facility_id, 'faciltyType': facility_type}

def process_identifier(item):
  """Return the values of the first two identifiers of an `identifier` cell.

  A float cell (NaN) yields NaN for both `claimId_1` and `claimId_2`.
  """
  if isinstance(item, float):
    return {'claimId_1': np.nan, 'claimId_2': np.nan}

  first_value = item[0]['value']
  second_value = item[1]['value']
  return {'claimId_1': first_value, 'claimId_2': second_value}


def process_insurance(item):
  """Split the coverage reference of an `insurance` cell into part and id.

  The reference of the first entry is split on `-`: the second token is the
  part letter and the last token is the coverage id.
  (Presumably the reference looks like `Coverage/part-a--<id>`, mirroring the
  Coverage ids - confirm against the data.)

  Args:
    item: list whose first entry has `['coverage']['reference']`.

  Returns:
    Tuple `(part, cov_id)`, e.g. `('Part A', '10000000008492')`.

  Raises:
    KeyError: when the part letter is not one of a, b, c, d.
  """
  # Part C is included because the Coverage data also carries part-c records;
  # without it such a reference would abort the whole preprocessing run
  mapper = {
    'a': 'Part A',
    'b': 'Part B',
    'c': 'Part C',
    'd': 'Part D'
  }

  s = item[0]['coverage']['reference'].split('-')
  part = mapper[s[1]]
  cov_id = s[-1]
  return part, cov_id

def process_payment(item):
  """Extract currency, amount and date from a `payment` cell.

  Args:
    item: payment dict, or a missing value (NaN / None).

  Returns:
    dict with `paymentCurrency`, `paymentAmount` and `paymentDate`; each field
    is NaN when the cell does not provide it.
  """
  payment = {
    "paymentCurrency": np.nan,
    "paymentAmount": np.nan,
    "paymentDate": np.nan
  }
  # Missing cells (NaN / None) carry no payment information
  if not isinstance(item, dict):
    return payment

  amount = item.get('amount')
  if amount is not None:
    # No trailing comma here: it previously turned the currency into a 1-tuple
    payment["paymentCurrency"] = amount['currency']
    payment["paymentAmount"] = amount['value']
  # The date is read independently so it is kept when an amount is also present
  if 'date' in item:
    payment["paymentDate"] = item['date']

  return payment
  
def process_provider(item):
  """Return the provider identifier value, or NaN when the cell has no `identifier` key."""
  if "identifier" not in item.keys():
    return np.nan
  identifier = item['identifier']
  return identifier['value']

def process_total(item):
  """Summarise a `total` cell as charge type, currency and amount.

  Currency and amount are taken from the first entry. The charge type is
  'Claim Total Charge Amount' only when the cell has exactly one entry whose
  category does not have exactly one coding; every other case is 'Drug Cost'.
  """
  first_amount = item[0]['amount']
  currency = first_amount['currency']
  value = first_amount['value']

  # The category is inspected only for single-entry cells (short-circuit `and`)
  is_claim_total = len(item) == 1 and len(item[0]['category']['coding']) != 1
  charge_type = 'Claim Total Charge Amount' if is_claim_total else 'Drug Cost'

  return {
    "totalChargeType": charge_type,
    "totalChargeCurrency": currency,
    "totalChargeAmount": value
  }

def process_type(item):
  """Return the claim type and EOB type codes of a `type` cell.

  The last coding supplies `claimType`; the second-to-last supplies `eobType`.
  """
  claim_code = item['coding'][-1]['code']
  eob_code = item['coding'][-2]['code']
  return dict(claimType=claim_code, eobType=eob_code)

def process_supportingInfo(col):
  """Expand the `supportingInfo` column into one column per component.

  Args:
    col: iterable of cells (normally the `supportingInfo` Series). Each cell is
      a list of component dicts or a missing value (NaN / None).

  Returns:
    DataFrame with one row per cell of `col` and columns `supportingInfo_0` ..
    `supportingInfo_{k-1}`, where k is the length of the longest component
    list. A component becomes `{display: value}` when it carries a
    `valueQuantity` or a `code`, and the bare `display` string otherwise.
    Shorter rows and missing cells are padded with nulls. When `col` is a
    Series, its index is reused for the result.
  """
  rows = []
  for entry in col:
    components = []
    # A missing cell still produces an (empty) row; skipping it would shift
    # every later row up when the result is joined back to the source frame
    if isinstance(entry, (list, tuple)):
      for comp in entry:
        # The last coding of the category supplies the label
        display = comp['category']['coding'][-1]['display']
        if 'valueQuantity' in comp:
          component = {display: comp['valueQuantity']['value']}
        elif 'code' in comp:
          try:
            component = {display: comp['code']['coding'][0]['code']}
          except (KeyError, IndexError, TypeError):
            # Unreadable code: keep the label and record the value as missing
            # rather than re-using whatever the previous component produced
            component = {display: np.nan}
        else:
          component = display
        components.append(component)
    rows.append(components)

  # Make into dataframe to join
  index = col.index if isinstance(col, pd.Series) else None
  info = pd.DataFrame(rows, index=index)
  info.columns = [f"supportingInfo_{i}" for i in range(len(info.columns))]

  return info

In [10]:
# ========== Processing Explanation of Benefits ==========
def eob_preprocessing(eob):
  """Flatten the nested columns of the raw ExplanationOfBenefit frame.

  Args:
    eob: DataFrame holding the raw ExplanationOfBenefit records.

  Returns:
    A new DataFrame. Compared with the input:
      - `benefitBalance`, `billablePeriod`, `identifier`, `total` and `type` are
        replaced by the fields their `process_*` helper extracts
      - `id` keeps only the text after its last `-`; `subType` is overwritten
        with the text before its first `-`
      - `coveragePart` / `coverageId` are derived from `insurance`
      - `insurer` and `patient` are reduced to a single value each
      - `lastUpdated` is parsed from `meta` (timezone-aware datetime)
      - `provider` is replaced by `providerId`
      - `supportingInfo_<i>` columns are appended
    `careTeam`, `contained`, `diagnosis`, `extension`, `facility`, `item` and
    `payment` are not touched; `insurance`, `meta` and `supportingInfo` stay in
    the frame next to their derived columns.
  """
  def _expand(frame, column, func):
    # Apply `func` to every cell of `column`, spread the returned dict keys into
    # new columns (aligned on the frame's own index) and drop the raw column.
    expanded = pd.DataFrame(frame[column].apply(func).tolist(), index=frame.index)
    return pd.concat([frame.drop(columns=column), expanded], axis=1)

  # benefitBalance
  eob = _expand(eob, 'benefitBalance', process_benefitBalance)

  # billablePeriod
  eob = _expand(eob, 'billablePeriod', process_billablePeriod)

  # careTeam, contained, diagnosis, extension, facility: kept in their raw form

  # id
  id_parts = eob['id'].str.split('-')
  eob['subType'] = id_parts.str[0]
  eob['id'] = id_parts.str[-1]

  # identifier
  eob = _expand(eob, 'identifier', process_identifier)

  # insurance
  # process_insurance returns a (part, id) tuple per cell; a column-wise apply
  # avoids building a row object for every record
  insurance = pd.DataFrame(
    eob['insurance'].apply(process_insurance).tolist(),
    columns=['coveragePart', 'coverageId'],
    index=eob.index
  )
  eob = pd.concat([eob, insurance], axis=1)

  # insurer
  eob['insurer'] = eob['insurer'].apply(lambda x: x['identifier']['value'])

  # item: kept in its raw form

  # meta
  eob['lastUpdated'] = eob['meta'].apply(lambda x: datetime.strptime(x['lastUpdated'], '%Y-%m-%dT%H:%M:%S.%f%z'))

  # patient
  eob['patient'] = eob['patient'].apply(lambda x: x['reference'].split('/')[-1])

  # payment: kept in its raw form

  # provider
  eob['providerId'] = eob['provider'].apply(process_provider)
  eob = eob.drop(columns='provider')

  # supportingInfo
  supporting_info = process_supportingInfo(eob['supportingInfo'])
  eob = pd.concat([eob, supporting_info], axis=1)

  # total
  eob = _expand(eob, 'total', process_total)

  # type
  eob = _expand(eob, 'type', process_type)

  return eob

In [12]:
# NOTE: the result is bound back to `eob`. Re-running this cell without
# re-running the load cell fails, because raw columns such as `benefitBalance`
# are gone after one pass.
eob = eob_preprocessing(eob)
# eob_old = eob_preprocessing(eob_old)

In [None]:
# # Compare the set of IDs between old and new eob files
# print(f"Number of unique Patient IDs in old eob: {len(eob_old['patient'].unique())}")
# print(f"Number of unique Patient IDs in 2025-09-22 eob: {len(eob['patient'].unique())}")
# print()
# print(f"Overlapping patient IDs: {len(set(eob['patient']).intersection(set(eob_old['patient'])))}")

Number of unique Patient IDs in old eob: 50
Number of unique Patient IDs in 2025-09-22 eob: 50

Overlapping patient IDs: 0


In [None]:
# eob.to_pickle("../data/clean/2025-09-22/updated_eob.pkl")

In [13]:
%watermark

Last updated: 2025-09-05T22:30:42.787367+08:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 9.5.0

Compiler    : MSC v.1938 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 183 Stepping 1, GenuineIntel
CPU cores   : 20
Architecture: 64bit

