In [260]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
import json_lines

In [261]:
# Load the cleaned data files produced earlier in the pipeline.
# NOTE(review): read_pickle should only be pointed at pickles this project
# created itself; unpickling untrusted files can execute arbitrary code.
path = "../data/clean/"
explanation_of_benefit_df = pd.read_pickle(f"{path}explanation_of_benefit_clean.pkl")
coverage_df = pd.read_pickle(f"{path}coverage.pkl")

In [None]:
# The EOB data does not seem to carry the patient's Medicare number; it only
# has a patient number. List the available columns to confirm.
print(explanation_of_benefit_df.columns)
# The patient number does not appear to be unique per row: summarise the
# column to compare the row count with the number of distinct values.
explanation_of_benefit_df['patient'].describe()

Index(['careTeam', 'created', 'diagnosis', 'extension', 'id', 'insurance',
       'insurer', 'item', 'meta', 'outcome', 'patient', 'resourceType',
       'status', 'subType', 'supportingInfo', 'use', 'disposition',
       'procedure', 'benefitCategory', 'benefitFinancials',
       'billablePeriodStart', 'billablePeriodEnd', 'ClaimType', 'active',
       'PRNCode', 'NPICode', 'facilityId', 'faciltyType', 'claimId_1',
       'claimId_2', 'coveragePart', 'coverageId', 'lastUpdated',
       'paymentCurrency', 'paymentAmount', 'paymentDate', 'providerId',
       'totalChargeType', 'totalChargeCurrency', 'totalChargeAmount',
       'claimType', 'eobType'],
      dtype='object')


count                4091
unique                 50
top       -10000000003689
freq                  284
Name: patient, dtype: object

In [None]:
# Read in the raw NDJSON data files (lines=True: one JSON record per line).
# `path` already ends with a slash, so file names are appended directly;
# adding another "/" would produce a doubled separator ("../data/raw//...").
path = "../data/raw/"
#claim = pd.read_json(f"{path}Claim.ndjson", lines=True)
claim_resp = pd.read_json(f"{path}ClaimResponse.ndjson", lines=True)
patient = pd.read_json(f"{path}Patient.ndjson", lines=True)

In [None]:
# List the columns available in the claim response data.
print(claim_resp.columns)
# The claim response data has "#patient" for all patient numbers, so the
# patient column by itself does not identify the patient.
claim_resp['patient'].describe()

Index(['contained', 'created', 'extension', 'id', 'identifier', 'insurer',
       'meta', 'outcome', 'patient', 'request', 'resourceType', 'status',
       'type', 'use'],
      dtype='object')


In [272]:
# Patient data: keep the birth date and identifier columns (whichever of the
# two are present) and derive the Medicare patient number from the identifiers.
wanted_patient_cols = ['birthDate', 'identifier']
patient_cols = {col: patient[col] for col in wanted_patient_cols if col in patient}
patient_cols_df = pd.DataFrame(patient_cols)
# Each identifier cell is a list of dicts; the entry at index 1 is the one
# whose 'value' is used as the Medicare patient number.
patient_identifier_entry = patient_cols['identifier'].str.get(1)
patient_cols_df['pat_med_num'] = patient_identifier_entry.str.get('value')
patient_cols_df.head(2)

Unnamed: 0,birthDate,identifier,pat_med_num
0,1953-10-12,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00FR92
1,1946-03-01,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AK52


In [273]:
# Claim response data: the patient details live inside the 'contained' column.
# Take the first contained resource of each row and read the birth date and
# the Medicare patient number (first identifier's 'value') from it.
claim_resp_cols_df = pd.DataFrame(claim_resp['contained'])
contained_patient = claim_resp_cols_df['contained'].str.get(0)
claim_resp_cols_df['birthDate'] = contained_patient.str.get('birthDate')
contained_identifier = contained_patient.str.get('identifier').str.get(0)
claim_resp_cols_df['pat_med_num'] = contained_identifier.str.get('value')
claim_resp_cols_df.head(2)

Unnamed: 0,contained,birthDate,pat_med_num
0,"[{'birthDate': '1944-05-25', 'extension': [{'u...",1944-05-25,1S00E00JK17
1,"[{'birthDate': '1944-05-25', 'extension': [{'u...",1944-05-25,1S00E00JK17


In [274]:
# Outer-join the patient and claim response frames on the Medicare patient
# number; the shared 'birthDate' column comes back as birthDate_x / birthDate_y.
res = patient_cols_df.merge(claim_resp_cols_df, on='pat_med_num', how='outer')
# Show the last 10 of the first 100 merged rows.
res.head(100).tail(10)

Unnamed: 0,birthDate_x,identifier,pat_med_num,contained,birthDate_y
90,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1950-08-12', 'extension': [{'u...",1950-08-12
91,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1950-08-12', 'extension': [{'u...",1950-08-12
92,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1950-08-12', 'extension': [{'u...",1950-08-12
93,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1955-03-20', 'extension': [{'u...",1955-03-20
94,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1955-03-20', 'extension': [{'u...",1955-03-20
95,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1955-03-20', 'extension': [{'u...",1955-03-20
96,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1955-03-20', 'extension': [{'u...",1955-03-20
97,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1955-03-20', 'extension': [{'u...",1955-03-20
98,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1955-03-20', 'extension': [{'u...",1955-03-20
99,1955-03-20,[{'system': 'https://bluebutton.cms.gov/resour...,1S00E00AA16,"[{'birthDate': '1955-03-20', 'extension': [{'u...",1955-03-20


In [237]:
# Narrow both frames down to the two columns needed for the DOB comparison.
comparison_cols = ['birthDate', 'pat_med_num']
pat_test_df = patient_cols_df.loc[:, comparison_cols]
claim_test_df = claim_resp_cols_df.loc[:, comparison_cols]

In [271]:
# Outer-join the Patient and Claim Response frames on the Patient Medicare Number.
res = pat_test_df.merge(claim_test_df, on='pat_med_num', how='outer')
# There appears to be a data integrity issue: according to the JSON, patient
# [1S00E00AA16] is a single patient, yet two different dates of birth are
# recorded for them.
res.head(100).tail(10)

Unnamed: 0,birthDate_x,pat_med_num,birthDate_y
90,1955-03-20,1S00E00AA16,1950-08-12
91,1955-03-20,1S00E00AA16,1950-08-12
92,1955-03-20,1S00E00AA16,1950-08-12
93,1955-03-20,1S00E00AA16,1955-03-20
94,1955-03-20,1S00E00AA16,1955-03-20
95,1955-03-20,1S00E00AA16,1955-03-20
96,1955-03-20,1S00E00AA16,1955-03-20
97,1955-03-20,1S00E00AA16,1955-03-20
98,1955-03-20,1S00E00AA16,1955-03-20
99,1955-03-20,1S00E00AA16,1955-03-20


In [270]:
# Collapse the merged result to one row per distinct
# (birthDate_x, pat_med_num, birthDate_y) combination, so that a patient with
# conflicting dates of birth shows up as several rows.
# keep='first' retains one representative of every repeated combination;
# keep=False would instead drop every combination that occurs more than once,
# which removes exactly the repeated rows under investigation.
# Only these three columns are compared when deciding what is a duplicate.
res_un = res.drop_duplicates(subset=['birthDate_x', 'pat_med_num', 'birthDate_y'], keep='first')