In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
import os
import json
from collections import OrderedDict
import pickle

## This section reads in the data

In [2]:
# Load datasets
# Every input is a newline-delimited JSON file (one JSON document per line),
# which is what `lines=True` tells pandas to expect.
path = '/work/'

explanation_of_benefit_file = path + 'ExplanationOfBenefit.ndjson'
coverage_file = path + 'Coverage.ndjson'
patient_file = path + 'Patient.ndjson'
claim_file = path + 'Claim.ndjson'
claim_response_file = path + 'ClaimResponse.ndjson'

# Give pandas the path itself rather than open() -> f.read() -> StringIO(...):
# the old route kept the full file text in memory more than once before parsing.
explanation_of_benefit_df = pd.read_json(explanation_of_benefit_file, lines=True)

coverage_df = pd.read_json(coverage_file, lines=True)

patient_df = pd.read_json(patient_file, lines=True)

# Claim is not loaded for now; its flattening step is disabled because of RAM limits.
# claim_df = pd.read_json(claim_file, lines=True)

claim_response_df = pd.read_json(claim_response_file, lines=True)

## This section flattens the files

In [3]:
def flatten_json(nested_json, prefix=''):
    """
    Recursively flattens a nested JSON object or dictionary into a single level.

    Args:
        nested_json: Mapping to flatten. Its keys are concatenated with ``prefix``
            using ``+``, so they are expected to be strings.
        prefix: String prepended to every key produced by this call. Recursive
            calls use it to carry the path of the enclosing keys.

    Returns:
        OrderedDict mapping flattened key -> value, in traversal order.

    Notes:
        - Nested dictionaries are flattened such that keys from deeper levels in
          the hierarchy are concatenated with underscores (``a_b_c``)
        - A non-empty list whose elements are ALL dictionaries is flattened item
          by item, with the item index inserted into the key (``a_0_b``, ``a_1_b``)
        - Every other list (empty, scalars, or a mix of dictionaries and
          non-dictionaries) is stored under its own key as a JSON string
        - An empty nested dictionary contributes no keys at all
        - If two paths flatten to the same key, the later one overwrites the earlier
    """
    out = OrderedDict()
    for key, value in nested_json.items():
        flat_key = prefix + key
        if isinstance(value, dict):
            # Recursively flatten nested dictionaries
            out.update(flatten_json(value, flat_key + '_'))
        elif isinstance(value, list):
            # Check every element, not just the first: a list such as
            # [{"a": 1}, "text"] cannot be flattened item by item because the
            # non-dict entries have no .items() to recurse into.
            if value and all(isinstance(item, dict) for item in value):
                for i, item in enumerate(value):
                    out.update(flatten_json(item, flat_key + '_' + str(i) + '_'))
            else:
                # Empty, scalar, or mixed lists are kept as one JSON string
                out[flat_key] = json.dumps(value)
        else:
            # Base case: key-value pair where value is not a list or dict
            out[flat_key] = value
    return out

def process_dataframe(df):
    """
    Build a flattened copy of ``df`` in which nested cells are expanded into columns.

    Notes:
        - Rows are visited one at a time via ``df.iterrows()``
        - A cell holding a dict or a list is wrapped as ``{column: cell}`` and run
          through flatten_json; the entries it returns become that row's columns
        - Any other cell is carried over under its original column name
        - Returns a new pandas DataFrame assembled from the per-row dictionaries
    """
    def _flatten_row(row):
        # One output record per input row: column name -> flattened value(s).
        record = {}
        for name, cell in row.items():
            if not isinstance(cell, (dict, list)):
                # Plain cell: copy through as-is
                record[name] = cell
                continue
            # Nested cell: expand into prefixed keys
            record.update(flatten_json({name: cell}))
        return record

    records = [_flatten_row(row) for _, row in df.iterrows()]
    return pd.DataFrame(records)


In [4]:
# Flatten explanation_of_benefit_df: dict/list cells are expanded into
# underscore-joined columns by process_dataframe.
flat_explanation_of_benefit_df = process_dataframe(explanation_of_benefit_df)

In [5]:
# Flatten coverage_df: dict/list cells are expanded into
# underscore-joined columns by process_dataframe.
flat_coverage_df = process_dataframe(coverage_df)

In [6]:
# Flatten patient_df: dict/list cells are expanded into
# underscore-joined columns by process_dataframe.
flat_patient_df = process_dataframe(patient_df)

In [None]:
# Flatten claim_df
# NOTE: disabled at the moment due to the RAM issue; the statement is kept
# below so it can be re-enabled once claim_df is loaded again.
# flat_claim_df = process_dataframe(claim_df)

In [8]:
# Flatten claim_response_df: dict/list cells are expanded into
# underscore-joined columns by process_dataframe.
flat_claim_response_df = process_dataframe(claim_response_df)

## This section saves the flattened data to pickle files

In [None]:
# Write each flattened frame to its own pickle file. The file names are
# relative, so they are resolved against the current working directory.
flat_explanation_of_benefit_df.to_pickle("explanation_of_benefit.pkl")
flat_coverage_df.to_pickle('coverage.pkl')
flat_patient_df.to_pickle('patient.pkl')
# Claim is not flattened at the moment, so its save stays disabled.
# NOTE(review): this disabled line used to target 'claim_response.pkl', the same
# file the ClaimResponse frame is written to just below; it now names its own file.
#flat_claim_df.to_pickle('claim.pkl')
flat_claim_response_df.to_pickle('claim_response.pkl')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cff59b50-5bfa-41ed-b840-afb0c6e184df' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>