In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
import json

In [39]:
df_cost_categorized = pd.read_csv("Hospital_Inpatient_Cost_Transparency_Categorized.csv")
df_cost_categorized["Categorization"].unique()

array(['Cardiac Conditions', 'Other', 'Cancer-Related Conditions',
       'Neurological Disorders', 'Trauma & Injury',
       'Respiratory Conditions', 'Digestive Disorders',
       'Musculoskeletal Disorders', 'Endocrine & Metabolic Disorders',
       'Delivery and Neonatal Procedures', 'Infectious Diseases'],
      dtype=object)

In [40]:
df_cost_categorized.head()


Unnamed: 0,Year,Facility Id,Facility Name,APR DRG Code,APR Severity of Illness Code,APR DRG Description,APR Severity of Illness Description,APR Medical Surgical Code,APR Medical Surgical Description,Discharges,Mean Charge,Median Charge,Mean Cost,Median Cost,Categorization
0,2016,4,Albany Memorial Hospital,194,1,Heart Failure,Minor,M,Medical,2,8375.41,8375.41,3585.05,3585.05,Cardiac Conditions
1,2016,4,Albany Memorial Hospital,194,2,Heart Failure,Moderate,M,Medical,40,14029.82,12176.95,6182.67,5253.15,Cardiac Conditions
2,2016,4,Albany Memorial Hospital,194,3,Heart Failure,Major,M,Medical,70,23921.77,20229.81,11149.49,9068.1,Cardiac Conditions
3,2016,4,Albany Memorial Hospital,194,4,Heart Failure,Extreme,M,Medical,12,51260.45,35210.82,26081.7,15230.62,Cardiac Conditions
4,2016,4,Albany Memorial Hospital,196,4,Cardiac Arrest,Extreme,M,Medical,1,25357.84,25357.84,7791.75,7791.75,Cardiac Conditions


In [44]:
def create_hospital_cost_dict(df):
    # First, calculate statistics across all hospitals for each condition and severity
    stats_by_condition = {}
    
    for condition in df['Categorization'].unique():
        condition_data = df[df['Categorization'] == condition]
        stats_by_condition[condition] = {}
        
        for severity in ['1', '2', '3', '4']:
            severity_data = condition_data[condition_data['APR Severity of Illness Code'] == int(severity)]
            
            if not severity_data.empty:
                costs = severity_data['Median Cost']
                mean = costs.mean()
                std = costs.std()
                
                stats_by_condition[condition][severity] = {
                    'mean': mean,
                    'std': std
                }
    
    # Now create the hospital dictionary with rankings
    hospitals = {}
    
    for facility_id in df['Facility Id'].unique():
        facility_data = df[df['Facility Id'] == facility_id]
        basic_info = facility_data.iloc[0]
        
        hospitals[float(facility_id)] = {
            'name': basic_info['Facility Name'],
            'costs_by_condition_severity': {}
        }
        
        for condition in facility_data['Categorization'].unique():
            condition_data = facility_data[facility_data['Categorization'] == condition]
            
            hospitals[facility_id]['costs_by_condition_severity'][condition] = {}
            
            for severity in ['1', '2', '3', '4']:
                severity_data = condition_data[
                    condition_data['APR Severity of Illness Code'] == int(severity)
                ]
                
                if not severity_data.empty:
                    median_cost = severity_data['Median Cost'].iloc[0]
                    stats = stats_by_condition[condition][severity]
                    
                    # Calculate z-score
                    z_score = (median_cost - stats['mean']) / stats['std'] if stats['std'] != 0 else 0
                    
                    # Determine ranking
                    if z_score < -1:
                        ranking = "significantly below average"
                    elif z_score < -0.5:
                        ranking = "below average"
                    elif z_score <= 0.5:
                        ranking = "average"
                    elif z_score <= 1:
                        ranking = "above average"
                    else:
                        ranking = "significantly above average"
                    
                    hospitals[facility_id]['costs_by_condition_severity'][condition][severity] = {
                        'cost': round(float(median_cost), 2),
                        'ranking': ranking,
                    }
                else:
                    hospitals[facility_id]['costs_by_condition_severity'][condition][severity] = {
                        'cost': None,
                        'ranking': None,
                        'z_score': None
                    }
    
    return hospitals

# Create the dictionary
hospital_costs = create_hospital_cost_dict(df_cost_categorized)

with open('hospital_costs.json', 'w') as f:
    json.dump(hospital_costs, f, indent=2)

print(hospital_costs)


{4.0: {'name': 'Albany Memorial Hospital', 'costs_by_condition_severity': {'Cardiac Conditions': {'1': {'cost': 3585.05, 'ranking': 'average'}, '2': {'cost': 5253.15, 'ranking': 'below average'}, '3': {'cost': 9068.1, 'ranking': 'average'}, '4': {'cost': 15230.62, 'ranking': 'average'}}, 'Other': {'1': {'cost': 7533.43, 'ranking': 'average'}, '2': {'cost': 12709.99, 'ranking': 'average'}, '3': {'cost': 3734.21, 'ranking': 'below average'}, '4': {'cost': 104436.29, 'ranking': 'significantly above average'}}, 'Neurological Disorders': {'1': {'cost': 3745.02, 'ranking': 'average'}, '2': {'cost': 7880.32, 'ranking': 'above average'}, '3': {'cost': 9293.42, 'ranking': 'average'}, '4': {'cost': 76964.49, 'ranking': 'significantly above average'}}, 'Cancer-Related Conditions': {'1': {'cost': 4818.69, 'ranking': 'average'}, '2': {'cost': 13312.8, 'ranking': 'above average'}, '3': {'cost': 6711.04, 'ranking': 'below average'}, '4': {'cost': 11535.19, 'ranking': 'below average'}}, 'Infectious Di