In [1]:
import pandas as pd
import numpy as np

In [2]:
# Explicit column -> dtype mapping handed to pd.read_csv(dtype=...) when the
# enriched extracts are loaded. Every column is read as text except the
# int/float columns flagged inline below.
# NOTE(review): codes and IDs are kept as str, presumably so they are read
# verbatim rather than coerced to numbers -- confirm.
dtype_obj = dict(
    health_service_area=str,
    hospital_county=str,
    operating_certificate_number=str,
    facility_id=str,
    facility_name=str,
    age_group=str,
    zip_code__3_digits=str,
    gender=str,
    race=str,
    ethnicity=str,
    length_of_stay=int,  # integer column
    type_of_admission=str,
    patient_disposition=str,
    discharge_year=int,  # integer column
    ccs_diagnosis_code=str,
    ccs_diagnosis_description=str,
    ccs_procedure_code=str,
    ccs_procedure_description=str,
    apr_drg_code=str,
    apr_drg_description=str,
    apr_mdc_code=str,
    apr_mdc_description=str,
    apr_severity_of_illness_code=float,  # numeric code, compared with `> 2` later
    apr_severity_of_illness_description=str,
    apr_risk_of_mortality=str,
    apr_medical_surgical_description=str,
    source_of_payment_1=str,
    source_of_payment_2=str,
    source_of_payment_3=str,
    attending_provider_license_number=str,
    operating_provider_license_number=str,
    other_provider_license_number=str,
    birth_weight=float,
    abortion_edit_indicator=str,
    emergency_department_indicator=str,
    total_charges=float,
    total_costs=float,
    year=int,  # integer column; used to split the analysis per year
    hospital_service_area=str,
    permanent_facility_id=str,
    payment_typology_1=str,
    payment_typology_2=str,
    payment_typology_3=str,
    diagnosis=str,
    apr_risk_of_mortality_code=float,  # numeric code, compared with `> 2` later
)

In [3]:
# Load the two enriched extracts. The first CSV column becomes the index and
# column types are forced through `dtype_obj` instead of being inferred.
data = pd.read_csv(
    "../data/enriched_data/CD_enriched.csv",
    dtype=dtype_obj,
    index_col=0,
)
f_data = pd.read_csv(
    "../data/enriched_data/CD_filtered.csv",
    dtype=dtype_obj,
    index_col=0,
)

## Summary Statistics by Diagnosis

In [4]:
def agg_ranking_score(group_by_col, df, ranking_start_index, categories=None):
    """Return the count-weighted mean rank of the values in ``group_by_col``.

    Rows are grouped by ``group_by_col``; the groups, in sorted key order, are
    given consecutive integer ranks starting at ``ranking_start_index``, and
    the result is ``sum(rank * group_size) / sum(group_size)``.

    Parameters
    ----------
    group_by_col : str
        Column whose distinct values are ranked.
    df : pandas.DataFrame
        Rows to score.
    ranking_start_index : int
        Rank given to the first group.
    categories : sequence, optional
        Full, ordered list of values the column can take. When omitted, ranks
        are assigned only to the values that occur in ``df``, so a subset that
        lacks one of the categories has all later ranks shifted down by one.
        Pass the complete list to keep ranks comparable across subsets; values
        that do not occur then count as zero rows.

    Returns
    -------
    float
        The weighted mean rank, or NaN when no rows contribute.

    Raises
    ------
    ValueError
        If ``categories`` is given and ``df`` contains a value not listed in it.
    """
    counts = df.groupby(group_by_col).size()
    if categories is not None:
        categories = list(categories)
        unexpected = set(counts.index) - set(categories)
        if unexpected:
            raise ValueError(
                "values of {!r} not listed in categories: {}".format(
                    group_by_col, sorted(unexpected, key=str)
                )
            )
        # Pin the rank of every category, present or not, to its position in
        # the caller-supplied ordering.
        counts = counts.reindex(categories, fill_value=0)

    total = counts.sum()
    if total == 0:
        # No contributing rows: make the undefined 0/0 result explicit.
        return np.nan

    ranks = np.arange(ranking_start_index, counts.size + ranking_start_index)
    return (counts * ranks).sum() / total
def agg_metadata(df, diagnosis):
    """Build a one-row summary of demographic and severity metrics for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarise. Reads the columns ``age_group``, ``gender``,
        ``race``, ``apr_severity_of_illness_code``,
        ``apr_risk_of_mortality_code`` and ``type_of_admission``.
    diagnosis : hashable
        Label used as the index of the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``diagnosis``. The "Percent ..." columns hold
        fractions (matching rows / all rows), not values scaled by 100. The
        score columns come from ``agg_ranking_score``. An empty ``df`` yields
        a row of zeros.
    """
    labels = (
        'Age Score',
        'Percent Female',
        'Percent Caucasian',
        'Avg APR Severity',
        'Avg APR Mortality Risk',
        'Percent Emergency Admission',
    )

    # Guard clause: nothing to summarise, so report zeros for every metric.
    if df.empty:
        return pd.DataFrame({label: [0] for label in labels}, index=[diagnosis])

    n_rows = len(df)

    def share(column, value):
        # Fraction of rows whose `column` equals `value`.
        return int((df[column] == value).sum()) / n_rows

    # Order matches `labels`.
    metrics = (
        agg_ranking_score('age_group', df, 0),
        share('gender', 'F'),
        share('race', 'White'),
        agg_ranking_score('apr_severity_of_illness_code', df, 1),
        agg_ranking_score('apr_risk_of_mortality_code', df, 1),
        share('type_of_admission', 'Emergency'),
    )
    return pd.DataFrame(
        {label: [metric] for label, metric in zip(labels, metrics)},
        index=[diagnosis],
    )

In [5]:
# For every year, write two CSVs: the raw per-diagnosis metrics from
# `agg_metadata`, and the same table divided by that year's 'All Diagnoses' row.
year_labels = data['year'].unique()
# A missing diagnosis label can never match `==` (NaN != NaN), which used to
# produce an all-zero row. Missing labels are grouped under 'Other' here on a
# local copy only, so `data` is not mutated and the result does not depend on
# whether a later cell has already filled them.
diagnosis_categories = data['diagnosis'].fillna('Other').unique()
for year in year_labels:
    year_data = data[data['year'] == year]
    year_diagnosis = year_data['diagnosis'].fillna('Other')

    # First frame is the whole-year baseline; one more frame per diagnosis.
    summary_frames = [agg_metadata(year_data, 'All Diagnoses')]
    for diagnosis in diagnosis_categories:
        diag_summary = agg_metadata(year_data[year_diagnosis == diagnosis], diagnosis)
        summary_frames.append(diag_summary)
    # Concatenate once instead of re-copying the growing table on every pass.
    summary_df = pd.concat(summary_frames)
    year_aggs = summary_df.iloc[0]
    summary_df.to_csv('../data/enriched_data/2019_style_analysis/raw_metadata_scores_{}.csv'.format(year))

    # DataFrame / Series aligns the Series index with the columns, so every
    # row is divided column-by-column by the baseline row.
    normalized_summary_df = summary_df / year_aggs
    normalized_summary_df.to_csv('../data/enriched_data/2019_style_analysis/normalized_metadata_scores_{}.csv'.format(year))

## APR-DRG / Diagnosis Pairs with > 100 Cases per Year

In [6]:
# For every year, count rows per (apr_drg_description, diagnosis) pair and
# save the pairs seen more than 100 times, most frequent first.
# NOTE(review): groupby drops rows whose key is NaN by default, so rows with a
# missing diagnosis are excluded here unless they were filled beforehand --
# confirm that is intended.
year_labels = data['year'].unique()
for year in year_labels:
    year_data = data.loc[data['year'] == year]
    pair_sizes = year_data.groupby(['apr_drg_description', 'diagnosis']).size()
    APR_counts = pair_sizes.to_frame(name='count')
    is_frequent = APR_counts['count'].gt(100)
    freq_APRs = APR_counts.loc[is_frequent].sort_values(by=['count'], ascending=False)
    out_path = '../data/enriched_data/2019_style_analysis/frequent_diagnoses_{}.csv'.format(year)
    freq_APRs.to_csv(out_path)

## Frequency of Diagnoses with High Severity or Mortality Risk

In [7]:
# Label rows without a diagnosis as 'Other' so the groupby below keeps them.
# Assign the filled column back explicitly: calling fillna(inplace=True) on a
# column selection is a chained in-place update, which pandas deprecates and
# which no longer modifies `data` under Copy-on-Write.
data['diagnosis'] = data['diagnosis'].fillna('Other')
year_labels = data['year'].unique()
for year in year_labels:
    year_data = data[data['year'] == year]
    # Keep rows where either code is above 2.
    # NOTE(review): presumably codes 3 and 4 are the two most severe levels --
    # confirm against the data dictionary.
    is_severe = (
        (year_data['apr_risk_of_mortality_code'] > 2)
        | (year_data['apr_severity_of_illness_code'] > 2)
    )
    severe_APR_counts = year_data[is_severe]
    # Rows per (apr_drg_description, diagnosis) pair, most frequent first.
    APR_counts = (
        severe_APR_counts
        .groupby(['apr_drg_description', 'diagnosis'])
        .size()
        .to_frame(name='count')
        .sort_values(by=['count'], ascending=False)
    )
    APR_counts.to_csv('../data/enriched_data/2019_style_analysis/severe_diagnoses_{}.csv'.format(year))