In [4]:
import pandas as pd
import numpy as np

In [5]:
dtype_obj = {
    'health_service_area': str,
    'hospital_county': str,
    'operating_certificate_number': str,
    'facility_id': str,
    'facility_name': str,
    'age_group': str,
    'zip_code__3_digits': str,
    'gender': str,
    'race': str,
    'ethnicity': str,
    'length_of_stay': int,
    'type_of_admission': str,
    'patient_disposition': str,
    'discharge_year': int,
    'ccs_diagnosis_code': str,
    'ccs_diagnosis_description': str,
    'ccs_procedure_code': str,
    'ccs_procedure_description': str,
    'apr_drg_code': str,
    'apr_drg_description': str,
    'apr_mdc_code': str,
    'apr_mdc_description': str,
    'apr_severity_of_illness_code': float,
    'apr_severity_of_illness_description': str,
    'apr_risk_of_mortality': str,
    'apr_medical_surgical_description': str,
    'source_of_payment_1': str,
    'source_of_payment_2': str,
    'source_of_payment_3': str,
    'attending_provider_license_number': str,
    'operating_provider_license_number': str,
    'other_provider_license_number': str,
    'birth_weight': float,
    'abortion_edit_indicator': str,
    'emergency_department_indicator': str,
    'total_charges': float,
    'total_costs': float,
    'year': int,
    'hospital_service_area': str,
    'permanent_facility_id': str,
    'payment_typology_1': str,
    'payment_typology_2': str,
    'payment_typology_3': str,
    'diagnosis': str,
    'apr_risk_of_mortality_code': float
}

In [6]:
data = pd.read_csv("../data/enriched_data/CD_enriched.csv", index_col=0, dtype= dtype_obj)
f_data = pd.read_csv("../data/enriched_data/CD_filtered.csv", index_col=0, dtype= dtype_obj)

# Relative Frequency CSVs

In [7]:
year_labels = f_data['year'].unique()
diagnosis_categories = f_data['diagnosis'].unique()
for year in year_labels:
    year_data = f_data[f_data['year'] == year]
    print(year)
    print(year_data.shape)
    counts= pd.DataFrame(year_data['diagnosis'].value_counts(normalize=True).rename_axis('Diagnosis').reset_index(name='Relative Frequency'))
    counts.sort_values(by=['Diagnosis'], inplace=True)
    counts.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_relativefreq_{}.csv'.format(year))

2009
(4586, 42)
2010
(3628, 42)
2011
(3240, 42)
2012
(2401, 42)
2013
(2909, 42)
2014
(2927, 42)
2015
(3000, 42)
2016
(2686, 42)


In [8]:
year_labels = f_data['year'].unique()
diagnosis_categories = f_data['diagnosis'].unique()
aggregated_data_relativefreq=pd.DataFrame()
for year in year_labels:
    year_data = f_data[f_data['year'] == year]
    counts= pd.DataFrame(year_data['diagnosis'].value_counts(normalize=True).rename_axis('Diagnosis').reset_index(name='Relative Frequency'))
    counts['Year']= year
    aggregated_data_relativefreq= aggregated_data_relativefreq.append(counts)
    aggregated_data_relativefreq.sort_values(by=['Diagnosis'], inplace=True)
aggregated_data_relativefreq.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_relativefreq.csv')

## Summary Statistics by Diagnosis

In [50]:
def agg_ranking_score(group_by_col, df, ranking_start_index):
    counts = df.groupby(group_by_col).size()
    return counts.mul(np.arange(ranking_start_index,counts.size + ranking_start_index)).agg('sum') / (counts.agg('sum'))
def agg_metadata(df, diagnosis):
    if df.size == 0:
        return None
    else:
        row_count = df.size
        age_score = agg_ranking_score('age_group', df, 0)
        percent_female = df[df['gender'] == 'F'].size / row_count
        percent_white = df[df['race'] == 'White'].size / row_count
        APR_severity_score = agg_ranking_score('apr_severity_of_illness_code', df, 1)
        APR_risk_of_mortality_score = agg_ranking_score('apr_risk_of_mortality_code', df, 1)
        percent_emergency = df[df['type_of_admission'] == 'Emergency'].size / row_count

        return pd.DataFrame({
            'Age Score':[age_score],
            'Percent Female':[percent_female],
            'Percent Caucasian':[percent_white],
            'Avg APR Severity':[APR_severity_score],
            'Avg APR Mortality Risk':[APR_risk_of_mortality_score],
            'Percent Emergency Admission':[percent_emergency]
        }, index=[diagnosis])

In [65]:
year_labels = data['year'].unique()
diagnosis_categories = data['diagnosis'].unique()
aggregated_data_summary=pd.DataFrame()
aggregated_data_normalized=pd.DataFrame()
for year in year_labels:
    year_data = data[data['year'] == year]
    summary_df = agg_metadata(year_data, 'All Diagnoses')
    year_aggs = summary_df.iloc[0]
    for diagnosis in diagnosis_categories:
        diag_summary = agg_metadata(year_data[year_data['diagnosis'] == diagnosis], diagnosis)
        summary_df=pd.concat([summary_df, diag_summary])
        summary_df_transposed= summary_df.T
        summary_df_transposed['Year']= year
        normalized_summary_df = summary_df.apply(lambda row: row.div(year_aggs), axis=1)
        normalized_summary_df_transposed= normalized_summary_df.T
        normalized_summary_df_transposed['Year']= year
    aggregated_data_summary= aggregated_data_summary.append(summary_df_transposed)
    aggregated_data_normalized= aggregated_data_normalized.append(normalized_summary_df_transposed)
aggregated_data_summary.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_summary.csv')
aggregated_data_normalized.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_normalized.csv')

In [68]:
aggregated_data_summary.sort_index(inplace=True)
aggregated_data_summary

Unnamed: 0,All Diagnoses,Infection,Neurological,Metal Health,Respiratory,Drug Issue,Cardiology,Diabetes,Circulatory,Cancer,Trauma,Year
Age Score,2.505404,2.789079,2.082278,1.054902,2.853755,0.827515,3.52834,2.725,3.386431,1.650794,2.333333,2009
Age Score,2.488372,3.267606,3.037037,1.0112,3.244698,1.134831,2.477816,2.475,3.374269,1.563636,0.909091,2014
Age Score,2.538748,3.249075,2.142857,1.112701,3.285714,2.209302,2.4701,1.576923,2.216418,1.47619,1.285714,2015
Age Score,2.452516,3.197055,3.08,1.993197,3.189793,1.557377,2.595533,2.525,3.415385,2.38,2.083333,2013
Age Score,2.574291,3.042458,2.071429,1.004008,3.102732,1.478261,2.621399,2.825397,3.46,1.538462,0.285714,2010
Age Score,2.456812,3.113264,3.176471,0.924829,3.300158,0.92053,1.482283,1.863636,2.394161,0.526316,1.642857,2016
Age Score,2.517491,3.156499,3.030303,1.625,3.097812,1.568182,2.638655,1.551724,3.416031,1.636364,0.583333,2012
Age Score,2.537637,3.018587,3.147368,1.052758,3.148982,1.522388,2.609037,2.4125,3.393382,1.52,1.5,2011
Avg APR Mortality Risk,1.823391,2.674648,2.185185,1.0736,2.435563,1.573034,2.336177,1.825,2.269006,2.527273,2.636364,2014
Avg APR Mortality Risk,1.808445,2.576444,2.06,1.068027,2.291866,1.606557,2.468983,2.025,2.117949,2.7,2.083333,2013


In [69]:
aggregated_data_normalized.sort_index(inplace=True)
aggregated_data_normalized

Unnamed: 0,All Diagnoses,Infection,Neurological,Metal Health,Respiratory,Drug Issue,Cardiology,Diabetes,Circulatory,Cancer,Trauma,Year
Age Score,1.0,1.113225,0.831115,0.421051,1.13904,0.330292,1.408292,1.087649,1.351651,0.658893,0.93132,2009
Age Score,1.0,1.31315,1.220492,0.40637,1.303944,0.456054,0.995758,0.994626,1.356015,0.628377,0.365336,2014
Age Score,1.0,1.279794,0.844061,0.438287,1.294226,0.870233,0.97296,0.621142,0.873036,0.581464,0.506436,2015
Age Score,1.0,1.303582,1.255853,0.812715,1.300621,0.635012,1.058315,1.029555,1.392605,0.970432,0.849468,2013
Age Score,1.0,1.181863,0.80466,0.390013,1.205276,0.57424,1.0183,1.097544,1.344059,0.597625,0.110988,2010
Age Score,1.0,1.267196,1.292924,0.376435,1.343268,0.374685,0.603336,0.758559,0.974499,0.214227,0.668695,2016
Age Score,1.0,1.253827,1.2037,0.645484,1.230516,0.622915,1.048129,0.616377,1.356919,0.649998,0.231712,2012
Age Score,1.0,1.189527,1.240275,0.414857,1.240911,0.599923,1.028136,0.950687,1.337221,0.598982,0.591101,2011
Avg APR Mortality Risk,1.0,1.466854,1.198419,0.588793,1.335733,0.862697,1.281227,1.000883,1.244388,1.386029,1.445858,2014
Avg APR Mortality Risk,1.0,1.424673,1.1391,0.590578,1.267313,0.888364,1.365252,1.119746,1.171143,1.492995,1.152002,2013


In [83]:
df_split = np.array_split(aggregated_data_normalized, 6)
agg_age_score_data= df_split[0]
agg_age_score_data_sorted= agg_age_score_data.sort_values(by=['Year'])
agg_age_score_data_sorted.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_age_score.csv')

agg_mortality_risk_data= df_split[1]
agg_mortality_risk_data_sorted= agg_mortality_risk_data.sort_values(by=['Year'])
agg_mortality_risk_data_sorted.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_mortality_risk.csv')

agg_apr_severity_data= df_split[2]
agg_apr_severity_data_sorted= agg_apr_severity_data.sort_values(by=['Year'])
agg_apr_severity_data_sorted.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_apr_severity.csv')

agg_per_cauc_data= df_split[3]
agg_per_cauc_data_sorted= agg_per_cauc_data.sort_values(by=['Year'])
agg_per_cauc_data_sorted.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_percent_caucasian.csv')

agg_per_emergency_admission_data= df_split[4]
agg_per_emergency_admission_data_sorted= agg_per_emergency_admission_data.sort_values(by=['Year'])
agg_per_emergency_admission_data_sorted.to_csv('../data/enriched_data/Aggregated_Data/aggregated_percent_emergency_admission.csv')

agg_per_female_data= df_split[5]
agg_per_female_data_sorted= agg_per_female_data.sort_values(by=['Year'])
agg_per_female_data_sorted.to_csv('../data/enriched_data/Aggregated_Data/aggregated_data_percent_female.csv')



## Diagnoses with > 100 Cases

Unnamed: 0,All Diagnoses,Infection,Neurological,Metal Health,Respiratory,Drug Issue,Cardiology,Diabetes,Circulatory,Cancer,Trauma,Year
Age Score,1.0,1.113225,0.831115,0.421051,1.13904,0.330292,1.408292,1.087649,1.351651,0.658893,0.93132,2009
Age Score,1.0,1.31315,1.220492,0.40637,1.303944,0.456054,0.995758,0.994626,1.356015,0.628377,0.365336,2014
Age Score,1.0,1.279794,0.844061,0.438287,1.294226,0.870233,0.97296,0.621142,0.873036,0.581464,0.506436,2015
Age Score,1.0,1.303582,1.255853,0.812715,1.300621,0.635012,1.058315,1.029555,1.392605,0.970432,0.849468,2013
Age Score,1.0,1.181863,0.80466,0.390013,1.205276,0.57424,1.0183,1.097544,1.344059,0.597625,0.110988,2010
Age Score,1.0,1.267196,1.292924,0.376435,1.343268,0.374685,0.603336,0.758559,0.974499,0.214227,0.668695,2016
Age Score,1.0,1.253827,1.2037,0.645484,1.230516,0.622915,1.048129,0.616377,1.356919,0.649998,0.231712,2012
Age Score,1.0,1.189527,1.240275,0.414857,1.240911,0.599923,1.028136,0.950687,1.337221,0.598982,0.591101,2011


In [18]:
year_labels = data['year'].unique()
for year in year_labels:
    year_data = data[data['year'] == year]
    year_data_older= year_data[(year_data['age_group'] == '0 to 17') or (year_data['age_group'] == '18 to 29') or (year_data['age_group'] == '30 to 49')]
    print (year_data_older)                
    APR_counts = pd.DataFrame(year_data.groupby(['apr_drg_description', 'diagnosis']).size(), columns=['count'])
    freq_APRs = APR_counts[APR_counts['count'] > 100].sort_values(by=['count'], ascending=False)
    #freq_APRs.to_csv('../data/enriched_data/2019_style_analysis/frequent_diagnoses_{}.csv'.format(year))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## Severe Mortality Diagnosis Category Frequency

In [7]:
year_labels = data['year'].unique()
for year in year_labels:
    year_data = data[data['year'] == year]
    severe_APR_counts = year_data[np.logical_or(year_data['apr_risk_of_mortality_code'] > 2, year_data['apr_severity_of_illness_code'] > 2)]
    APR_counts = pd.DataFrame(severe_APR_counts.groupby(['apr_drg_description', 'diagnosis']).size(), columns=['count']).sort_values(by=['count'], ascending=False)
    APR_counts.to_csv('../data/enriched_data/2019_style_analysis/severe_diagnoses_{}.csv'.format(year))
    
