In [1]:
import pandas as pd
import numpy as np

In [2]:
dtype_obj = {
    'Health Service Area': str,
    'Hospital County': str,
    'Operating Certificate Number': str,
    'Facility ID': str,
    'Facility Name': str,
    'Age Group': str,
    'Zip Code - 3 digits': str,
    'Gender': str,
    'Race': str,
    'Ethnicity': str,
    'Length of Stay': str,
    'Type of Admission': str,
    'Patient Disposition': str,
    'Discharge Year': int,
    'CCS Diagnosis Code': str,
    'CCS Diagnosis Description': str,
    'CCS Procedure Code': str,
    'CCS Procedure Description': str,
    'APR DRG Code': str,
    'APR DRG Description': str,
    'APR MDC Code': str,
    'APR MDC Description': str,
    'APR Severity of Illness Code': str,
    'APR Severity of Illness Description': str,
    'APR Risk of Mortality': str,
    'APR Medical Surgical Description': str,
    'Source of Payment 1': str,
    'Source of Payment 2': str,
    'Source of Payment 3': str,
    'Attending Provider License Number': str,
    'Operating Provider License Number': str,
    'Other Provider License Number': str,
    'Birth Weight': float,
    'Abortion Edit Indicator': str,
    'Emergency Department Indicator': str,
    'Total Charges': float,
    'Total Costs': float,
    'year': int,
    'Hospital Service Area': str,
    'Permanent Facility Id': str,
    'Payment Typology 1': str,
    'Payment Typology 2': str,
    'Payment Typology 3': str
}

In [3]:
def changeIndexNames(key):
    return key.lower().replace(" ","_").replace("-","")
index_obj = {k: changeIndexNames(k) for k, v in dtype_obj.items()}

In [4]:
def filterColumns(col):
    return col in dtype_obj.keys()

In [5]:
f2009 = pd.read_csv("../data/raw_data/CD_2009.csv", usecols=filterColumns, dtype=dtype_obj)
f2010 = pd.read_csv("../data/raw_data/CD_2010.csv", usecols=filterColumns, dtype=dtype_obj)
f2011 = pd.read_csv("../data/raw_data/CD_2011.csv", usecols=filterColumns, dtype=dtype_obj)
f2012 = pd.read_csv("../data/raw_data/CD_2012.csv", usecols=filterColumns, dtype=dtype_obj)
f2013 = pd.read_csv("../data/raw_data/CD_2013.csv", usecols=filterColumns, dtype=dtype_obj)
f2014 = pd.read_csv("../data/raw_data/CD_2014.csv", usecols=filterColumns, dtype=dtype_obj)
f2015 = pd.read_csv("../data/raw_data/CD_2015.csv", usecols=filterColumns, dtype=dtype_obj)
f2016 = pd.read_csv("../data/raw_data/CD_2016.csv", usecols=filterColumns, dtype=dtype_obj)

arry = [f2009,f2010,f2011,f2012,f2013,f2014,f2015,f2016]
start = 2009

In [6]:
for elem in arry:
    elem['year']=start
    start= start + 1

data = pd.concat(arry, sort=False)

data.shape

(53117, 43)

In [7]:
# Data Cleanup operations:
data.at[data['Length of Stay'] == '120 +', 'Length of Stay']=120
data = data.astype({'Length of Stay':int})

In [8]:
data = data.rename(columns=index_obj)

In [9]:
for col in list(data):
    if data[col].nunique() == 1:
        data=data.drop([col], axis=1)

## Data Manipulation Workflow

In [10]:
# APR DRG Codes of interest; sourced from https://www.health.ny.gov
# /facilities/hospital/reimbursement/apr-drg/weights
# /2018-07-01_final_weights.htm
diag_df = pd.read_csv('../data/APR_DRG_Diagnosis_Categories/APR_Codes.csv', index_col=0)

FileNotFoundError: [Errno 2] File b'../data/APR_DRG_Diagnosis_Categories/APR_Codes.csv' does not exist: b'../data/APR_DRG_Diagnosis_Categories/APR_Codes.csv'

In [11]:
diagnoses = {}
for key in diag_df.index:
    nparray = diag_df.loc[key].to_numpy()
    diagnoses[key] = nparray[np.logical_not(np.isnan(nparray))].astype(int).astype(str)
    print(key, diagnoses[key])

diag_cardio ['22' '160' '161' '162' '163' '165' '166' '167' '170' '171' '174' '175'
 '176' '177' '190' '191' '192' '193' '194' '196' '198' '200' '201' '204'
 '205' '206']
diag_circ ['24' '45' '46' '47' '169' '180' '181' '182' '197' '199' '207' '246' '651'
 '660' '661' '662' '663']
diag_canc ['41' '44' '136' '240' '281' '382' '442' '461' '500' '511' '512' '519'
 '530' '690' '691' '692' '694' '695' '696']
diag_resp ['90' '93' '120' '121' '130' '131' '132' '133' '134' '139' '140' '141'
 '142' '143' '144']
diag_diab ['420']
diag_ment ['740' '750' '751' '752' '753' '754' '755' '756' '758' '759' '760']
diag_drug ['280' '770' '772' '773' '774' '775' '776']
diag_neuro ['23' '26' '40' '42' '43' '48' '52' '53' '54' '58']
diag_infect ['49' '50' '51' '80' '113' '137' '138' '244' '248' '249' '463' '531' '710'
 '711' '720' '721' '723' '724' '890' '892' '893' '894']
diag_trauma ['20' '55' '56' '57' '135' '469' '910' '911' '912' '930']


In [12]:
#column definition functions
def diagnosis_category(el):
    if el in diagnoses['diag_cardio']:
        return 'Cardiology'
    elif el in diagnoses['diag_circ']:
        return 'Circulatory'
    elif el in diagnoses['diag_canc']:
        return 'Cancer'
    elif el in diagnoses['diag_resp']:
        return 'Respiratory'
    elif el in diagnoses['diag_diab']:
        return 'Diabetes'
    elif el in diagnoses['diag_ment']:
        return 'Metal Health'
    elif el in diagnoses['diag_drug']:
        return 'Drug Issue'
    elif el in diagnoses['diag_neuro']:
        return 'Neurological'
    elif el in diagnoses['diag_infect']:
        return 'Infection'
    elif el in diagnoses['diag_trauma']:
        return 'Trauma'
    else:
        return None

In [14]:
# Add a diagnosis type column
data['diagnosis']=data['apr_drg_code'].map(diagnosis_category)

#Add a APR Risk of Mortality column
data['apr_risk_of_mortality_code'] = data['apr_risk_of_mortality'].map({
    'Minor': 1,
    'Moderate': 2,
    'Major': 3,
    'Extreme':4
})

# Filter data with targeted diagnosis
f_data = data[data['diagnosis']!=""]

print("below are the filtered row,column dimensions of your data")
print(data.shape)

below are the filtered row,column dimensions of your data
(53117, 42)


In [15]:
data.to_csv("../data/enriched_data/CD_enriched.csv")

In [16]:
f_data.to_csv("../data/enriched_data/CD_filtered.csv")

In [17]:
f_data.dtypes

hospital_county                         object
operating_certificate_number            object
facility_id                             object
facility_name                           object
age_group                               object
zip_code__3_digits                      object
gender                                  object
race                                    object
ethnicity                               object
length_of_stay                           int32
type_of_admission                       object
patient_disposition                     object
discharge_year                           int32
ccs_diagnosis_code                      object
ccs_diagnosis_description               object
ccs_procedure_code                      object
ccs_procedure_description               object
apr_drg_code                            object
apr_drg_description                     object
apr_mdc_code                            object
apr_mdc_description                     object
apr_severity_