In [4]:
import pandas as pd
import numpy as np

In [5]:
dtype_obj = {
    'Health Service Area': str,
    'Hospital County': str,
    'Operating Certificate Number': str,
    'Facility ID': str,
    'Facility Name': str,
    'Age Group': str,
    'Zip Code - 3 digits': str,
    'Gender': str,
    'Race': str,
    'Ethnicity': str,
    'Length of Stay': int,
    'Type of Admission': str,
    'Patient Disposition': str,
    'Discharge Year': int,
    'CCS Diagnosis Code': str,
    'CCS Diagnosis Description': str,
    'CCS Procedure Code': str,
    'CCS Procedure Description': str,
    'APR DRG Code': str,
    'APR DRG Description': str,
    'APR MDC Code': str,
    'APR MDC Description': str,
    'APR Severity of Illness Code': str,
    'APR Severity of Illness Description': str,
    'APR Risk of Mortality': str,
    'APR Medical Surgical Description': str,
    'Source of Payment 1': str,
    'Source of Payment 2': str,
    'Source of Payment 3': str,
    'Attending Provider License Number': str,
    'Operating Provider License Number': str,
    'Other Provider License Number': str,
    'Birth Weight': float,
    'Abortion Edit Indicator': str,
    'Emergency Department Indicator': str,
    'Total Charges': float,
    'Total Costs': float,
    'year': int,
    'Hospital Service Area': str,
    'Permanent Facility Id': str,
    'Payment Typology 1': str,
    'Payment Typology 2': str,
    'Payment Typology 3': str
}

In [7]:
def changeIndexNames(key):
    return key.lower().replace(" ","_").replace("-","")
index_obj = {k: changeIndexNames(k) for k, v in dtype_obj.items()}

In [9]:
f2009 = pd.read_csv("../data/raw_data/CD_2009.csv")
f2010 = pd.read_csv("../data/raw_data/CD_2010.csv")
f2011 = pd.read_csv("../data/raw_data/CD_2011.csv")
f2012 = pd.read_csv("../data/raw_data/CD_2012.csv")
f2013 = pd.read_csv("../data/raw_data/CD_2013.csv")
f2014 = pd.read_csv("../data/raw_data/CD_2014.csv")
f2015 = pd.read_csv("../data/raw_data/CD_2015.csv")
f2016 = pd.read_csv("../data/raw_data/CD_2016.csv")

arry = [f2009,f2010,f2011,f2012,f2013,f2014,f2015,f2016]
start = 2009

In [10]:
for elem in arry:
    elem['year']=start
    start= start + 1

data = pd.concat(arry, sort=False)

data.shape

(53117, 45)

In [11]:
# Data Cleanup operations:
data.at[data['Length of Stay'] == '120 +', 'Length of Stay']=120
data=data.drop(['Unnamed: 37','Ratio of Total Costs to Total Charges'], axis=1)

In [12]:
data = data.astype(dtype_obj)

In [13]:
data = data.rename(columns=index_obj)

In [14]:
for col in list(data):
    if data[col].nunique() == 1:
        data=data.drop([col], axis=1)

## Data Manipulation Workflow

In [15]:
# APR DRG Codes of interest; sourced from https://www.health.ny.gov
# /facilities/hospital/reimbursement/apr-drg/weights
# /2018-07-01_final_weights.htm
diag_cardio = ['22','160','161','162','163','165','166','167','170','171','174','175','176','177',
               '190','191','192','193','194','196','198','200',' 201','204','205','206']
diag_circ = ['24','45','46','47','169','180','181','182','197','199','207','246','651','660','661',
             '662','663']
diag_canc = ['41','44','136','240','281','382','442','461','500','511','512519','530','690','691',
             '692','694','695','696']
diag_resp = ['90','93','120','121','130','131','132','133','134','139','140','141','142','143','144']
diag_diab = ['420']
diag_ment = ['740','750','751','752','753','754','755','756','758','759','760']
diag_drug = ['280','770','772','773','774','775','776']
diag_neuro = ['23','26','40','42','43','48','52','53','54','58']
diag_infect = ['49','50','51','80','113','137','138','244','248','249','463','531','710','711','720',
               '721','723','724','890','892','893','894']
diag_trauma = ['20','55','56','57','135','469','910','911','912','930']

In [16]:
#Data Processing variables
crd = data['apr_drg_code'].isin(diag_cardio)
crc = data['apr_drg_code'].isin(diag_circ)
cnc = data['apr_drg_code'].isin(diag_canc)
rsp = data['apr_drg_code'].isin(diag_resp)
dbt = data['apr_drg_code'].isin(diag_diab)
mnt = data['apr_drg_code'].isin(diag_ment)
drg = data['apr_drg_code'].isin(diag_drug)
nrl = data['apr_drg_code'].isin(diag_neuro)
inf = data['apr_drg_code'].isin(diag_infect)
trm = data['apr_drg_code'].isin(diag_trauma)

In [17]:
# Add a diagnosis type column
data['diagnosis']=numpy.where(crd,"Cardiology",
                  numpy.where(crc,"Circulatory",
                  numpy.where(cnc,"Cancer",
                  numpy.where(rsp,"Respiratory",
                  numpy.where(dbt,"Diabetes",
                  numpy.where(mnt,"Metal Health",
                  numpy.where(drg,"Drug Issue",
                  numpy.where(nrl,"Neurological",
                  numpy.where(inf,"Infection",
                  numpy.where(trm,"Trauma",""))))))))))

# Filter data with targeted diagnosis
f_data = data[data['diagnosis']!=""]

print("below are the filtered row,column dimensions of your data")
print(f_data.shape)

NameError: name 'numpy' is not defined