In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

<center><h2>Data Dictionary: Categorical Encodings</h2></center>

**admission\_type\_id** |**description**
:-----:|:-----:
1 |Emergency
2 |Urgent
3 |Elective
4 |Newborn
5 |Not Available
6 |NULL
7 |Trauma Center
8 |Not Mapped
 | 
**discharge\_disposition\_id** |**description**
1 |Discharged to home
2 |Discharged/transferred to another short term hospital
3 |Discharged/transferred to SNF
4 |Discharged/transferred to ICF
5 |Discharged/transferred to another type of inpatient care institution
6 |Discharged/transferred to home with home health service
7 |Left AMA
8 |Discharged/transferred to home under care of Home IV provider
9 |Admitted as an inpatient to this hospital
10 |Neonate discharged to another hospital for neonatal aftercare
11 |Expired
12 |Still patient or expected to return for outpatient services
13 |Hospice / home
14 |Hospice / medical facility
15 |Discharged/transferred within this institution to Medicare approved swing bed
16 |Discharged/transferred/referred another institution for outpatient services
17 |Discharged/transferred/referred to this institution for outpatient services
18 |NULL
19 |Expired at home. Medicaid only, hospice.
20 |Expired in a medical facility. Medicaid only, hospice.
21 |Expired, place unknown. Medicaid only, hospice.
22 |Discharged/transferred to another rehab fac including rehab units of a hospital.
23 |Discharged/transferred to a long term care hospital.
24 |Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.
25 |Not Mapped
26 |Unknown/Invalid
30 |Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere
27 |Discharged/transferred to a federal health care facility.
28 |Discharged/transferred/referred to a psychiatric hospital or psychiatric distinct part unit of a hospital
29 |Discharged/transferred to a Critical Access Hospital (CAH).
 | 
**admission\_source\_id** |**description**
1 |Physician Referral
2 |Clinic Referral
3 |HMO Referral
4 |Transfer from a hospital
5 |Transfer from a Skilled Nursing Facility (SNF)
6 |Transfer from another health care facility
7 |Emergency Room
8 |Court/Law Enforcement
9 |Not Available
10 |Transfer from critical access hospital
11 |Normal Delivery
12 |Premature Delivery
13 |Sick Baby
14 |Extramural Birth
15 |Not Available
17 |NULL
18 |Transfer From Another Home Health Agency
19 |Readmission to Same Home Health Agency
20 |Not Mapped
21 |Unknown/Invalid
22 |Transfer from hospital inpt/same fac reslt in a sep claim
23 |Born inside this hospital
24 |Born outside this hospital
25 |Transfer from Ambulatory Surgery Center
26 |Transfer from Hospice
<br>
<br>
<center><h1>Steps Taken During Data Cleaning</h1></center>

-- (x) Convert `gender` values from strings to integers.  Male=0, Female=1.

-- (x) Convert `diabetesMed` values from strings to integers.  No=0, Yes=1

-- (x) Get list of column names for all categorical columns.  All medicine columns had the same possible values of `Down`, `No`, `Steady`, or `Up`.  Iterate through all columns and get each column's unique values as a list.  Sort the list.  If the list is equal to `['Down', 'No', 'Steady', 'Up']`, this is a medicine column and needs to be one-hot encoded, so append its name to the `categorical_cols` list.  Also add `race` to this list.  When finished, create a new one-hot encoded dataframe using `pd.get_dummies()` on the dataframe and the list of column names.  

-- (x) Drop the `encounter_id` column.  

-- (x) Change age category from current format to integer value.  E.G. Row 1, \[10-20) becomes 15.  

-- Deal with missing weight values 

-- Create correlation heatmap with labels

-- Create Correlation heatmap between variables

-- Decide if we should drop `patient_nbr` column.  If not, figure out how we are going to encode it, since the current format is a problem.  



In [2]:
# Load the raw encounter records from the CSV in the working directory.
raw_df = pd.read_csv("diabetic_data.csv")

# Split the target off the features: pop() removes the 'readmitted' column from
# raw_df in place and hands it back as a Series, which is kept in `labels` so it
# can be added back to the data later on.
labels = raw_df.pop('readmitted')

In [3]:
# Recode gender column as a binary flag: Female = 1, any other value = 0
# (i.e. Male = 0, as stated in the data-cleaning checklist).
raw_df["gender"] = (raw_df["gender"] == "Female").astype(np.uint8)

# Recode diabetesMed column as a binary flag: Yes = 1, any other value (No) = 0
raw_df["diabetesMed"] = (raw_df["diabetesMed"] == "Yes").astype(np.uint8)

# NOTE: this cell overwrites its own inputs, so running it a second time without
# reloading raw_df turns both columns into all zeros (the integer codes no longer
# equal "Female" / "Yes").  Re-run the loading cell first.

# Preview the first rows only instead of rendering the whole frame.
raw_df.head(10)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,2278392,8222157,Caucasian,1,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,0
1,149190,55629189,Caucasian,1,[10-20),?,1,1,7,3,...,No,No,Up,No,No,No,No,No,Ch,1
2,64410,86047875,AfricanAmerican,1,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,No,1
3,500364,82442376,Caucasian,0,[30-40),?,1,1,7,2,...,No,No,Up,No,No,No,No,No,Ch,1
4,16680,42519267,Caucasian,0,[40-50),?,1,1,7,1,...,No,No,Steady,No,No,No,No,No,Ch,1
5,35754,82637451,Caucasian,0,[50-60),?,2,1,2,3,...,No,No,Steady,No,No,No,No,No,No,1
6,55842,84259809,Caucasian,0,[60-70),?,3,1,2,4,...,No,No,Steady,No,No,No,No,No,Ch,1
7,63768,114882984,Caucasian,0,[70-80),?,1,1,7,5,...,No,No,No,No,No,No,No,No,No,1
8,12522,48330783,Caucasian,1,[80-90),?,2,1,4,13,...,No,No,Steady,No,No,No,No,No,Ch,1
9,15738,63555939,Caucasian,1,[90-100),?,3,3,4,12,...,No,No,Steady,No,No,No,No,No,Ch,1


In [4]:
# Collect the columns to one-hot encode: 'race' plus every medicine column.
# A medicine column is recognised by its sorted unique values being exactly
# the four dosage-change categories below.
target = ['Down', 'No', 'Steady', 'Up']  # loop-invariant, so defined once

categorical_cols = ['race']
for col in raw_df.columns.values:
    actual = sorted(raw_df[col].unique())
    if target == actual:
        categorical_cols.append(col)

# NOTE(review): the equality test skips any medicine column that never takes
# all four values (e.g. one that only contains 'No'); such columns stay as
# raw strings.  Consider a subset test if those should be encoded too.

# One hot encode the categorical columns using pd.get_dummies()
one_hot_df = pd.get_dummies(raw_df, columns=categorical_cols)

# Age-bucket labels (strings such as '[10-20)') in their ORIGINAL ROW ORDER.
# The list is later converted to integers and written back into the 'age'
# column by position, so it must not be sorted or reversed -- doing so would
# attach each age to the wrong row.
age_str_vector = list(one_hot_df['age'])

In [5]:
# Turn each age-bucket label in age_str_vector into an integer made of the
# label's second character followed by '5', e.g. '[10-20)' -> 15 and
# '[0-10)' -> 5 (the middle of the bucket).
age_int_vector = [int(label[1] + '5') for label in age_str_vector]

# Overwrite the age column with the integer values; the list is assigned
# positionally, one entry per row.
one_hot_df['age'] = age_int_vector


In [6]:
# Remove the encounter_id column from one_hot_df in place.
one_hot_df.drop(columns=['encounter_id'], inplace=True)

In [17]:
# Gather the names of every column that is still stored with object dtype
# (i.e. not yet converted to a numeric encoding).
obj_columns = [
    name for name in one_hot_df.columns.values
    if one_hot_df[name].dtype == "O"
]

# Emit one marker line per object column found, then the full list of names.
for _ in obj_columns:
    print("found one!")

print(obj_columns)

found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
found one!
['weight', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'acetohexamide', 'tolbutamide', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change']
