In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from category_encoders import TargetEncoder

In [2]:
df_train = pd.read_csv('Data/train.csv')
df_test = pd.read_csv('Data/test.csv')

<h1>Explore data</h1>

In [52]:
df_train['readmitted_binary'] = df_train['readmitted_binary'].replace({'Yes': 1, 'No': 0}).astype(int)

In [47]:
df_train = pd.read_csv('Data/train.csv')
df_train.head()
#fill missing values with NaN
df_train.replace('?', np.nan, inplace = True)
df_train.replace('Not Mapped', np.nan, inplace = True)
df_train.replace('Not Available', np.nan, inplace = True)
#since we know that NaN is value for no provider let's replace it
df_train['payer_code'].fillna('No provider', inplace = True)

<h1>Medical Specialty

In [4]:
df_train['medical_specialty'].fillna('Not Available', inplace=True)

In [5]:
#repeat for test data
df_test['medical_specialty'].fillna('Not Available', inplace=True)

<h3> Target Encoding for Medical Specialty

In [48]:
df_train['medical_specialty'].value_counts()

InternalMedicine                        10292
Emergency/Trauma                         5319
Family/GeneralPractice                   5217
Cardiology                               3716
Surgery-General                          2144
Nephrology                               1136
Orthopedics                               954
Orthopedics-Reconstructive                867
Radiologist                               817
Pulmonology                               611
Psychiatry                                598
Urology                                   478
ObstetricsandGynecology                   468
Surgery-Cardiovascular/Thoracic           464
Gastroenterology                          397
Surgery-Vascular                          365
Surgery-Neuro                             320
PhysicalMedicineandRehabilitation         280
Oncology                                  243
Pediatrics                                186
Hematology/Oncology                       153
Neurology                         

In [57]:
#not working, the values are not correct

target_encoder = TargetEncoder(cols=['medical_specialty'])

# Assuming you have already defined target_encoder
df_train_encoded = target_encoder.fit_transform(df_train[['medical_specialty']], df_train['readmitted_binary'])

# Create medical_specialty_encoded column
df_train['medical_specialty_encoded'] = df_train_encoded['medical_specialty']


In [58]:

df_train['medical_specialty_encoded'].max()

0.2041342460327841

<h3>split into 3 or 4 groups


In [7]:
specialty_readmitted = df_train[['readmitted_binary', 'medical_specialty']].groupby(by='medical_specialty').mean()

lower_specialties = specialty_readmitted[(specialty_readmitted['readmitted_binary'] >= 0.0) & (specialty_readmitted['readmitted_binary'] < 0.05)]
mid_specialties = specialty_readmitted[(specialty_readmitted['readmitted_binary'] >= 0.05) & (specialty_readmitted['readmitted_binary'] < 0.2)]
higher_specialties = specialty_readmitted[(specialty_readmitted['readmitted_binary'] >= 0.2) & (specialty_readmitted['readmitted_binary'] < 0.5)]

df_train['lower_med_spec'] = df_train['medical_specialty'].apply(lambda x: 
                                                                      1 if (x in lower_specialties.index) else 0)


df_train['mid_specialties'] = df_train['medical_specialty'].apply(lambda x: 
                                                                      1 if (x in mid_specialties.index) else 0)


df_train['higher_specialties'] = df_train['medical_specialty'].apply(lambda x: 
                                                                      1 if (x in higher_specialties.index) else 0)




<h1>Average Pulse Bpm

In [8]:
cnt = df_train[['encounter_id','average_pulse_bpm']].groupby(by = 'average_pulse_bpm').count().sort_values(by = 'encounter_id', ascending=False)
mean = df_train[['readmitted_binary','average_pulse_bpm']].groupby(by = 'average_pulse_bpm').mean().sort_values(by = 'readmitted_binary', ascending=False)
pd.concat([cnt, mean], axis = 1).sort_values(by = 'readmitted_binary', ascending=False)


Unnamed: 0_level_0,encounter_id,readmitted_binary
average_pulse_bpm,Unnamed: 1_level_1,Unnamed: 2_level_1
135,870,0.136782
68,891,0.133558
91,855,0.132164
70,876,0.131279
92,908,0.131057
...,...,...
60,883,0.092865
78,845,0.089941
88,941,0.089267
105,913,0.088719


In [9]:
pd.isna(df_train['average_pulse_bpm']).value_counts() #no NaNs

False    71236
Name: average_pulse_bpm, dtype: int64

In [10]:
#the distribution does not seem normal, so let's use minmaxscaler
scaler = MinMaxScaler()
df_train['average_pulse_bpm_scaled'] = scaler.fit_transform(df_train[['average_pulse_bpm']])


In [11]:
#repeat for test data
scaler = MinMaxScaler()
df_test['average_pulse_bpm_scaled'] = scaler.fit_transform(df_test[['average_pulse_bpm']])

<h1>Discharge disposition

In [12]:
pd.set_option('display.max_rows', None)

df_train['discharge_disposition'].value_counts()

Discharged to home                                                                                           42256
Discharged/transferred to SNF                                                                                 9780
Discharged/transferred to home with home health service                                                       9005
Discharged/transferred to another short term hospital                                                         1488
Discharged/transferred to another rehab fac including rehab units of a hospital .                             1393
Expired                                                                                                       1135
Discharged/transferred to another type of inpatient care institution                                           822
Not Mapped                                                                                                     679
Discharged/transferred to ICF                                                   

In [13]:
pd.set_option('display.max_rows', None)

cnt = df_train[['encounter_id','discharge_disposition']].groupby(by = 'discharge_disposition').count().sort_values(by = 'encounter_id', ascending=False)
mean = df_train[['readmitted_binary','discharge_disposition']].groupby(by = 'discharge_disposition').mean().sort_values(by = 'readmitted_binary', ascending=False)
pd.concat([cnt, mean], axis = 1).sort_values(by = 'readmitted_binary', ascending=False)

Unnamed: 0_level_0,encounter_id,readmitted_binary
discharge_disposition,Unnamed: 1_level_1,Unnamed: 2_level_1
Admitted as an inpatient to this hospital,13,0.538462
Still patient or expected to return for outpatient services,2,0.5
Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital,98,0.397959
Discharged/transferred within this institution to Medicare approved swing bed,44,0.386364
Discharged/transferred to another rehab fac including rehab units of a hospital .,1393,0.278536
Discharged/transferred to another type of inpatient care institution,822,0.209246
Discharged/transferred to another short term hospital,1488,0.165323
Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.,32,0.15625
Left AMA,421,0.147268
Discharged/transferred to SNF,9780,0.146217


In [14]:
df_train['discharged_home_hospice'] = df_train['discharge_disposition'].apply(lambda x: 
                                                                              1 if (x == 'Discharged to home'
                                                                                   or x == 'Hospice / medical facility'
                                                                                   or x == 'Hospice / home'
                                                                                   or x == 'Discharged/transferred to a long term care hospital.')
                                                                              else 0)
df_train['expired'] = df_train['discharge_disposition'].apply(lambda x: 1 if (x == 'Expired'
                                                                        or x ==  'Expired at home. Medicaid only, hospice.')
                                                                        else 0)
df_train['transferred_rehub'] = df_train['discharge_disposition'].apply(lambda x: 
                                                                        1 if x == 'Discharged/transferred to another rehab fac including rehab units of a hospital' else 0)
df_train['transferred_another_institution'] = df_train['discharge_disposition'].apply(lambda x: 
                                                                      1 if x == 'Discharged/transferred to another type of inpatient care institution' else 0)
df_train['transferred_psychiatric_hospital'] = df_train['discharge_disposition'].apply(lambda x: 
                                                                      1 if x == 'Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital	' else 0)

df_train['transfered_short_term_hospital'] = df_train['discharge_disposition'].apply(lambda x: 
                                                                      1 if x == 'Discharged/transferred to another short term hospital' else 0)

#group them through categories


<h1>Admission Source

In [15]:
df_train['admission_source'].value_counts()

 Emergency Room                                               40319
 Physician Referral                                           20678
Transfer from a hospital                                       2230
 Transfer from another health care facility                    1562
Clinic Referral                                                 779
 Transfer from a Skilled Nursing Facility (SNF)                 595
HMO Referral                                                    129
 Not Mapped                                                     107
 Not Available                                                   88
 Court/Law Enforcement                                           11
 Transfer from hospital inpt/same fac reslt in a sep claim        8
 Transfer from critial access hospital                            7
 Transfer from Ambulatory Surgery Center                          2
 Extramural Birth                                                 1
Normal Delivery                                 

In [16]:
cnt = df_train[['encounter_id','admission_source']].groupby(by = 'admission_source').count().sort_values(by = 'encounter_id', ascending=False)
mean = df_train[['readmitted_binary','admission_source']].groupby(by = 'admission_source').mean().sort_values(by = 'readmitted_binary', ascending=False)
pd.concat([cnt, mean], axis = 1).sort_values(by = 'readmitted_binary', ascending=False)


Unnamed: 0_level_0,encounter_id,readmitted_binary
admission_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Court/Law Enforcement,11,0.181818
HMO Referral,129,0.155039
Not Mapped,107,0.140187
Transfer from a Skilled Nursing Facility (SNF),595,0.127731
Transfer from hospital inpt/same fac reslt in a sep claim,8,0.125
Emergency Room,40319,0.11662
Clinic Referral,779,0.105263
Physician Referral,20678,0.105184
Transfer from another health care facility,1562,0.097311
Transfer from a hospital,2230,0.095964


In [17]:
df_train['admission_source'] = df_train['admission_source'].replace(' Not Mapped', ' Not Available')
#is this correct?

In [18]:
#eliminate spaces behind the text
df_train['admission_source'] = df_train['admission_source'].replace(' Emergency Room', 'Emergency Room')
df_train['admission_source'] = df_train['admission_source'].replace(' Physician Referral', 'Physician Referral')
df_train['admission_source'] = df_train['admission_source'].replace(' Transfer from another health care facility', 'Transfer from another health care facility')
df_train['admission_source'] = df_train['admission_source'].replace(' Court/Law Enforcement', 'Court/Law Enforcement')
df_train['admission_source'] = df_train['admission_source'].replace(' Transfer from hospital inpt/same fac reslt in a sep claim', 'Transfer from hospital inpt/same fac reslt in a sep claim')
df_train['admission_source'] = df_train['admission_source'].replace(' Transfer from critial access hospital', 'Transfer from critial access hospital')
df_train['admission_source'] = df_train['admission_source'].replace(' Transfer from Ambulatory Surgery Center', 'Transfer from Ambulatory Surgery Center')
df_train['admission_source'] = df_train['admission_source'].replace(' Extramural Birth', 'Extramural Birth')
df_train['admission_source'] = df_train['admission_source'].replace(' Sick Baby', 'Sick Baby')
df_train['admission_source'] = df_train['admission_source'].replace(' Transfer from a Skilled Nursing Facility (SNF)', 'Transfer from a Skilled Nursing Facility (SNF)')


df_train['admission_source'].apply(lambda x: x.strip())


AttributeError: 'float' object has no attribute 'strip'

In [None]:
df_train['admission_source'].value_counts()

Emergency Room                                               40319
Physician Referral                                           20678
Transfer from a hospital                                      2230
Transfer from another health care facility                    1562
Clinic Referral                                                779
Transfer from a Skilled Nursing Facility (SNF)                 595
 Not Available                                                 195
HMO Referral                                                   129
Court/Law Enforcement                                           11
Transfer from hospital inpt/same fac reslt in a sep claim        8
Transfer from critial access hospital                            7
Transfer from Ambulatory Surgery Center                          2
Extramural Birth                                                 1
Normal Delivery                                                  1
Sick Baby                                                     

In [None]:
cnt = df_train[['encounter_id','admission_source']].groupby(by = 'admission_source').count().sort_values(by = 'encounter_id', ascending=False)
mean = df_train[['readmitted_binary','admission_source']].groupby(by = 'admission_source').mean().sort_values(by = 'readmitted_binary', ascending=False)
pd.concat([cnt, mean], axis = 1).sort_values(by = 'readmitted_binary', ascending=False)

Unnamed: 0_level_0,encounter_id,readmitted_binary
admission_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Court/Law Enforcement,11,0.181818
HMO Referral,129,0.155039
Transfer from a Skilled Nursing Facility (SNF),595,0.127731
Transfer from hospital inpt/same fac reslt in a sep claim,8,0.125
Emergency Room,40319,0.11662
Not Available,195,0.112821
Clinic Referral,779,0.105263
Physician Referral,20678,0.105184
Transfer from another health care facility,1562,0.097311
Transfer from a hospital,2230,0.095964


In [None]:
#pick the most two popular ones and do one hot encoding
#the variable is not very helpful since we have almost all data concentrated into two categories that have no big correlation with the target variable. so we decided to just create one binary variable with the lowest


df_train['transfer_admission'] = df_train['admission_source'].apply(lambda x: 1 if (x == 'Transfer from a hospital	'
                                                                        or x ==  'Transfer from another health care facility')
                                                                        else 0)

<h1>Feature importance

In [None]:
from sklearn.ensemble import RandomForestClassifier

X = df_train[['court_hmo_referral','transfer_admission', 'discharged_home_hospice','expired','transferred_rehub', 'transferred_another_institution','transferred_psychiatric_hospital','transfered_short_term_hospital', 'allergy_or_resident','no_medical_specialty', 'mid_medical_specialty', 'lower_medical_specialty','pediatrics_endocrinology','other_medical_specialty', 'average_pulse_bpm_scaled']]
y = df_train['readmitted_binary']

model = RandomForestClassifier()
model.fit(X, y)

feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
feature_importances

average_pulse_bpm_scaled            0.728903
discharged_home_hospice             0.113057
expired                             0.052311
mid_medical_specialty               0.021537
transferred_another_institution     0.020583
transfered_short_term_hospital      0.014929
transfer_admission                  0.012092
no_medical_specialty                0.009971
lower_medical_specialty             0.009379
court_hmo_referral                  0.007899
other_medical_specialty             0.004962
pediatrics_endocrinology            0.002564
allergy_or_resident                 0.001812
transferred_rehub                   0.000000
transferred_psychiatric_hospital    0.000000
dtype: float64