## Loading libraries and data

In [1]:
import numpy as np
import pandas as pd
import matplotlib
# import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
import tensorflow as tf

from keras.models import Sequential

from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, LeakyReLU 
from tensorflow import feature_column

In [3]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder


In [4]:
raw_data = pd.read_csv('train.csv')


In [5]:
raw_data.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


## pandas profiling

In [52]:
from pandas_profiling import ProfileReport

In [53]:
raw_data[raw_data['Stay']=='0-10']

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
6,7,32,f,9,Y,1,radiotherapy,S,B,3.0,31397,7.0,Emergency,Extreme,2,51-60,6167.0,0-10
12,13,16,c,3,Z,2,radiotherapy,R,A,3.0,31397,7.0,Emergency,Extreme,2,51-60,5141.0,0-10
14,15,6,a,6,X,4,gynecology,Q,F,3.0,63418,8.0,Emergency,Extreme,2,71-80,2685.0,0-10
15,16,6,a,6,X,3,gynecology,Q,F,3.0,63418,8.0,Emergency,Extreme,2,71-80,9398.0,0-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318421,318422,19,a,7,Y,2,gynecology,S,C,2.0,70282,8.0,Emergency,Moderate,2,41-50,4277.0,0-10
318422,318423,22,g,9,Y,3,anesthesia,R,B,2.0,87939,23.0,Emergency,Minor,3,41-50,4510.0,0-10
318426,318427,17,e,1,X,3,gynecology,R,E,4.0,109647,32.0,Emergency,Minor,3,0-10,4185.0,0-10
318429,318430,3,c,3,Z,3,gynecology,S,A,3.0,14309,7.0,Urgent,Minor,2,51-60,3559.0,0-10


In [54]:
profile = ProfileReport(raw_data[raw_data['Stay']=='21-30'], title='Pandas Profiling Report', explorative=True)
profile

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=33.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…






## Feature engineering

In [6]:
stay_list = list(set(raw_data['Stay']))
stay_list.sort()

In [7]:
def output_label(inputdf):
    final_output = []
    for x in inputdf:
        temp_list = 11*[0]
        temp_list[stay_list.index(x)] = 1
        final_output.append(temp_list)
    return pd.Series(final_output)

def output_label2(inputdf):
    return pd.Series([stay_list.index(x) for x in inputdf])

def output_label3(inputdf):
    encoder = LabelEncoder()
    encoder.fit(inputdf)
    encoded_Y = encoder.transform(inputdf)
    # convert integers to dummy variables (i.e. one hot encoded)
    return np_utils.to_categorical(encoded_Y)
    

In [8]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = output_label3(dataframe.pop('Stay'))
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [9]:
raw_data.rename(columns= {'Type of Admission': 'admission_type', 'Severity of Illness':'illness_severity', 'Available Extra Rooms in Hospital':'room_availability', 'Visitors with Patient':'visitors', 'Bed Grade':'bed'}, inplace=True)

In [10]:
## putting bed grade na as 0
raw_data['bed'] = raw_data['bed'].fillna(0)

In [11]:
raw_data = raw_data.drop(['case_id', 'patientid', 'City_Code_Patient', 'Admission_Deposit',
                           'City_Code_Hospital','Ward_Facility_Code'],axis = 1) 
raw_data.head(10)


Unnamed: 0,Hospital_code,Hospital_type_code,Hospital_region_code,room_availability,Department,Ward_Type,bed,admission_type,illness_severity,visitors,Age,Stay
0,8,c,Z,3,radiotherapy,R,2.0,Emergency,Extreme,2,51-60,0-10
1,2,c,Z,2,radiotherapy,S,2.0,Trauma,Extreme,2,51-60,41-50
2,10,e,X,2,anesthesia,S,2.0,Trauma,Extreme,2,51-60,31-40
3,26,b,Y,2,radiotherapy,R,2.0,Trauma,Extreme,2,51-60,41-50
4,26,b,Y,2,radiotherapy,S,2.0,Trauma,Extreme,2,51-60,41-50
5,23,a,X,2,anesthesia,S,2.0,Trauma,Extreme,2,51-60,11-20
6,32,f,Y,1,radiotherapy,S,3.0,Emergency,Extreme,2,51-60,0-10
7,23,a,X,4,radiotherapy,Q,3.0,Trauma,Extreme,2,51-60,41-50
8,1,d,Y,2,gynecology,R,4.0,Trauma,Extreme,2,51-60,51-60
9,10,e,X,2,gynecology,S,3.0,Trauma,Extreme,2,51-60,31-40


In [12]:
train, test = train_test_split(raw_data, test_size=0.2)
train, val = train_test_split(raw_data, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

254750 train examples
63688 validation examples
63688 test examples


In [13]:
raw_data.dtypes

Hospital_code             int64
Hospital_type_code       object
Hospital_region_code     object
room_availability         int64
Department               object
Ward_Type                object
bed                     float64
admission_type           object
illness_severity         object
visitors                  int64
Age                      object
Stay                     object
dtype: object

In [14]:
raw_data.columns

Index(['Hospital_code', 'Hospital_type_code', 'Hospital_region_code',
       'room_availability', 'Department', 'Ward_Type', 'bed', 'admission_type',
       'illness_severity', 'visitors', 'Age', 'Stay'],
      dtype='object')

### No missing values

In [15]:
raw_data.isna().sum()

Hospital_code           0
Hospital_type_code      0
Hospital_region_code    0
room_availability       0
Department              0
Ward_Type               0
bed                     0
admission_type          0
illness_severity        0
visitors                0
Age                     0
Stay                    0
dtype: int64

In [16]:
raw_data.corr()

Unnamed: 0,Hospital_code,room_availability,bed,visitors
Hospital_code,1.0,-0.059638,-0.012209,-0.0285
room_availability,-0.059638,1.0,-0.115969,0.096714
bed,-0.012209,-0.115969,1.0,0.088885
visitors,-0.0285,0.096714,0.088885,1.0


## features selection and enginnering

In [17]:
feature_columns = []

#,'Admission_Deposit'
# numeric cols
for header in ['room_availability']:
    feature_columns.append(feature_column.numeric_column(header))



In [18]:

# # category_columns
# indicator_column_names = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code',
#                           'Ward_Type', 'Ward_Facility_Code', 'illness_severity', 'Age']

indicator_column_names = ['visitors','Hospital_region_code','illness_severity', 'Age', 'Department', 'admission_type', 'Hospital_code','Hospital_type_code','Ward_Type']

for col_name in indicator_column_names:
    categorical_column = feature_column.categorical_column_with_vocabulary_list(
      col_name, raw_data[col_name].unique())
    indicator_column = feature_column.indicator_column(categorical_column)
    feature_columns.append(indicator_column)


## Model

In [19]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [20]:
batch_size = 32
train_ds = df_to_dataset(raw_data, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [21]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(16, activation='relu'),
  layers.Dense(16, activation='relu'),
  layers.Dense(16, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(11, activation='softmax')
])



In [22]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.binary_crossentropy,
              metrics=['accuracy'])


In [23]:

model.fit(train_ds,
           validation_data=val_ds,
          epochs=100)

Epoch 1/100


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100


Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f50c2702ba8>

In [24]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.3995415270328522


In [164]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features_7 (DenseFeatu multiple                  0         
_________________________________________________________________
dense_32 (Dense)             multiple                  1584      
_________________________________________________________________
dense_33 (Dense)             multiple                  272       
_________________________________________________________________
dense_34 (Dense)             multiple                  272       
_________________________________________________________________
dropout_8 (Dropout)          multiple                  0         
_________________________________________________________________
dense_35 (Dense)             multiple                  187       
Total params: 2,315
Trainable params: 2,315
Non-trainable params: 0
____________________________________________________

## predictions

In [210]:
test_data = pd.read_csv('test.csv')

test_ids = list(test_data['case_id'])
test_data.rename(columns= {'Type of Admission': 'admission_type', 'Severity of Illness':'illness_severity', 'Available Extra Rooms in Hospital':'room_availability', 'Visitors with Patient':'visitors', 'Bed Grade':'bed'}, inplace=True)
test_data = test_data.drop(['case_id', 'patientid', 'City_Code_Patient','Admission_Deposit',
                           'Ward_Facility_Code','City_Code_Hospital'],axis = 1) 



In [211]:
test_data['bed'].fillna(0)

0         2.0
1         2.0
2         4.0
3         2.0
4         2.0
         ... 
137052    3.0
137053    4.0
137054    4.0
137055    4.0
137056    4.0
Name: bed, Length: 137057, dtype: float64

In [212]:
len(set(test_data['Age']))

10

In [213]:
len(set(raw_data['Age']))

10

In [214]:


test_data = tf.data.Dataset.from_tensor_slices((dict(test_data)))
test_data = test_data.batch(batch_size)

In [215]:
output= model.predict(test_data)

In [216]:
temp_list = (output[0].tolist())

In [217]:
stay_list[temp_list.index(max(temp_list))]

'21-30'

In [218]:
output_pred = [stay_list[x.tolist().index(max(x.tolist()))] for x in output]

In [219]:
len(output_pred), len(test_ids)

(137057, 137057)

In [220]:
submission = {}
submission['case_id'] = test_ids
submission['Stay'] = output_pred

In [221]:
pd.DataFrame(submission).to_csv('submission05.csv', index=False)

## finding missing values

In [169]:
test_data = pd.read_csv('test.csv')

In [170]:
test_data.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                              35
patientid                               0
City_Code_Patient                    2157
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
dtype: int64