# Comparison of Random Actions vs. Learning

---

### Load Data

In [1]:
from load_data import load_data
from add_reward import add_reward_df, add_end_episode_df

In [2]:
df = load_data()
df = add_reward_df(df)
df = add_end_episode_df(df)

In [8]:
df = df.reset_index()

In [9]:
df

Unnamed: 0,index,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,hours,patient,zeros_reward,ones_reward,end_episode
0,0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.370743,-0.929525,0,0,1,0.000000,-0.050000,0.0
1,1,0.742258,-0.680304,0.00000,-1.161539,-0.658462,0.000000,0.057829,0.0,0.000000,...,0.0,0.0,0.370743,-0.895669,0,1,1,0.000000,-0.050000,0.0
2,2,0.288984,0.645824,0.00000,-0.173836,-0.017464,0.000000,0.695652,0.0,0.000000,...,0.0,0.0,0.370743,-0.861813,0,2,1,0.000000,-0.050000,0.0
3,3,0.345643,-0.680304,0.00000,0.000000,0.000000,0.000000,2.396513,0.0,7.185176,...,0.0,0.0,0.370743,-0.827956,0,3,1,0.000000,-0.050000,0.0
4,4,1.082212,-2.835262,0.00000,-0.173836,0.302735,0.000000,1.227171,0.0,0.000000,...,0.0,0.0,0.370743,-0.794100,0,4,1,0.000000,-0.050000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631255,28,0.232325,0.314292,0.00000,0.361170,-0.317838,-0.156779,-0.579994,0.0,0.000000,...,0.0,0.0,0.370743,0.086161,1,28,20643,-0.888889,0.555556,0.0
631256,29,0.685598,0.314292,2.77665,1.966188,0.643360,0.411519,-0.579994,0.0,1.371991,...,0.0,0.0,0.370743,0.120017,1,29,20643,-1.111111,0.444444,0.0
631257,30,3.178601,-0.017240,0.00000,0.278862,-0.287801,-0.263335,-0.579994,0.0,0.000000,...,0.0,0.0,0.370743,0.153873,1,30,20643,-1.333333,0.333333,0.0
631258,31,2.045418,-0.348772,0.00000,1.143102,1.904932,2.755752,-0.579994,0.0,0.000000,...,0.0,0.0,0.370743,0.187729,1,31,20643,-1.555556,0.222222,0.0


In [10]:
patient_start_index = list(df.loc[df['index']==0].index)

In [12]:
len(patient_start_index)


16269

### Preprocessing

In [13]:
min_values = df.describe().loc['min']
max_values = df.describe().loc['max']

In [14]:
min_observation= min_values.loc[~min_values.index.isin(['SepsisLabel',
                                                        'patient', 
                                                        'zeros_reward',
                                                        'ones_reward'])].min()

max_observation = max_values.loc[~max_values.index.isin(['SepsisLabel',
                                                         'patient',
                                                         'zeros_reward',
                                                         'ones_reward'])].max()

In [15]:
min_observation

-25.545204389483803

In [16]:
max_observation

335.0

---

### Create Environment with OpenAI Gym

In [40]:
import random
import json
import gym
from gym import spaces
import pandas as pd
import numpy as np


class SepsisEnv(gym.Env):
    """A Sepsis environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}
    
    def __init__(self, df):
        super(SepsisEnv, self).__init__()

        self.df = df
        self.reward_range = (-2.0, 1.0)
        # index of where each patient begins
        self.patient_start_index = list(df.loc[df['index']==0].index)
        self.index = 0
        
        # Only two possible actions, 0 for non-sepsis,
        # 1 for sepsis
        n_actions = 2
        self.action_space = spaces.Discrete(n=n_actions)
        
        # Observation space is a feature vector of 41 vital signs, 
        # lab values, and other demographic information
        self.observation_space = spaces.Box(
            low=-25.545204389483803, high=335.0, shape=(1, 42), dtype=np.float16)

    def _next_observation(self):
        obs = np.array([
        self.df.loc[self.current_step, ~df.columns.isin(['SepsisLabel',
                            'patient', 
                            'zeros_reward',
                            'ones_reward',
                            'end_episode'])].values 
        ])

        return obs


    def step(self, action):
        # Execute one time step within the environment
        self.current_step += 1
        done = False
        if action == 0:
            reward = self.df.loc[self.current_step, ['zeros_reward']]
        else:
            reward = self.df.loc[self.current_step, ['ones_reward']]
        
        end_episode = self.df.loc[self.current_step, ['end_episode']]
        
        if end_episode.values == 1:
            done = True

        obs = self._next_observation()

        return obs, reward, done, {}

    
    def reset(self):
        # Reset the state of the environment to an initial state
        # Set the current step to a random point within the data frame

#         self.current_step = np.random.choice(patient_start_index)
        self.index += 1
        self.current_step = patient_start_index[self.index]

        return self._next_observation()

    def render(self, mode='human', close=False):
        # Render the environment to the screen
        print('current step' ,self.current_step)
        print('index', self.index)




In [99]:
import gym
import json
import datetime as dt
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy
from stable_baselines.deepq import DQN
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
import pandas as pd

---

### Training: Multi-layer Perceptron Model 

In [85]:
env = DummyVecEnv([lambda: SepsisEnv(df)])
model = PPO2(MlpPolicy, env, verbose=0)
model.learn(total_timesteps=20_000)
reward_list = []
obs = env.reset()
done_count = 0

for i in range(5000): 
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    reward_list.append(rewards)
    if done:
        done_count += 1
        obs = env.reset()



In [86]:
done_count

140

In [87]:
sum(reward_list)

array([-70.316666], dtype=float32)

## MlpLstmPolicy

In [88]:
env = DummyVecEnv([lambda: SepsisEnv(df)])
model = PPO2(MlpLstmPolicy, env, nminibatches=1, verbose=0)
model.learn(total_timesteps=20_000)
reward_list = []
obs = env.reset()
done_count = 0
for i in range(5000): 
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    reward_list.append(rewards)
    if done:
        done_count += 1
        obs = env.reset()



In [89]:
done_count

140

In [90]:
sum(reward_list)

array([-72.14436], dtype=float32)

## MlpLnLstmPolicy

In [91]:
env = DummyVecEnv([lambda: SepsisEnv(df)])
model = PPO2(MlpLnLstmPolicy, env, nminibatches=1, verbose=0)
model.learn(total_timesteps=20_000)
reward_list = []
obs = env.reset()
done_count = 0
for i in range(5000): 
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    reward_list.append(rewards)
    if done:
        done_count += 1
        obs = env.reset()



In [92]:
done_count

140

In [93]:
sum(reward_list)

array([-70.65001], dtype=float32)

## DQN

In [101]:
from stable_baselines.deepq import DQN, MlpPolicy

In [102]:
env = DummyVecEnv([lambda: SepsisEnv(df)])
model = DQN(
    env=env,
    policy=MlpPolicy,
    learning_rate=1e-3,
    buffer_size=50000,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
)
model.learn(total_timesteps=20_000)
reward_list = []
obs = env.reset()
done_count = 0
for i in range(5000): 
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    reward_list.append(rewards)
    if done:
        done_count += 1
        obs = env.reset()









In [105]:
done_count

139

In [106]:
sum(reward_list)

array([-71.37203], dtype=float32)

#### 2) Random Model

In [124]:
reward_list = []
env = DummyVecEnv([lambda: SepsisEnv(df)])
obs = env.reset()
done_count = 0
for i in range(5000): 
    action = np.random.choice([0,1], size=1)
    obs, rewards, done, info = env.step(np.array([0]))
    reward_list.append(rewards)
    if done:
        done_count += 1
        obs = env.reset()


In [125]:
done_count

134

In [126]:
sum(reward_list)

array([-68.], dtype=float32)