In [0]:
!pip install keras-rl

In [0]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy.stats as sts
import seaborn as sns
import math
import rl

# Agents 

Classic Q-learning

In [0]:
from google.colab import drive
import os 

# Mount Google Drive (Colab only) and make the project folder the working
# directory. Relative paths used later in the notebook (e.g. "./model.h5")
# resolve against this folder.
# NOTE(review): the path is hard-coded to one user's Drive layout.
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/Ensemble RL")

# NOTE(review): all three names imported here are defined again as classes
# further down in this notebook, so the imported versions are shadowed as soon
# as those cells run.
from sarsa_agent import SARSAAgent
from reinforce import REINFORCE
from q_learning import CartPoleQAgent

In [0]:
"""
Base code taken from: 
https://github.com/IsaacPatole/CartPole-v0-using-Q-learning-SARSA-and-DNN/blob/master/Qlearning_for_cartpole.py
"""

class CartPoleQAgent():
    """
    Tabular Q-learning agent for the CartPole environment.

    Continuous observations are mapped onto a small grid (``buckets``) so
    that the action-value function can be stored in a dense NumPy table.
    Exploration rate and learning rate both decay logarithmically with the
    episode index until they reach their configured minimum.
    """

    def __init__(self, buckets=(3, 3, 6, 6), 
                 num_episodes=500, min_lr=0.1, 
                 min_epsilon=0.1, discount=1.0, decay=25):
        """
        Input:
        buckets (tuple): Number of bins per observation dimension.
        num_episodes (int): Number of training episodes run by train().
        min_lr (float): Lower bound for the learning rate.
        min_epsilon (float): Lower bound for the exploration rate.
        discount (float): Discount factor used in the Q update.
        decay (int): Controls how fast epsilon / learning rate decline.
        """
        self.buckets = buckets
        self.num_episodes = num_episodes
        self.min_lr = min_lr
        self.min_epsilon = min_epsilon
        self.discount = discount
        self.decay = decay

        self.env = gym.make('CartPole-v0')
        
        # This is the action-value function being initialized to 0's
        self.Q_table = np.zeros(self.buckets + (self.env.action_space.n,))

        # [position, velocity, angle, angular velocity]
        # Position and angle take their limits from the environment; the two
        # velocity components are clipped to hand-picked limits instead.
        self.upper_bounds = [self.env.observation_space.high[0], 0.5, self.env.observation_space.high[2], math.radians(50) / 1.]
        self.lower_bounds = [self.env.observation_space.low[0], -0.5, self.env.observation_space.low[2], -math.radians(50) / 1.]
        
        # Number of steps survived in each training episode.
        self.steps = np.zeros(self.num_episodes)

        # Start from the episode-0 schedule values so that choose_action()
        # and run() also work on an agent that has not been trained yet.
        self.learning_rate = self.get_learning_rate(0)
        self.epsilon = self.get_epsilon(0)

    def discretize_state(self, obs):
        """
        Takes an observation of the environment and aliases it.
        By doing this, very similar observations can be treated
        as the same and it reduces the state space so that the 
        Q-table can be smaller and more easily filled.
        
        Input:
        obs (tuple): Tuple containing 4 floats describing the current
                     state of the environment.
        
        Output:
        discretized (tuple): Tuple containing 4 non-negative integers smaller 
                             than n where n is the number in the same position
                             in the buckets list.
        """
        discretized = list()
        for i in range(len(obs)):
            # Position of the observation inside [lower, upper], as a fraction.
            scaling = ((obs[i] + abs(self.lower_bounds[i])) 
                       / (self.upper_bounds[i] - self.lower_bounds[i]))
            new_obs = int(round((self.buckets[i] - 1) * scaling))
            # Out-of-range observations are clamped to the first / last bucket.
            new_obs = min(self.buckets[i] - 1, max(0, new_obs))
            discretized.append(new_obs)
        return tuple(discretized)

    def choose_action(self, state):
        """
        Implementation of e-greedy algorithm. Returns an action (0 or 1).
        
        Input:
        state (tuple): Tuple containing 4 non-negative integers within
                       the range of the buckets.
        
        Output:
        (int) Returns either 0 or 1
        """
        if (np.random.random() < self.epsilon):
            return self.env.action_space.sample() 
        else:
            return np.argmax(self.Q_table[state])
        
    def get_action(self, state, e):
        """
        Another policy based on the Q-table. Slight variation from 
        e-greedy. It assumes the state fed hasn't been discretized and 
        returns a vector with probabilities for each action.
        
        Input: 
        state (tuple): Contains the 4 floats used to describe
                       the current state of the environment.
        e (int): Denotes the episode at which the agent is supposed
                 to be, helping balance exploration and exploitation.
                 
        Output:
        action_vector (numpy array): Vector containing the probability
                                     of each action being chosen at the
                                     current state.
        """
        obs = self.discretize_state(state)
        action_vector = self.Q_table[obs]
        epsilon = self.get_epsilon(e)
        action_vector = self.normalize(action_vector, epsilon)
        return action_vector

    def normalize(self, action_vector, epsilon):
        """
        Returns a vector with components adding to 1. A share of
        (1 - epsilon) is split proportionally to the Q-values and the
        remaining epsilon is spread evenly over all actions.
        
        Input:
        action_vector (numpy array): Contains expected values for each
                                     action at current state from Q-table.
        epsilon (float): Chances that the e-greedy algorithm would 
                         choose an action at random.
        
        Output:
        new_vector (numpy array): Vector containing the probability
                                  of each action being chosen at the
                                  current state.
        """
        values = np.asarray(action_vector, dtype=float)
        n_actions = len(values)
        total = values.sum()
        if total == 0:
            # Unvisited state (all Q-values still 0): dividing by the sum
            # would give NaN, so fall back to a uniform distribution.
            return np.full(n_actions, 1.0 / n_actions)
        new_vector = (1 - epsilon) * values / total
        new_vector += epsilon / n_actions
        return new_vector

    def update_q(self, state, action, reward, new_state):
        """
        Updates Q-table using the rule as described by Sutton and Barto in
        Reinforcement Learning.
        """
        td_target = reward + self.discount * np.max(self.Q_table[new_state])
        td_error = td_target - self.Q_table[state][action]
        self.Q_table[state][action] += self.learning_rate * td_error

    def get_epsilon(self, t):
        """Gets value for epsilon. It declines as we advance in episodes."""
        # Never drops below min_epsilon, so some random exploration always remains
        return max(self.min_epsilon, min(1., 1. - math.log10((t + 1) / self.decay)))

    def get_learning_rate(self, t):
        """Gets value for learning rate. It declines as we advance in episodes."""
        # Learning rate also declines as we add more episodes
        return max(self.min_lr, min(1., 1. - math.log10((t + 1) / self.decay)))

    def train(self):
        """
        Trains agent making it go through the environment and choose actions
        through an e-greedy policy and updating values for its Q-table. The 
        agent is trained by default for 500 episodes with a declining 
        learning rate and epsilon values that with the default values,
        reach the minimum after 198 episodes.

        Output:
        steps (numpy array): Number of steps taken in each episode.
        """
        # Looping for each episode
        for e in range(self.num_episodes):
            # Initializes the state
            current_state = self.discretize_state(self.env.reset())

            self.learning_rate = self.get_learning_rate(e)
            self.epsilon = self.get_epsilon(e)
            done = False
            
            # Looping for each step; the loop ends once `done` becomes True,
            # i.e. when a terminal state has been reached.
            while not done:
                self.steps[e] += 1
                # Choose A from S
                action = self.choose_action(current_state)
                # Take action
                obs, reward, done, _ = self.env.step(action)
                new_state = self.discretize_state(obs)
                # Update Q(S,A)
                self.update_q(current_state, action, reward, new_state)
                current_state = new_state

        print('Finished training!')
        return self.steps
    
    def plot_learning(self):
        """
        Plots the number of steps at each episode and prints the
        amount of times that an episode was successfully completed.
        """
        # Keyword arguments: recent seaborn releases no longer accept the
        # data vectors positionally.
        sns.lineplot(x=range(len(self.steps)), y=self.steps)
        plt.xlabel("Episode")
        plt.ylabel("Steps")
        plt.show()
        # An episode counts as successful when it lasted exactly 200 steps.
        t = int(np.sum(self.steps[:self.num_episodes] == 200))
        print(t, "episodes were successfully completed.")
        

    def run(self):
        """Runs an episode while displaying the cartpole environment."""
        # NOTE(review): this permanently replaces self.env with the Monitor
        # wrapper, so calling run() repeatedly wraps the environment again.
        self.env = gym.wrappers.Monitor(self.env,'cartpole')
        t = 0
        done = False
        current_state = self.discretize_state(self.env.reset())
        while not done:
            self.env.render()
            t = t+1
            action = self.choose_action(current_state)
            obs, reward, done, _ = self.env.step(action)
            current_state = self.discretize_state(obs)
            
        return t   


def load_q_learning():
    """Create a CartPoleQAgent with its default hyper-parameters, train it and return it."""
    agent = CartPoleQAgent()
    # Training updates the agent in place (Q-table and per-episode step
    # counts live on the instance), so the return value is not needed here.
    agent.train()

    return agent

q_agent = load_q_learning()

Deep REINFORCE

In [0]:
## Config ##
import tensorflow as tf
ENV="CartPole-v1"  # Gym environment id passed to gym.make below
RANDOM_SEED=1
N_EPISODES=500  # NOTE(review): not referenced anywhere else in the visible notebook

# random seed (reproducibility): seeds NumPy's global RNG
np.random.seed(RANDOM_SEED)
# NOTE(review): the TensorFlow seed call below is commented out, so
# TensorFlow's own RNG is presumably left unseeded - confirm this is intended.
# tf.random.set_seed(RANDOM_SEED)

# create the environment, seed it and reset it to an initial state
env=gym.make(ENV) # environment instance handed to the REINFORCE agent
env.seed(RANDOM_SEED)
env.reset() # reset the environment

class REINFORCE:
  '''Monte-Carlo policy-gradient (REINFORCE) agent backed by a small Keras network.

  The network maps an observation to a softmax distribution over actions.
  During an episode every step is stored with remember(); update_policy()
  then turns the stored steps into training targets and fits the network.
  '''

  def __init__(self, env, path=None):
    '''env  - gym-style environment the agent acts in
       path - optional path of a saved Keras model; when given the model is
              loaded from it instead of being built from scratch'''
    self.env=env #import env
    self.state_shape=env.observation_space.shape # the state space
    self.action_shape=env.action_space.n # the action space
    self.gamma=0.99 # decay rate of past observations
    self.alpha=1e-4 # learning rate in the policy gradient
    self.learning_rate=0.01 # learning rate in deep learning
    
    if not path:
      self.model=self._create_model() #build model
    else:
      self.model=self.load_model(path) #import model

    # record observations
    self.states=[]
    self.gradients=[] 
    self.rewards=[]
    self.probs=[]
    self.discounted_rewards=[]
    self.total_rewards=[]
  
  def _create_model(self):
    ''' builds the model using keras'''
    # Imported here so the class does not depend on another notebook cell
    # having imported keras first.
    from keras.layers import Dense
    from keras.models import Sequential
    from keras.optimizers import Adam

    model=Sequential()

    # input shape is of observations
    model.add(Dense(24, input_shape=self.state_shape, activation="relu"))
    # second hidden relu layer
    model.add(Dense(12, activation="relu"))

    # output shape is according to the number of action
    # The softmax function outputs a probability distribution over the actions
    model.add(Dense(self.action_shape, activation="softmax")) 
    model.compile(loss="categorical_crossentropy",
            optimizer=Adam(lr=self.learning_rate))
        
    return model

  def hot_encode_action(self, action):
    '''encoding the actions into a binary list'''

    action_encoded=np.zeros(self.action_shape, np.float32)
    action_encoded[action]=1

    return action_encoded
  
  def remember(self, state, action, action_prob, reward):
    '''stores observations'''
    encoded_action=self.hot_encode_action(action)
    # difference between the action taken (one-hot) and the predicted probabilities
    self.gradients.append(encoded_action-action_prob)
    self.states.append(state)
    self.rewards.append(reward)
    self.probs.append(action_prob)

  
  def get_action(self, state):
    '''samples the next action based on the policy probability distribution 
      of the actions

      returns (action, action_probability_distribution)'''

    # transform state into a batch of one
    state=state.reshape([1, state.shape[0]])
    # get action probabilities
    action_probability_distribution=self.model.predict(state).flatten()
    # re-normalise so the probabilities sum to 1 before sampling
    action_probability_distribution/=np.sum(action_probability_distribution)
    
    # sample action
    action=np.random.choice(self.action_shape,1,
                            p=action_probability_distribution)[0]

    return action, action_probability_distribution


  def get_discounted_rewards(self, rewards): 
    r'''Use gamma to calculate the total reward discounting for rewards
    Following - \gamma ^ t * Gt

    rewards - per-step rewards, either a flat sequence or a (T, 1) array
    returns - the standardised (zero mean, unit std) discounted returns,
              with the same shape as the input'''
    
    rewards=np.asarray(rewards, dtype=float)
    discounted_rewards=np.zeros_like(rewards)
    cumulative_total_return=0
    # iterate the rewards backwards and calc the total return;
    # writing by index avoids repeated insert(0, ...) on a list
    for step in reversed(range(len(rewards))):
      cumulative_total_return=(cumulative_total_return*self.gamma)+rewards[step]
      discounted_rewards[step]=cumulative_total_return

    # normalize discounted rewards
    mean_rewards=np.mean(discounted_rewards)
    std_rewards=np.std(discounted_rewards)
    norm_discounted_rewards=(discounted_rewards-
                          mean_rewards)/(std_rewards+1e-7) # avoiding zero div
    
    return norm_discounted_rewards


  def update_policy(self):
    r'''Updates the policy network using the NN model.
    This function is used after the MC sampling is done - following
    \delta \theta = \alpha * gradient + log pi

    returns whatever model.train_on_batch returns'''
      
    # get X
    states=np.vstack(self.states)

    # get Y
    gradients=np.vstack(self.gradients)
    rewards=np.vstack(self.rewards)
    discounted_rewards=self.get_discounted_rewards(rewards)
    # weight every step's gradient by its (normalised) return
    gradients*=discounted_rewards
    # training target = predicted probabilities nudged along the gradient
    gradients=self.alpha*gradients+np.vstack(self.probs)

    history=self.model.train_on_batch(states, gradients)
    
    # clear the episode memory
    self.states, self.probs, self.gradients, self.rewards=[], [], [], []

    return history



  def train(self, episodes, rollout_n=1, render_n=50):
    '''train the model
        episodes - number of training iterations 
        rollout_n- number of episodes between policy update
        render_n - number of episodes between env rendering
                   (currently unused: rendering is disabled)''' 
    
    env=self.env
    total_rewards=np.zeros(episodes)

    for episode in range(episodes):
      # each episode is a new game env
      state=env.reset()
      done=False          
      episode_reward=0 #record episode reward
      
      while not done:
        # play an action and record the game state & reward per episode
        action, prob=self.get_action(state)
        next_state, reward, done, _=env.step(action)
        self.remember(state, action, prob, reward)
        state=next_state
        episode_reward+=reward

        # update policy once the episode is over
        if done and episode%rollout_n==0:
          self.update_policy()

      total_rewards[episode]=episode_reward
      
    self.total_rewards=total_rewards

  def save_model(self):
    '''saves the model // do after training'''
    self.model.save('REINFORCE_model.h5')
  
  def load_model(self, path):
    '''loads a trained model from path, installs it as this agent's
    policy network and returns it'''
    # Imported here under an alias: keras' load_model is not imported at
    # notebook level and would otherwise be an undefined name.
    from keras.models import load_model as keras_load_model

    # Assign as well as return, so that a bare `agent.load_model(path)` call
    # actually replaces the network the agent acts with.
    self.model=keras_load_model(path)
    return self.model

# Build the agent from the pre-trained network. Passing the path to the
# constructor stores the loaded model on the agent; calling load_model() on an
# existing agent and ignoring its return value leaves the agent with its
# freshly initialised, untrained network.
reinforce_agent=REINFORCE(env, path="./model.h5") #Available to download here: https://drive.google.com/open?id=16MYB_Hy_gdVlGn-ianKduIYOuU1BjK9L

Deep Sarsa

In [0]:
import gym
%tensorflow_version 1.14
from keras.layers import Dense, Flatten, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy
from rl.callbacks import TrainEpisodeLogger

import collections

from keras.callbacks import History
from keras.models import Model
from keras.layers import Input, Lambda
import keras.backend as K

from rl.core import Agent
from rl.agents.dqn import mean_q
from rl.util import huber_loss
from rl.policy import EpsGreedyQPolicy, GreedyQPolicy
from rl.util import get_object_config


class SARSAAgent(Agent):
    """This class defines the SARSA agent.

    On-policy TD agent: the Q-network is updated towards
    r_t + gamma * Q(s_t+1, a_t+1) using the two most recent
    (observation, action) pairs. In addition to training, every update
    appends the sum-normalised Q-values of the newer state to `self.q_values`.

    NOTE(review): appears to be adapted from keras-rl's SARSA agent - confirm.
    """
    def __init__(self, model, nb_actions, policy, test_policy=EpsGreedyQPolicy(), gamma=.9, nb_steps_warmup=10,
                 train_interval=1, delta_clip=np.inf, *args, **kwargs):
        # Refer to the class by its own name; the `SarsaAgent` alias is only
        # created after the class body.
        super(SARSAAgent, self).__init__(*args, **kwargs)

        # Fall back to default policies when None is passed explicitly.
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()

        self.model = model
        self.nb_actions = nb_actions
        self.policy = policy
        self.test_policy = test_policy
        self.gamma = gamma
        self.nb_steps_warmup = nb_steps_warmup
        self.train_interval = train_interval

        self.delta_clip = delta_clip
        self.compiled = False
        # Working memory; turned into length-2 deques by reset_states().
        self.actions = None
        self.observations = None
        self.rewards = None
        # History of normalised Q-value vectors, one entry per training update.
        self.q_values=[]

    def compute_batch_q_values(self, state_batch):
        """Return the model's Q-values for every state in `state_batch`."""
        batch = self.process_state_batch(state_batch)
        q_values = self.model.predict_on_batch(batch)
        assert q_values.shape == (len(state_batch), self.nb_actions)
        return q_values

    def compute_q_values(self, state):
        """Return the Q-values of a single state as a flat vector of length nb_actions."""
        q_values = self.compute_batch_q_values([state]).flatten()
        assert q_values.shape == (self.nb_actions,)
        return q_values

    def process_state_batch(self, batch):
        """Convert `batch` to an array and run it through the processor, if one is set."""
        batch = np.array(batch)
        if self.processor is None:
            return batch
        return self.processor.process_state_batch(batch)

    def get_config(self):
        """Return the agent's configuration as a dictionary."""
        config = super(SARSAAgent, self).get_config()
        config['nb_actions'] = self.nb_actions
        config['gamma'] = self.gamma
        config['nb_steps_warmup'] = self.nb_steps_warmup
        config['train_interval'] = self.train_interval
        config['delta_clip'] = self.delta_clip
        config['model'] = get_object_config(self.model)
        config['policy'] = get_object_config(self.policy)
        config['test_policy'] = get_object_config(self.test_policy)
        return config

    def compile(self, optimizer, metrics=[]):
        """Build and compile the trainable wrapper model.

        `metrics` are extra metrics reported on the Q-value output; `mean_q`
        is always added.
        """
        # Build a new list: `metrics += [...]` would modify the caller's list
        # and the shared default argument in place.
        metrics = list(metrics or []) + [mean_q]

        def clipped_masked_error(args):
            y_true, y_pred, mask = args
            loss = huber_loss(y_true, y_pred, self.delta_clip)
            loss *= mask  # apply element-wise mask
            return K.sum(loss, axis=-1)

        # Create trainable model. The problem is that we need to mask the output since we only
        # ever want to update the Q values for a certain action. The way we achieve this is by
        # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
        # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
        y_pred = self.model.output
        y_true = Input(name='y_true', shape=(self.nb_actions,))
        mask = Input(name='mask', shape=(self.nb_actions,))
        loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask])
        ins = [self.model.input] if type(self.model.input) is not list else self.model.input
        trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred])
        assert len(trainable_model.output_names) == 2
        combined_metrics = {trainable_model.output_names[1]: metrics}
        losses = [
            lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
            lambda y_true, y_pred: K.zeros_like(y_pred),  # we only include this for the metrics
        ]
        trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics)
        self.trainable_model = trainable_model

        self.compiled = True

    def load_weights(self, filepath):
        """Load the Q-network weights from `filepath`."""
        self.model.load_weights(filepath)

    def save_weights(self, filepath, overwrite=False):
        """Save the Q-network weights to `filepath`."""
        self.model.save_weights(filepath, overwrite=overwrite)

    def reset_states(self):
        """Clear the working memory (last two actions, observations and rewards)."""
        self.actions = collections.deque(maxlen=2)
        self.observations = collections.deque(maxlen=2)
        self.rewards = collections.deque(maxlen=2)
        if self.compiled:
            self.model.reset_states()

    def forward(self, observation):
        """Select an action for `observation` and record both in the working memory."""
        # Select an action.
        q_values = self.compute_q_values([observation])
        if self.training:
            action = self.policy.select_action(q_values=q_values)
        else:
            action = self.test_policy.select_action(q_values=q_values)

        # Book-keeping.
        self.observations.append(observation)
        self.actions.append(action)

        return action

    def backward(self, reward, terminal):
        """Perform one SARSA update from the two most recent transitions.

        Returns the list of metric values (NaN placeholders when no update
        was performed).
        """
        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        if self.step > self.nb_steps_warmup and self.step % self.train_interval == 0:
            # Start by extracting the necessary parameters (we use a vectorized implementation).
            self.rewards.append(reward)
            if len(self.observations) < 2:
                return metrics  # not enough data yet

            state0_batch = [self.observations[0]]
            reward_batch = [self.rewards[0]]
            action_batch = [self.actions[0]]
            # Multiplier that zeroes the bootstrap term on terminal transitions.
            terminal1_batch = [0.] if terminal else [1.]
            state1_batch = [self.observations[1]]
            action1_batch = [self.actions[1]]

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            assert reward_batch.shape == (1,)
            assert terminal1_batch.shape == reward_batch.shape
            assert len(action_batch) == len(reward_batch)

            batch = self.process_state_batch(state1_batch)
            q_values = self.compute_q_values(batch)
            q_values = q_values.reshape((1, self.nb_actions))

            # Record the sum-normalised Q-values. This must build a NEW array:
            # normalising `q_values[0]` in place (it is a view) would overwrite
            # the Q-values that the TD target below is computed from.
            probs = q_values[0] / np.sum(q_values[0])
            self.q_values.append(probs)

            q_batch = q_values[0, action1_batch]

            assert q_batch.shape == (1,)
            targets = np.zeros((1, self.nb_actions))
            dummy_targets = np.zeros((1,))
            masks = np.zeros((1, self.nb_actions))

            # Compute r_t + gamma * Q(s_t+1, a_t+1)
            discounted_reward_batch = self.gamma * q_batch
            # Set discounted reward to zero for all states that were terminal.
            discounted_reward_batch *= terminal1_batch
            assert discounted_reward_batch.shape == reward_batch.shape
            Rs = reward_batch + discounted_reward_batch
            for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)):
                target[action] = R  # update action with estimated accumulated reward
                dummy_targets[idx] = R
                mask[action] = 1.  # enable loss for this specific action
            targets = np.array(targets).astype('float32')
            masks = np.array(masks).astype('float32')

            # Finally, perform a single update on the entire batch. We use a dummy target since
            # the actual loss is computed in a Lambda layer that needs more complex input. However,
            # it is still useful to know the actual target to compute metrics properly.
            state0_batch = state0_batch.reshape((1,) + state0_batch.shape)
            ins = [state0_batch] if type(self.model.input) is not list else state0_batch
            metrics = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets])
            metrics = [metric for idx, metric in enumerate(metrics) if idx not in (1, 2)]  # throw away individual losses
            metrics += self.policy.metrics
            if self.processor is not None:
                metrics += self.processor.metrics
        return metrics

    @property
    def layers(self):
        """A copy of the Q-network's layer list."""
        return self.model.layers[:]

    @property
    def metrics_names(self):
        """Names of the metrics returned by backward()."""
        # Throw away individual losses and replace output name since this is hidden from the user.
        assert len(self.trainable_model.output_names) == 2
        dummy_output_name = self.trainable_model.output_names[1]
        model_metrics = [name for idx, name in enumerate(self.trainable_model.metrics_names) if idx not in (1, 2)]
        model_metrics = [name.replace(dummy_output_name + '_', '') for name in model_metrics]

        names = model_metrics + self.policy.metrics_names[:]
        if self.processor is not None:
            names += self.processor.metrics_names[:]
        return names

    @property
    def policy(self):
        """Policy used to select actions while training."""
        return self.__policy

    @policy.setter
    def policy(self, policy):
        self.__policy = policy
        # Let the policy know which agent it belongs to.
        self.__policy._set_agent(self)

    @property
    def test_policy(self):
        """Policy used to select actions when not training."""
        return self.__test_policy

    @test_policy.setter
    def test_policy(self, policy):
        self.__test_policy = policy
        # Let the policy know which agent it belongs to.
        self.__test_policy._set_agent(self)

# Aliases
# Alternative spelling of the class name defined above.
SarsaAgent = SARSAAgent

# NOTE(review): this rebinds the notebook-level `env` (an environment with the
# same id was already created and seeded earlier). Everything below this cell
# uses this instance and this seed.
env = gym.make('CartPole-v1')
seed_val = 456
env.seed(seed_val)
np.random.seed(seed_val)  # re-seeds NumPy's global RNG as well

#Getting the state and action space
states = env.observation_space.shape[0]  # length of the observation vector
actions = env.action_space.n  # number of discrete actions

#Defining a Neural Network function for our Cartpole agent 
def agent(states, actions):
    """Build the Q-network: a flattened (1, states) input, three hidden
    ReLU layers of 24 units and a linear output with one unit per action."""
    hidden_units = 24
    layer_stack = [Flatten(input_shape = (1, states))]
    layer_stack += [Dense(hidden_units, activation='relu') for _ in range(3)]
    layer_stack.append(Dense(actions, activation='linear'))

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)
    return network

#Getting our neural network
model = agent(states, actions)
#Defining the SARSA agent: inputting the policy and the model
sarsa = SARSAAgent(model=model, nb_actions=actions, policy=EpsGreedyQPolicy())
#Compiling SARSA with the Adam optimizer. "mse" is passed as an additional
#metric only; the training loss itself is built inside SARSAAgent.compile.
sarsa.compile('adam', metrics=["mse"])

#Training the agent for 50000 steps
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)

# Ensembling Method

In [0]:
def majority_vote(p1, p2, p3):
    '''
    Takes three different probability vectors in and outputs a randomly sampled 
    action according to majority voting scheme.

    Each vector casts one vote by sampling an action from its own
    distribution; the action with the most votes is returned. The number of
    actions is taken from the length of each vector, so the function does not
    depend on any notebook-level variable. If all three votes differ (only
    possible with more than two actions) one of the voted actions is returned.
    '''
    votes = []
    for probs in (p1, p2, p3):
        probs = np.asarray(probs, dtype=float)
        votes.append(np.random.choice(len(probs), p=probs))
    return max(set(votes), key=votes.count)

def average_prob(p1, p2, p3):
    '''
    Takes three different probability vectors in and outputs a randomly sampled 
    action with probability equals the average probability of the
    input vectors.

    The inputs may be lists or arrays of equal length; the number of actions
    is taken from that length rather than from a notebook-level variable.
    '''
    stacked = np.asarray([p1, p2, p3], dtype=float)
    p = stacked.sum(axis=0)/3
    # re-normalise to guard against rounding drift before sampling
    p = p/np.sum(p)
    return np.random.choice(len(p), p=p)

def boltzmann_prob(p1, p2, p3, T=0.5):
    '''
    Takes three different probability vectors in and outputs a randomly sampled 
    action with probability equals the average probability of the 
    normalized exponentiated input vectors, with a temperature T controlling
    the degree of spread for the out vector (T must be positive; smaller T
    concentrates the mass on each vector's largest entry).

    The number of actions is taken from the length of the vectors rather than
    from a notebook-level variable.
    '''
    boltz_ps = []
    for prob in (p1, p2, p3):
        logits = np.asarray(prob, dtype=float)/T
        # Softmax is unchanged by subtracting a constant; shifting by the
        # maximum keeps exp() from overflowing to inf (and the ratio from
        # becoming NaN) when T is small.
        weights = np.exp(logits - np.max(logits))
        boltz_ps.append(weights/np.sum(weights))
    p = (boltz_ps[0] + boltz_ps[1] + boltz_ps[2])/3
    p = p/np.sum(p)
    return np.random.choice(len(p), p=p)

In [0]:
n_action = 2

def _ensemble_probabilities(learners, state):
  '''Return one action-probability vector per learner for the given state.'''
  # learners[0]: get_action(state) returns (sampled_action, probabilities)
  _, p_first = learners[0].get_action(state)

  # learners[1]: get_action(state, episode) returns probabilities; episode
  # index 500 is passed as in the original evaluation set-up
  p_raw = learners[1].get_action(state, 500)
  # shift every entry by max + 1 before re-normalising (flattens the vector)
  shifted = p_raw + np.max(p_raw) + 1
  p_second = shifted/np.sum(shifted)

  # learners[2]: Q-values for the state, normalised by their sum
  # NOTE(review): this is only a valid distribution when all Q-values are
  # positive - confirm that holds for the trained network.
  q_values = np.asarray(learners[2].compute_q_values(state.reshape(1, -1)))
  q_flat = q_values.reshape(-1)
  p_third = q_flat/np.sum(q_flat)

  return p_first, p_second, p_third


def ensembler_play(learners, env, episodes, vote="majority_vote"):
  '''
  Plays `episodes` episodes in `env`, choosing every action by combining the
  three learners' action probabilities with the given voting scheme, and
  returns the mean total reward per episode.

  learners - sequence of three agents, in this order: an agent whose
             get_action(state) returns (action, probabilities), an agent
             whose get_action(state, episode) returns probabilities, and an
             agent exposing compute_q_values(state)
  env      - gym-style environment (reset() / step(action))
  episodes - number of episodes to play
  vote     - "majority_vote", "average_prob" or "boltzmann_prob"

  Raises Exception for an unknown voting scheme.
  '''
  # Resolve the voting function once instead of on every step.
  if vote == "majority_vote":
    combine = majority_vote
  elif vote == "average_prob":
    combine = average_prob
  elif vote == "boltzmann_prob":
    combine = boltzmann_prob
  else:
    raise Exception("Not implemented voting scheme")

  rewards = []
  for episode in range(episodes):
    ep_reward = 0
    done=False
    state=env.reset()
    while not done:
      # Probabilities are recomputed for the CURRENT state on every step;
      # accumulating them in a per-episode list would make every vote after
      # the first step reuse the first step's probabilities.
      p1, p2, p3 = _ensemble_probabilities(learners, state)
      action = combine(p1, p2, p3)

      state, reward, done, info = env.step(action)
      ep_reward += reward

    rewards.append(ep_reward)

  return np.mean(rewards)

In [0]:
# Evaluate the majority-vote ensemble: 20 repetitions of 100 episodes each,
# collecting the mean episode reward of every repetition.
# The order of `agents` matters: ensembler_play addresses the learners by position.
agents = [reinforce_agent, q_agent, sarsa]
r = []
for _ in range(20):
  print(_)  # progress: index of the current repetition
  r.append(ensembler_play(agents, env, 100))

# Histogram of the 20 mean rewards.
plt.figure(figsize=(12,8))
plt.hist(r)
plt.show()

In [0]:
# Same evaluation as for majority voting, using the 'average_prob' scheme:
# 20 repetitions of 100 episodes, one mean reward per repetition.
r = []
for _ in range(20):
  print(_)  # progress: index of the current repetition
  r.append(ensembler_play(agents, env, 100, 'average_prob'))

# Histogram of the 20 mean rewards.
plt.figure(figsize=(12,8))
plt.hist(r)
plt.show()

In [0]:
# Same evaluation again, using the 'boltzmann_prob' scheme with its default
# temperature: 20 repetitions of 100 episodes, one mean reward per repetition.
r = []
for _ in range(20):
  print(_)  # progress: index of the current repetition
  r.append(ensembler_play(agents, env, 100, 'boltzmann_prob'))

# Histogram of the 20 mean rewards.
plt.figure(figsize=(12,8))
plt.hist(r)
plt.show()

In [0]:
def learner_play1(learner, env, episodes, vote="majority_vote"):
  '''
  Plays `episodes` episodes with a single learner whose get_action(state)
  returns (sampled_action, probabilities). The most probable action is
  always taken. Returns the mean total reward per episode.

  `vote` is accepted for signature compatibility and is not used.
  '''
  rewards = []
  for episode in range(episodes):
    ep_reward = 0
    done=False
    # One reset per episode is enough; resetting again after `done` only
    # throws away a freshly drawn initial state.
    state=env.reset()
    while not done:
      _, p = learner.get_action(state)
      action = np.argmax(p)
      state, reward, done, info = env.step(action)
      ep_reward += reward

    rewards.append(ep_reward)
  
  return np.mean(rewards)


def learner_play2(learner, env, episodes, vote="majority_vote"):
  '''
  Plays `episodes` episodes with a single learner whose
  get_action(state, episode) returns action probabilities (episode index 500
  is passed). The most probable action is always taken. Returns the mean
  total reward per episode.

  `vote` is accepted for signature compatibility and is not used.
  '''
  rewards = []
  for episode in range(episodes):
    ep_reward = 0
    done=False
    # One reset per episode is enough; resetting again after `done` only
    # throws away a freshly drawn initial state.
    state=env.reset()
    while not done:
      p = learner.get_action(state, 500)
      # The shift-and-renormalise step used by the ensemble keeps the order
      # of the entries, so the argmax can be taken on `p` directly.
      action = np.argmax(p)
      state, reward, done, info = env.step(action)
      ep_reward += reward

    rewards.append(ep_reward)
  
  return np.mean(rewards)

def learner_play3(learner, env, episodes, vote="majority_vote"):
  '''
  Plays `episodes` episodes with a single learner exposing
  compute_q_values(state). The action with the highest Q-value is always
  taken. Returns the mean total reward per episode.

  `vote` is accepted for signature compatibility and is not used.
  '''
  rewards = []
  for episode in range(episodes):
    ep_reward = 0
    done=False
    # One reset per episode is enough; resetting again after `done` only
    # throws away a freshly drawn initial state.
    state=env.reset()
    while not done:
      q_values = np.asarray(learner.compute_q_values(state.reshape(1, -1)))
      # Take the argmax of the raw Q-values. Dividing by their sum first
      # reverses the ordering whenever the sum is negative (picking the
      # worst action) and yields NaN when the sum is zero.
      action = np.argmax(q_values.reshape(-1))
      state, reward, done, info = env.step(action)
      ep_reward += reward

    rewards.append(ep_reward)
  
  return np.mean(rewards)

In [0]:
# Baseline for the first agent in `agents` on its own:
# 20 repetitions of 40 episodes, one mean reward per repetition.
r = []
for _ in range(20):
  print(_)  # progress: index of the current repetition
  r.append(learner_play1(agents[0], env, 40))

# Histogram of the 20 mean rewards.
plt.figure(figsize=(12,8))
plt.hist(r)
plt.show()

In [0]:
# Baseline for the tabular Q-learning agent on its own:
# 20 repetitions of 100 episodes, one mean reward per repetition.
r = []
for _ in range(20):
  # With only 20 repetitions this condition is true for the first one only,
  # so a single progress line (0) is printed.
  if _ % 100 == 0:
      print(_)
  r.append(learner_play2(q_agent, env, 100))

# Histogram of the 20 mean rewards.
plt.figure(figsize=(12,8))
plt.hist(r)
plt.show()

In [0]:
# Baseline for the SARSA agent on its own:
# 20 repetitions of 100 episodes, one mean reward per repetition.
r = []
for _ in range(20):
  print(_)  # progress: index of the current repetition
  r.append(learner_play3(sarsa, env, 100))

# Histogram of the 20 mean rewards.
plt.figure(figsize=(12,8))
plt.hist(r)
plt.show()