In [0]:
!pip install keras-rl

In [0]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy.stats as sts
import seaborn as sns
import math
import rl

# Agents 

Classic Q-learning

In [0]:
# Colab-only setup: mount Google Drive and switch into the project folder so
# that the project-local agent modules below can be imported.
from google.colab import drive
import os 

drive.mount('/content/drive')
# Relative paths used later in the notebook (e.g. "./model.h5") resolve against
# this directory.
# NOTE(review): hard-coded Drive path; presumably it holds sarsa_agent.py,
# reinforce.py and q_learning.py — confirm before running elsewhere.
os.chdir("/content/drive/My Drive/Ensemble RL")

# Project-local agent implementations (not pip-installable packages).
from sarsa_agent import SARSAAgent
from reinforce import REINFORCE
from q_learning import CartPoleQAgent

In [0]:
def load_q_learning():
    """Build a CartPoleQAgent, run its train() method once and return it.

    Whatever train() returns is discarded; only the trained agent is kept.
    """
    trained_agent = CartPoleQAgent()
    trained_agent.train()
    return trained_agent


# Module-level tabular Q-learning agent reused by the evaluation cells.
q_agent = load_q_learning()

Deep REINFORCE

In [0]:
## Config ##
# Settings for the pre-trained REINFORCE agent.
import tensorflow as tf
ENV="CartPole-v1"
RANDOM_SEED=1
# NOTE(review): N_EPISODES is not referenced anywhere else in the visible notebook.
N_EPISODES=500

# Seed NumPy's global RNG for reproducibility.
np.random.seed(RANDOM_SEED)
# TensorFlow seeding is currently disabled:
# tf.random.set_seed(RANDOM_SEED)

# Create and seed the environment, then reset it once before handing it to the agent.
env=gym.make(ENV) # environment id comes from ENV above
env.seed(RANDOM_SEED)
env.reset() # put the environment into an initial state
reinforce_agent=REINFORCE(env)
# Load saved weights from the current working directory.
reinforce_agent.load_model("./model.h5") # Available to download here: https://drive.google.com/open?id=16MYB_Hy_gdVlGn-ianKduIYOuU1BjK9L

Deep SARSA

In [0]:
# These names are used below but were never imported anywhere in the notebook,
# so the cell raised NameError on a fresh kernel.
# NOTE(review): assumes the standalone `keras` package that keras-rl is built
# on; switch to `tensorflow.keras` if sarsa_agent expects that instead.
from keras.models import Sequential
from keras.layers import Dense, Flatten
from rl.policy import EpsGreedyQPolicy

# Fresh, separately seeded environment for the SARSA agent. This rebinds the
# module-level `env` that the evaluation cells further down also use.
env = gym.make('CartPole-v1')
seed_val = 456
env.seed(seed_val)
np.random.seed(seed_val)

# Sizes of the observation vector and of the discrete action set.
states = env.observation_space.shape[0]
actions = env.action_space.n


def agent(states, actions):
    """Build the Q-network for the CartPole SARSA agent.

    Parameters
    ----------
    states : int
        Length of the observation vector; the network input has shape
        (1, states) and is flattened before the dense layers.
    actions : int
        Number of discrete actions; the linear output layer has one unit per
        action.

    Returns
    -------
    An uncompiled Sequential model with three hidden ReLU layers of 24 units.
    """
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


# Build the network and wrap it in a SARSA agent with an epsilon-greedy policy.
model = agent(states, actions)
sarsa = SARSAAgent(model=model, nb_actions=actions, policy=EpsGreedyQPolicy())
# Compile with the Adam optimizer and track mean squared error as a metric.
sarsa.compile('adam', metrics=["mse"])

# Train the agent for 50000 environment steps.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)

# Ensembling Method

In [0]:
def majority_vote(p1, p2, p3):
    '''
    Samples one action from each of three probability vectors and returns the
    action that received the most votes.

    p1, p2, p3: 1-D array-likes of action probabilities (each summing to 1).
    The number of actions is taken from the length of each vector, so the
    function no longer depends on a module-level `n_action` being defined.

    Returns the winning action index. With more than two actions all three
    votes can differ; such ties are resolved by set iteration order.
    '''
    votes = []
    for probs in (p1, p2, p3):
        probs = np.asarray(probs, dtype=float)
        votes.append(np.random.choice(len(probs), p=probs))
    return max(set(votes), key=votes.count)

def average_prob(p1, p2, p3):
    '''
    Samples an action from the element-wise average of three probability
    vectors.

    p1, p2, p3: 1-D array-likes of equal length holding action probabilities.
    Inputs are converted to float arrays first, so plain lists work too. The
    number of actions is taken from the vector length rather than from a
    module-level `n_action`.

    Returns the sampled action index.
    '''
    vectors = [np.asarray(probs, dtype=float) for probs in (p1, p2, p3)]
    mean_probs = (vectors[0] + vectors[1] + vectors[2]) / 3
    # Renormalise so rounding error cannot make np.random.choice reject the vector.
    mean_probs = mean_probs / np.sum(mean_probs)
    return np.random.choice(len(mean_probs), p=mean_probs)

def boltzmann_prob(p1, p2, p3, T=0.5):
    '''
    Samples an action from the average of the Boltzmann (softmax) transforms
    of three input vectors.

    p1, p2, p3: 1-D array-likes of equal length (one entry per action).
    T: temperature; each vector is divided by T before exponentiation, so a
       smaller T concentrates the distribution on the largest entries.

    The number of actions is taken from the vector length rather than from a
    module-level `n_action`. The maximum is subtracted before exponentiating,
    which leaves the softmax unchanged but avoids overflow for small T.

    Returns the sampled action index.
    '''
    boltz_ps = []
    for probs in (p1, p2, p3):
        scaled = np.asarray(probs, dtype=float) / T
        weights = np.exp(scaled - np.max(scaled))
        boltz_ps.append(weights / np.sum(weights))
    mixed = (boltz_ps[0] + boltz_ps[1] + boltz_ps[2]) / 3
    mixed = mixed / np.sum(mixed)
    return np.random.choice(len(mixed), p=mixed)

In [0]:
n_action = 2  # number of discrete actions in CartPole
def ensembler_play(learners, env, episodes, vote="majority_vote"):
  '''
  Plays `episodes` episodes in `env`, choosing every action by combining the
  action distributions of three learners, and returns the mean episode reward.

  learners: sequence of three agents, used positionally:
            [0] exposes get_action(state) -> (action, probabilities),
            [1] exposes get_action(state, n) -> per-action values,
            [2] exposes compute_q_values(state) -> per-action Q-values.
  env:      gym-style environment (reset() -> state, step(a) -> 4-tuple).
  episodes: number of episodes to play.
  vote:     "majority_vote", "average_prob" or "boltzmann_prob".

  Raises Exception for an unknown voting scheme.
  '''
  rewards = []
  for episode in range(episodes):
    ep_reward = 0
    done = False
    state = env.reset()
    while not done:
      # The distributions are rebuilt from the current state on every step.
      # Previously one list was appended to for a whole episode while only its
      # first three entries were read, so every step after the first voted on
      # the distributions of the episode's initial state.
      _, policy_probs = learners[0].get_action(state)

      # NOTE(review): the meaning of the 500 argument is not visible here;
      # presumably an episode index — confirm against q_learning.py.
      table_values = learners[1].get_action(state, 500)
      # Shift the values before normalising them into a distribution.
      shifted = table_values + np.max(table_values) + 1
      table_probs = shifted / np.sum(shifted)

      q_values = learners[2].compute_q_values(state.reshape(1, 4))
      q_values = q_values.reshape((1, 2))[0]
      # Normalise without mutating the array returned by the agent.
      # NOTE(review): this is only a valid distribution when all Q-values are
      # positive — confirm that holds for the trained SARSA agent.
      sarsa_probs = q_values / np.sum(q_values)

      if vote == "majority_vote":
          action = majority_vote(policy_probs, table_probs, sarsa_probs)
      elif vote == "average_prob":
          action = average_prob(policy_probs, table_probs, sarsa_probs)
      elif vote == "boltzmann_prob":
          action = boltzmann_prob(policy_probs, table_probs, sarsa_probs)
      else: raise Exception("Not implemented voting scheme")

      state, reward, done, info = env.step(action)
      ep_reward += reward

    # The next iteration resets the environment itself, so no extra reset here.
    rewards.append(ep_reward)

  return np.mean(rewards)

In [0]:
# Evaluate the majority-vote ensemble: 20 runs of 100 episodes each, one mean
# reward per run.
agents = [reinforce_agent, q_agent, sarsa]
r = []
for run in range(20):
  print(run)  # progress indicator
  r.append(ensembler_play(agents, env, 100))

plt.figure(figsize=(12,8))
plt.hist(r)
plt.title("Majority-vote ensemble: mean reward per run")
plt.xlabel("Mean episode reward (100 episodes per run)")
plt.ylabel("Number of runs")
plt.show()

In [0]:
# Evaluate the averaged-probability ensemble: 20 runs of 100 episodes each.
r = []
for run in range(20):
  print(run)  # progress indicator
  r.append(ensembler_play(agents, env, 100, 'average_prob'))

plt.figure(figsize=(12,8))
plt.hist(r)
plt.title("Average-probability ensemble: mean reward per run")
plt.xlabel("Mean episode reward (100 episodes per run)")
plt.ylabel("Number of runs")
plt.show()

In [0]:
# Evaluate the Boltzmann-averaged ensemble: 20 runs of 100 episodes each.
r = []
for run in range(20):
  print(run)  # progress indicator
  r.append(ensembler_play(agents, env, 100, 'boltzmann_prob'))

plt.figure(figsize=(12,8))
plt.hist(r)
plt.title("Boltzmann ensemble: mean reward per run")
plt.xlabel("Mean episode reward (100 episodes per run)")
plt.ylabel("Number of runs")
plt.show()

In [0]:
def learner_play1(learner, env, episodes, vote="majority_vote"):
  '''
  Plays `episodes` episodes with a single learner acting greedily and returns
  the mean episode reward.

  learner:  agent exposing get_action(state) -> (action, probabilities); the
            action with the highest probability is played.
  env:      gym-style environment (reset() -> state, step(a) -> 4-tuple).
  episodes: number of episodes to play.
  vote:     unused; kept so the signature matches ensembler_play.
  '''
  rewards = []
  for episode in range(episodes):
    ep_reward = 0
    done = False
    state = env.reset()
    while not done:
      _, probs = learner.get_action(state)
      action = np.argmax(probs)
      state, reward, done, info = env.step(action)
      ep_reward += reward

    # One reset per episode: the loop header above already resets the
    # environment, so the extra reset after a terminal step was dropped.
    rewards.append(ep_reward)

  return np.mean(rewards)


def learner_play2(learner, env, episodes, vote="majority_vote"):
  '''
  Plays `episodes` episodes with a single learner acting greedily and returns
  the mean episode reward.

  learner:  agent exposing get_action(state, n) -> per-action values; the
            action with the highest value is played.
  env:      gym-style environment (reset() -> state, step(a) -> 4-tuple).
  episodes: number of episodes to play.
  vote:     unused; kept so the signature matches ensembler_play.
  '''
  rewards = []
  for episode in range(episodes):
    ep_reward = 0
    done = False
    state = env.reset()
    while not done:
      # NOTE(review): the meaning of the 500 argument is not visible here;
      # presumably an episode index — confirm against q_learning.py.
      values = learner.get_action(state, 500)
      # The former shift-and-normalise expression computed a value that was
      # never stored, so it was removed; the argmax is taken on the raw values
      # exactly as before.
      action = np.argmax(values)
      state, reward, done, info = env.step(action)
      ep_reward += reward

    # One reset per episode: the loop header above already resets the
    # environment, so the extra reset after a terminal step was dropped.
    rewards.append(ep_reward)

  return np.mean(rewards)

def learner_play3(learner, env, episodes, vote="majority_vote"):
  '''
  Plays `episodes` episodes with a single Q-value learner acting greedily and
  returns the mean episode reward.

  learner:  agent exposing compute_q_values(state) -> per-action Q-values; the
            action with the highest Q-value is played.
  env:      gym-style environment (reset() -> state, step(a) -> 4-tuple).
  episodes: number of episodes to play.
  vote:     unused; kept so the signature matches ensembler_play.
  '''
  rewards = []
  for episode in range(episodes):
    ep_reward = 0
    done = False
    state = env.reset()
    while not done:
      q_values = learner.compute_q_values(state.reshape(1, 4))
      q_values = q_values.reshape((1, 2))[0]
      # Take the argmax of the raw Q-values. Dividing by their sum first (as
      # before) reverses the ranking whenever the sum is negative and yields
      # NaN when it is zero; it also mutated the agent's array in place.
      action = np.argmax(q_values)
      state, reward, done, info = env.step(action)
      ep_reward += reward

    # One reset per episode: the loop header above already resets the
    # environment, so the extra reset after a terminal step was dropped.
    rewards.append(ep_reward)

  return np.mean(rewards)

In [0]:
# Baseline: the REINFORCE agent on its own, 20 runs.
# NOTE(review): 40 episodes per run here versus 100 in the other evaluation
# cells — confirm this is intentional before comparing histograms.
r = []
for run in range(20):
  print(run)  # progress indicator
  r.append(learner_play1(agents[0], env, 40))

plt.figure(figsize=(12,8))
plt.hist(r)
plt.title("REINFORCE agent alone: mean reward per run")
plt.xlabel("Mean episode reward (40 episodes per run)")
plt.ylabel("Number of runs")
plt.show()

In [0]:
# Baseline: the tabular Q-learning agent on its own, 20 runs of 100 episodes.
r = []
for run in range(20):
  # The former `% 100` guard only ever printed run 0 with 20 runs; report every
  # run like the sibling evaluation cells.
  print(run)
  r.append(learner_play2(q_agent, env, 100))

plt.figure(figsize=(12,8))
plt.hist(r)
plt.title("Q-learning agent alone: mean reward per run")
plt.xlabel("Mean episode reward (100 episodes per run)")
plt.ylabel("Number of runs")
plt.show()

In [0]:
# Baseline: the deep SARSA agent on its own, 20 runs of 100 episodes.
r = []
for run in range(20):
  print(run)  # progress indicator
  r.append(learner_play3(sarsa, env, 100))

plt.figure(figsize=(12,8))
plt.hist(r)
plt.title("SARSA agent alone: mean reward per run")
plt.xlabel("Mean episode reward (100 episodes per run)")
plt.ylabel("Number of runs")
plt.show()