In [None]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.7/dist-packages (54.0.0)


In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Start a pyvirtualdisplay virtual display (visible=0, 1400x900) before any
# environment is created.
# NOTE(review): presumably this is what lets gym render frames for video
# recording on a machine without a real screen -- confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of a gym environment and to display the recording.
To enable video, just do "env = wrap_env(env)".
"""

def show_video():
  """Embed the first recorded episode video in the notebook output.

  Looks for ``video/*.mp4`` relative to the current working directory (the
  folder ``wrap_env`` records into). The matches are sorted so the same file
  is chosen on every run. The file is read in binary read-only mode inside a
  ``with`` block, so the handle is always closed and no write permission is
  needed. The bytes are base64-encoded and inlined into an HTML ``<video>``
  element.

  Prints "Could not find video" and returns when no mp4 file exists.
  """
  mp4list = sorted(glob.glob('video/*.mp4'))
  if not mp4list:
    print("Could not find video")
    return

  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  # The whole video travels inside the page as a data: URI, so no file
  # serving is required from the notebook host.
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a gym `Monitor` that records into './video'.

  `force=True` is passed straight through to `Monitor`.
  """
  return Monitor(env, './video', force=True)

  
def query_environment(name):
  """Print basic metadata for the gym environment registered under `name`.

  Creates the environment with `gym.make`, looks up its registry entry with
  `gym.spec`, and prints the action space, observation space, episode step
  limit, nondeterminism flag, reward range and reward threshold.

  The environment exists only for this inspection, so it is closed before
  returning, even if reading one of the attributes raises.
  """
  env = gym.make(name)
  try:
    spec = gym.spec(name)
    print(f"Action Space: {env.action_space}")
    print(f"Observation Space: {env.observation_space}")
    print(f"Max Episode Steps: {spec.max_episode_steps}")
    print(f"Nondeterministic: {spec.nondeterministic}")
    print(f"Reward Range: {env.reward_range}")
    print(f"Reward Threshold: {spec.reward_threshold}")
  finally:
    env.close()

# Print the spaces, step limit and reward metadata of Pendulum-v0.
query_environment("Pendulum-v0")

Action Space: Box(-2.0, 2.0, (1,), float32)
Observation Space: Box(-8.0, 8.0, (3,), float32)
Max Episode Steps: 200
Nondeterministic: False
Reward Range: (-inf, inf)
Reward Threshold: None


In [None]:
class ReplayBuffer:
  """Fixed-size circular store of environment transitions.

  Each slot holds a state, the state that followed it, the action taken, the
  reward, and a "not done" flag (1 while the episode continues, 0 on the
  transition that ended it). Once `mem_size` transitions have been written,
  new ones overwrite the oldest.
  """

  def __init__(self, mem_size, *input_dims, state_size, action_space_size):
    """Allocate zero-filled storage.

    Args:
      mem_size: number of transitions the buffer can hold.
      *input_dims: accepted for call-site compatibility; not used.
      state_size: length of a state vector; states are stored as
        (state_size, 1) float32 columns.
      action_space_size: length of an action vector (stored as int32).
    """
    self.mem_size = mem_size
    self.mem_cntr = 0  # total transitions ever written (not capped at mem_size)
    self.state_size = state_size
    self.action_space_size = action_space_size
    self.state_memory = np.zeros((self.mem_size, self.state_size, 1), dtype=np.float32)
    self.new_state_memory = np.zeros((self.mem_size, self.state_size, 1), dtype=np.float32)
    # NOTE(review): actions are kept as int32, which suits one-hot vectors;
    # a continuous action would be truncated here -- confirm before reuse.
    self.action_memory = np.zeros((self.mem_size, self.action_space_size, ), dtype=np.int32)
    # Rewards are floats so fractional rewards are not truncated on write.
    self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
    self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)
    self.current_states = None  # states returned by the most recent sample call

  def store_transition(self, state, action, reward, state_, done):
    """Write one transition into the next slot, wrapping around when full."""
    index = self.mem_cntr % self.mem_size
    self.state_memory[index] = state
    self.new_state_memory[index] = state_
    self.reward_memory[index] = reward
    self.action_memory[index] = action
    # Stored inverted: 1 means "episode continues", 0 means "episode ended".
    self.terminal_memory[index] = 1 - int(done)
    self.mem_cntr += 1

  def sample_buffer(self, batch_size):
    """Return `batch_size` distinct stored transitions chosen uniformly.

    Returns:
      (states, actions, rewards, next_states, not_done_flags) as arrays
      indexed by sample. Raises ValueError (from numpy) when fewer than
      `batch_size` transitions are stored.
    """
    max_mem = min(self.mem_cntr, self.mem_size)
    batch = np.random.choice(max_mem, batch_size, replace=False)
    states = self.state_memory[batch]
    self.current_states = states
    states_ = self.new_state_memory[batch]
    rewards = self.reward_memory[batch]
    actions = self.action_memory[batch]
    terminal = self.terminal_memory[batch]
    return states, actions, rewards, states_, terminal

  def _valid_window_starts(self, max_mem, distance):
    """Return slot indices where `distance` consecutive slots form one unbroken run."""
    if max_mem < distance:
      return np.empty(0, dtype=np.int64)
    # A slot "breaks" a window when it holds the last transition of an episode.
    breaks = (self.terminal_memory[:max_mem] == 0).astype(np.int64)
    if self.mem_cntr > self.mem_size:
      # After wrap-around the newest slot sits directly before the oldest one,
      # so they are not consecutive in time; treat the newest slot as a break.
      head = self.mem_cntr % self.mem_size
      if head > 0:
        breaks[head - 1] = 1
    # Prefix sums give the number of breaks inside every length-`distance`
    # window in one pass; only windows that fit inside the filled region exist.
    running = np.concatenate(([0], np.cumsum(breaks)))
    breaks_in_window = running[distance:] - running[:-distance]
    return np.flatnonzero(breaks_in_window == 0)

  def sample_buffer_with_distance(self, batch_size, distance):
    """Sample windows of `distance` consecutive transitions from one episode.

    A window is eligible only if all of its slots lie inside the filled part
    of the buffer, none of them ends an episode, and it does not straddle the
    circular write position. This means every index read below is in range.

    Args:
      batch_size: maximum number of windows to return.
      distance: window length in transitions; must be >= 1.

    Returns:
      (states, actions, rewards, states_, terminal) where
        states   -- state at the first slot of each window,
        actions  -- list with one array per window: the element-wise mean of
                    the `distance` stored actions in that window,
        rewards  -- reward at the first slot,
        states_  -- state stored at the last slot (first + distance - 1),
        terminal -- "not done" flag at the first slot.
      Fewer than `batch_size` windows (possibly none) are returned when not
      enough eligible windows exist.

    Raises:
      ValueError: if `distance` is smaller than 1.
    """
    if distance < 1:
      raise ValueError("distance must be >= 1")
    max_mem = min(self.mem_cntr, self.mem_size)
    candidates = self._valid_window_starts(max_mem, distance)
    n_samples = min(batch_size, len(candidates))
    if n_samples > 0:
      batch = np.random.choice(candidates, n_samples, replace=False)
    else:
      batch = np.empty(0, dtype=np.int64)

    # Row r lists the slots of window r: batch[r], batch[r] + 1, ...
    window_slots = batch[:, None] + np.arange(distance)
    actions = list(self.action_memory[window_slots].mean(axis=1))

    states = self.state_memory[batch]
    self.current_states = states
    states_ = self.state_memory[batch + distance - 1]
    rewards = self.reward_memory[batch]
    terminal = self.terminal_memory[batch]
    return states, actions, rewards, states_, terminal


In [None]:
class Agent:
  """Goal-conditioned agent: a network maps (current state, target state) to action probabilities.

  Transitions are kept in a `ReplayBuffer`. Training samples windows of
  consecutive transitions and fits the network to predict the averaged
  action of a window from the window's first and last state. Acting feeds
  the current state together with a target state and picks the most
  probable action, with epsilon-random exploration.
  """

  def __init__(self, env, lr=0.01, state_size=4, action_size=2, optimal_state=[0.0, 0.0], gamma=0.01, epsilon=0.2, batch_size=5, num_epochs=1, epsilon_dec=1e-3, epsilon_end=0.01, mem_size=10000, layer_sizes=[64, 16], max_distance=5):
    """Build the network and the replay memory.

    Args:
      env: environment handle; stored only.
      lr: Adam learning rate.
      state_size: length of a state vector.
      action_size: number of discrete actions (network output width).
      optimal_state: target state fed to the network when acting.
        NOTE(review): the default has 2 entries while `state_size` defaults
        to 4 -- callers presumably always pass a matching list; confirm.
      gamma: stored only; not read inside this class.
      epsilon: probability of taking a uniformly random action.
      batch_size: windows requested per training call, and Keras batch size.
      num_epochs: epochs per training call.
      epsilon_dec, epsilon_end: stored only; not read inside this class.
      mem_size: replay buffer capacity.
      layer_sizes: widths of the two hidden layers.
      max_distance: stored only; `update_network_batched` takes its own value.
    """
    self.env = env
    self.lr = lr
    self.state_size = state_size
    self.action_size = action_size
    # Copy list arguments: the defaults in the signature are shared objects,
    # so storing them directly would let one instance's edits leak into others.
    self.layer_sizes = list(layer_sizes)
    self.max_distance = max_distance
    self.state_and_state_to_action_model = self.create_state_and_state_to_action_model(self.layer_sizes, obs_space_size=self.state_size)
    self.optimal_state = list(optimal_state)
    self.current_state = None
    self.selected_action = None
    self.prev_state = None
    self.gamma = gamma
    self.epsilon = epsilon
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.epsilon_dec = epsilon_dec
    self.epsilon_end = epsilon_end
    self.mem_size = mem_size
    self.memory = ReplayBuffer(mem_size, (self.state_size, 1), state_size=self.state_size, action_space_size=self.action_size)

  def create_state_and_state_to_action_model(self, layer_sizes=[32, 16], obs_space_size=4):
    """Create and compile the two-input network.

    Inputs 'currentStateInput' and 'targetStateInput' (each `obs_space_size`
    wide) are concatenated, passed through two tanh Dense layers of widths
    `layer_sizes[0]` and `layer_sizes[1]`, and mapped to a softmax over
    `self.action_size` actions ('outputAction'). Compiled with Adam at
    `self.lr` and categorical cross-entropy.
    """
    current_state_input = Input(shape=(obs_space_size, ), name='currentStateInput')
    target_state_input = Input(shape=(obs_space_size, ), name='targetStateInput')
    x = tf.keras.layers.concatenate([current_state_input, target_state_input], axis=1)
    hidden_1 = Dense(layer_sizes[0], activation='tanh')(x)
    hidden_2 = Dense(layer_sizes[1], activation='tanh')(hidden_1)
    action = Dense(self.action_size, activation='softmax', name='outputAction')(hidden_2)
    model = tf.keras.Model(
    inputs=[current_state_input, target_state_input],
    outputs=action,
    )
    model.compile(
    optimizer=tf.keras.optimizers.Adam(self.lr),
    loss=tf.keras.losses.CategoricalCrossentropy()
    )
    return model

  def update_network_batched(self, max_distance, randomize):
    """Run one training call on windows sampled from the replay memory.

    Does nothing until at least `batch_size` transitions are stored, or when
    the memory returns no windows.

    Args:
      max_distance: window length; the upper bound when `randomize` is set.
      randomize: when truthy (and `max_distance` >= 2) the window length is
        drawn uniformly from 2..max_distance instead.
    """
    if self.memory.mem_cntr < self.batch_size:
      return
    distance = max_distance
    # With max_distance < 2 there is nothing to draw from; keep it as is.
    if randomize and max_distance >= 2:
      distance = np.random.choice(range(2, max_distance+1))
    states, actions, rewards, states_, done = self.memory.sample_buffer_with_distance(batch_size=self.batch_size, distance=distance)
    if len(actions) == 0:
      # Nothing to learn from; fitting on an empty batch would raise.
      return
    # NOTE(review): states carry a trailing singleton axis from the buffer and
    # the targets get one added here; this relies on Keras accepting that for
    # Input(shape=(obs,)) -- confirm if the TensorFlow version changes.
    self.state_and_state_to_action_model.fit(
    {"currentStateInput": states, "targetStateInput": states_},
    {"outputAction": np.expand_dims(np.array(actions), -1)},
    epochs=self.num_epochs,
    batch_size=self.batch_size,
    verbose=0
    )

  def store_transition(self, state, action, reward, new_state, done):
    """Store one transition, reshaping both states to columns as the buffer expects."""
    self.memory.store_transition(np.expand_dims(state, axis=1), action, reward, np.expand_dims(new_state, axis=1) , done)

  def choose_action_from_the_states(self, currentState, targetState):
    """Pick an action index for `currentState` aiming at `targetState`.

    With probability `self.epsilon` a uniformly random action index is
    returned. Otherwise the network is evaluated on the pair and the index of
    its largest output is returned; the raw prediction is kept in
    `self.selected_action`.

    Args:
      currentState: current observation vector.
      targetState: state to steer towards; `None` falls back to
        `self.optimal_state`.
    """
    if np.random.random() < self.epsilon:
      action = np.random.choice(self.action_size)
    else:
      if targetState is None:
        targetState = self.optimal_state
      state = np.array([currentState])
      target = np.expand_dims(np.asarray(targetState, dtype=np.float32), axis=0)
      predicted_action = self.state_and_state_to_action_model([{"targetStateInput": target, "currentStateInput": state}])
      self.selected_action = predicted_action
      action = np.argmax(predicted_action[0])
    return action

  def to_one_hot(self, action_index):
    """Return a length-`action_size` vector that is 1 at `action_index` and 0 elsewhere."""
    vec = np.zeros((self.action_size,))
    vec[action_index] = 1
    return vec

In [None]:
# Environment under test. Alternatives kept for quick switching:
#   env = wrap_env(gym.make("CartPole-v1"))   # same task, recorded to ./video
#   env = gym.make("MountainCar-v0")
env = gym.make("CartPole-v1")

# Knobs worth tuning: num_games, batch_size, activation functions, distance, epsilon.
num_games = 100
max_distance = 5
batch_size = 16
num_epochs = 1

# CartPole settings.
# Observation layout: 0 cart position, 1 cart velocity, 2 angle, 3 velocity at tip.
state_size = 4
action_size = 2
optimal_state = [0.0, 0.0, 0.0, 0.0]

# MountainCar settings (swap in together with the environment above):
#   state_size = 2
#   action_size = 3
#   optimal_state = [-1.2, -0.07]

agent = Agent(
    env,
    batch_size=batch_size,
    state_size=state_size,
    action_size=action_size,
    optimal_state=optimal_state,
    num_epochs=num_epochs,
    lr=0.01,
    epsilon=0.05,
    epsilon_dec=0.01,
    epsilon_end=0.1,
    layer_sizes=[64, 64],
)

# Per-episode totals, filled in as the games are played.
rewards = [0.0] * num_games
steps = [0] * num_games

for index in range(num_games):
  current_state = env.reset()
  done = False
  while not done:
    # env.render()  # uncomment to watch the episode
    agent.prev_state = current_state
    selected_action_index = agent.choose_action_from_the_states(current_state, agent.optimal_state)
    agent.selected_action = agent.to_one_hot(selected_action_index)
    new_state, reward, done, info = env.step(selected_action_index)
    rewards[index] += reward
    agent.current_state = new_state
    agent.store_transition(current_state, agent.selected_action, reward, new_state, done)
    current_state = new_state
    steps[index] += 1

  # Linear epsilon decay, applied once per finished episode.
  # NOTE(review): with epsilon=0.05, epsilon_end=0.1 and epsilon_dec=0.01 this
  # condition (0.05 >= 0.11) is never true, so epsilon stays at 0.05.
  if agent.epsilon >= agent.epsilon_end + agent.epsilon_dec:
    agent.epsilon -= agent.epsilon_dec

  # One training call per episode, always with the full window length.
  agent.update_network_batched(max_distance, randomize=False)

# Summary: mean reward, median reward, then the raw per-episode totals.
# print(sum(steps) / num_games)
print(sum(rewards) / num_games)
print(np.median(rewards))
print(rewards)
env.close()
# show_video()  # only useful when the environment was wrapped with wrap_env



142.53
110.5
[62.0, 35.0, 63.0, 104.0, 101.0, 104.0, 119.0, 66.0, 46.0, 89.0, 135.0, 142.0, 128.0, 92.0, 49.0, 83.0, 50.0, 113.0, 68.0, 78.0, 99.0, 129.0, 157.0, 253.0, 271.0, 313.0, 254.0, 318.0, 142.0, 138.0, 104.0, 121.0, 148.0, 176.0, 74.0, 96.0, 94.0, 107.0, 114.0, 106.0, 128.0, 148.0, 261.0, 110.0, 62.0, 50.0, 60.0, 67.0, 96.0, 65.0, 57.0, 43.0, 89.0, 126.0, 95.0, 52.0, 78.0, 55.0, 96.0, 95.0, 92.0, 113.0, 305.0, 209.0, 72.0, 52.0, 77.0, 100.0, 81.0, 88.0, 216.0, 500.0, 500.0, 500.0, 302.0, 393.0, 237.0, 151.0, 138.0, 212.0, 150.0, 161.0, 152.0, 307.0, 165.0, 320.0, 156.0, 158.0, 142.0, 144.0, 111.0, 85.0, 94.0, 92.0, 88.0, 129.0, 104.0, 229.0, 221.0, 133.0]
