In [None]:
# Environment setup: install gym, a virtual-display wrapper and the system
# packages used for off-screen rendering and video encoding. Most commands
# discard their output via "> /dev/null 2>&1".
# NOTE(review): no versions are pinned. Later cells import
# gym.wrappers.Monitor and unpack env.step() into four values, so confirm the
# gym release that gets installed still provides both.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# Refresh the apt index, then install build tooling and the Atari extras.
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
# The setuptools upgrade and the scipy install below keep their output visible.
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install scipy

Collecting setuptools
[?25l  Downloading https://files.pythonhosted.org/packages/ed/16/e9f5c5b86696da09298ea10c32d68ad8ea21f888e45b11aa9e615adda6c9/setuptools-49.2.1-py3-none-any.whl (789kB)
[K     |████████████████████████████████| 798kB 4.6MB/s 
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[?25hInstalling collected packages: setuptools
  Found existing installation: setuptools 49.2.0
    Uninstalling setuptools-49.2.0:
      Successfully uninstalled setuptools-49.2.0
Successfully installed setuptools-49.2.1




In [None]:
import gym
import numpy as np
import scipy
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Create and start a pyvirtualdisplay Display (visible=0, 1400x900).
# NOTE(review): presumably needed so gym can render without a physical
# screen; the name `display` is distinct from the `ipythondisplay` alias above.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """Embed a recorded episode video in the notebook output.

  Looks for ``*.mp4`` files under the ``video`` directory (relative to the
  current working directory) and displays the first one, in sorted name
  order, as an inline base64-encoded HTML5 ``<video>`` element.

  Prints "Could not find video" and returns when no recording exists.
  """
  # glob returns files in arbitrary order; sort so the choice is deterministic.
  mp4list = sorted(glob.glob('video/*.mp4'))
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is sufficient, and the context manager closes the
  # handle instead of leaking it.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return ``env`` wrapped in a gym ``Monitor`` that records into ``./video``.

  The wrapper is created with ``force=True``.
  """
  return Monitor(env, './video', force=True)

  
def query_environment(name):
  """Print basic facts about the registered gym environment ``name``.

  Reports the action space, observation space and reward range of a freshly
  created environment, plus the step limit, nondeterminism flag and reward
  threshold from its registry spec.

  Args:
    name: Environment id understood by ``gym.make`` / ``gym.spec``.
  """
  env = gym.make(name)
  try:
    spec = gym.spec(name)
    print(f"Action Space: {env.action_space}")
    print(f"Observation Space: {env.observation_space}")
    print(f"Max Episode Steps: {spec.max_episode_steps}")
    print(f"Nondeterministic: {spec.nondeterministic}")
    print(f"Reward Range: {env.reward_range}")
    print(f"Reward Threshold: {spec.reward_threshold}")
  finally:
    # The environment is only needed for inspection; release it even if one
    # of the lookups above fails.
    env.close()

# Print the spaces and registry spec of the Pendulum-v0 environment.
query_environment("Pendulum-v0")

Action Space: Box(1,)
Observation Space: Box(3,)
Max Episode Steps: 200
Nondeterministic: False
Reward Range: (-inf, inf)
Reward Threshold: None


In [None]:
class ReplayBuffer:
  """Fixed-capacity circular buffer of environment transitions.

  Transitions are written in arrival order, so consecutive slots hold
  consecutive time steps until the write position wraps around. The
  ``*_with_distance`` samplers rely on that ordering to read short
  multi-step windows.

  ``terminal_memory`` stores ``1 - done``: a value of 0 marks the last
  transition of an episode, 1 marks a non-terminal transition.
  """

  def __init__(self, mem_size, *input_dims, state_size, action_space_size):
    """Allocate storage for ``mem_size`` transitions.

    Args:
      mem_size: Maximum number of transitions kept.
      *input_dims: Accepted for backward compatibility; not used.
      state_size: Number of features in a state. States are stored with
        shape ``(state_size, 1)``.
      action_space_size: Length of a stored action vector.
    """
    self.mem_size = mem_size
    self.mem_cntr = 0
    self.state_size = state_size
    self.action_space_size = action_space_size
    self.state_memory = np.zeros((self.mem_size, self.state_size, 1), dtype=np.float32)
    self.new_state_memory = np.zeros((self.mem_size, self.state_size, 1), dtype=np.float32)
    self.action_memory = np.zeros((self.mem_size, self.action_space_size, ), dtype=np.float32)
    # Rewards are real-valued in general; an integer array would silently
    # truncate them on assignment.
    self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
    self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)
    self.current_states = None

  def store_transition(self, state, action, reward, state_, done):
    """Write one transition, overwriting the oldest slot once full."""
    index = self.mem_cntr % self.mem_size
    self.state_memory[index] = state
    self.new_state_memory[index] = state_
    self.reward_memory[index] = reward
    self.action_memory[index] = action
    self.terminal_memory[index] = 1 - int(done)
    self.mem_cntr += 1

  def sample_buffer(self, batch_size):
    """Sample ``batch_size`` distinct single-step transitions.

    Returns:
      ``(states, actions, rewards, next_states, not_done)`` arrays. The
      sampled states are also kept in ``self.current_states``.

    Raises:
      ValueError: (from numpy) if fewer than ``batch_size`` transitions are
        stored.
    """
    max_mem = min(self.mem_cntr, self.mem_size)
    batch = np.random.choice(max_mem, batch_size, replace=False)
    states = self.state_memory[batch]
    self.current_states = states
    states_ = self.new_state_memory[batch]
    rewards = self.reward_memory[batch]
    actions = self.action_memory[batch]
    terminal = self.terminal_memory[batch]
    return states, actions, rewards, states_, terminal

  def _valid_window_starts(self, distance):
    """Return every slot index that starts a usable ``distance``-step window.

    A window ``[start, start + distance - 1]`` is usable when it lies inside
    the filled part of the buffer, none of its transitions except the last
    ends an episode, and it does not straddle the write position of a
    wrapped buffer (where the newest and oldest transitions are adjacent).
    """
    max_mem = min(self.mem_cntr, self.mem_size)
    n_starts = max_mem - distance + 1
    if distance < 1 or n_starts <= 0:
      return np.empty(0, dtype=np.int64)

    starts = np.arange(n_starts)
    valid = np.ones(n_starts, dtype=bool)

    # terminal_memory holds 1 - done, so 0 marks an episode's last transition.
    ends_episode = self.terminal_memory[:max_mem] == 0
    for offset in range(distance - 1):
      valid &= ~ends_episode[starts + offset]

    if self.mem_cntr > self.mem_size:
      head = self.mem_cntr % self.mem_size
      if head > 0:
        # Slots head - 1 (newest) and head (oldest) are not consecutive in time.
        valid &= ~((starts < head) & (starts + distance - 1 >= head))

    return starts[valid]

  def _sample_window_starts(self, batch_size, distance):
    """Draw up to ``batch_size`` distinct usable window starts.

    Fewer than ``batch_size`` indices are returned when not enough usable
    windows exist.

    Raises:
      ValueError: if no usable window exists at all.
    """
    candidates = self._valid_window_starts(distance)
    if candidates.size == 0:
      raise ValueError(
          f"No contiguous window of {distance} transitions is available in the replay buffer")
    size = min(batch_size, candidates.size)
    return np.random.choice(candidates, size, replace=False)

  def sample_buffer_with_distance(self, batch_size, distance):
    """Sample windows of ``distance`` steps with their averaged action.

    Returns:
      ``(states, actions, rewards, states_, not_done)`` where ``states`` are
      the window start states, ``actions`` is a list holding the mean of the
      ``distance`` action vectors of each window, ``states_`` are the states
      reached after the last action of each window, and ``rewards`` /
      ``not_done`` belong to the first transition of each window.

    Raises:
      ValueError: if no usable window exists.
    """
    batch = self._sample_window_starts(batch_size, distance)
    windows = batch[:, None] + np.arange(distance)
    actions = list(self.action_memory[windows].mean(axis=1, dtype=np.float64))
    states = self.state_memory[batch]
    self.current_states = states
    # The state produced by the window's last action lives in
    # new_state_memory; state_memory at that slot is the state *before* it.
    states_ = self.new_state_memory[batch + distance - 1]
    rewards = self.reward_memory[batch]
    terminal = self.terminal_memory[batch]
    return states, actions, rewards, states_, terminal

  def sample_buffer_with_distance_for_recurrent(self, batch_size, distance):
    """Sample windows of ``distance`` steps with their full action sequence.

    Returns:
      An 8-tuple ``(states, actions, rewards, states_, not_done,
      one_step_states, one_step_next_states, one_step_actions)``.
      ``actions`` has shape ``(batch, distance * action_space_size)`` and
      concatenates the window's action vectors in time order; ``states_``
      are the states reached after the last action. The last three entries
      describe the first transition of each window on its own.

    Raises:
      ValueError: if no usable window exists.
    """
    batch = self._sample_window_starts(batch_size, distance)
    windows = batch[:, None] + np.arange(distance)
    states = self.state_memory[batch]
    self.current_states = states
    actions = np.reshape(self.action_memory[windows], (len(batch), distance * self.action_space_size,))
    states_ = self.new_state_memory[batch + distance - 1]
    rewards = self.reward_memory[batch]
    terminal = self.terminal_memory[batch]

    states_for_other_network = self.state_memory[batch]
    # new_state_memory is the true successor even when the transition ends an
    # episode; the following slot would hold the next episode's first state.
    next_states_for_other_network = self.new_state_memory[batch]
    actions_for_other_network = self.action_memory[batch]

    return states, actions, rewards, states_, terminal, states_for_other_network, next_states_for_other_network, actions_for_other_network


In [None]:
class Agent:
  """Goal-conditioned agent built from three small Keras models.

  * ``action_and_state_to_state_model`` predicts the next state from an
    action vector and the current state.
  * ``state_and_state_to_action_model`` predicts a single action
    distribution from the current state and a target state.
  * ``state_and_state_to_action_recurrent_model`` predicts ``max_distance``
    consecutive action vectors (flattened) from the same two inputs.

  Experience is kept in a ``ReplayBuffer`` and actions are chosen
  epsilon-greedily.
  """

  def __init__(self, env, lr=0.01, state_size=4, action_size=2, optimal_state=[0.0, 0.0], gamma=0.01, epsilon=0.2, batch_size=5, num_epochs=1, epsilon_dec=1e-3, epsilon_end=0.01, mem_size=10000, layer_sizes=[64, 16], max_distance=5):
    """Build the models and the replay buffer.

    Args:
      env: Environment the agent acts in; stored as ``self.env``.
      lr: Adam learning rate used by all three models.
      state_size: Number of features in an observation.
      action_size: Number of discrete actions.
      optimal_state: Target state handed to the action models.
      gamma: Stored on the instance; not used by this class.
      epsilon: Probability of choosing random actions.
      batch_size: Number of samples per training update.
      num_epochs: Epochs per ``fit`` call.
      epsilon_dec: Stored for the training loop; not used by this class.
      epsilon_end: Stored for the training loop; not used by this class.
      mem_size: Replay buffer capacity.
      layer_sizes: Widths of the first two hidden layers of the action models.
      max_distance: Number of actions the recurrent model outputs at once.
    """
    self.env = env
    self.lr = lr
    self.state_size = state_size
    self.action_size = action_size
    # Copy the list arguments so the shared default objects are never aliased.
    self.layer_sizes = list(layer_sizes)
    self.max_distance = max_distance
    # The forward model must match the configured observation size rather
    # than a fixed width of 4.
    self.action_and_state_to_state_model = self.create_action_and_state_to_state_model(layer_sizes=[64, 64], obs_space_size=self.state_size)
    self.state_and_state_to_action_model = self.create_state_and_state_to_action_model(self.layer_sizes, obs_space_size=self.state_size)
    self.state_and_state_to_action_recurrent_model = self.create_state_and_state_to_action_recurrent_model(self.layer_sizes, obs_space_size=self.state_size, time_distance=self.max_distance)
    self.optimal_state = list(optimal_state)
    self.current_state = None
    self.selected_action = None
    self.prev_state = None
    self.gamma = gamma
    self.epsilon = epsilon
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.epsilon_dec = epsilon_dec
    self.epsilon_end = epsilon_end
    self.mem_size = mem_size
    self.memory = ReplayBuffer(mem_size, (self.state_size, 1), state_size=self.state_size, action_space_size=self.action_size)

  def create_action_and_state_to_state_model(self, layer_sizes=[32, 16], obs_space_size=4):
    """Build the forward model: (action, state) -> next state, trained with MSE."""
    action_input = Input(shape=(self.action_size, ), name='actionInput')
    state_input = Input(shape=(obs_space_size, ), name='stateInput')
    x = tf.keras.layers.concatenate([action_input, state_input], axis=1)
    hidden_1 = Dense(layer_sizes[0], activation='tanh')(x)
    hidden_2 = Dense(layer_sizes[1], activation='tanh')(hidden_1)
    out = Dense(obs_space_size, activation='linear', name='outputState')(hidden_2)
    model = tf.keras.Model(
        inputs=[action_input, state_input],
        outputs=out,
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(self.lr),
        loss=tf.keras.losses.MeanSquaredError()
    )
    return model

  def create_state_and_state_to_action_model(self, layer_sizes=[32, 16], obs_space_size=4):
    """Build the single-step model: (current state, target state) -> softmax over actions."""
    current_state_input = Input(shape=(obs_space_size, ), name='currentStateInput')
    target_state_input = Input(shape=(obs_space_size, ), name='targetStateInput')
    x = tf.keras.layers.concatenate([current_state_input, target_state_input], axis=1)
    hidden_1 = Dense(layer_sizes[0], activation='tanh')(x)
    hidden_2 = Dense(layer_sizes[1], activation='tanh')(hidden_1)
    action = Dense(self.action_size, activation='softmax', name='outputAction')(hidden_2)
    model = tf.keras.Model(
        inputs=[current_state_input, target_state_input],
        outputs=action,
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(self.lr),
        loss=tf.keras.losses.CategoricalCrossentropy()
    )
    return model

  def create_state_and_state_to_action_recurrent_model(self, layer_sizes=[32, 16], obs_space_size=4, time_distance=5):
    """Build the multi-step model.

    Maps (current state, target state) to ``time_distance * action_size``
    sigmoid outputs, i.e. one action vector per step, trained with MAE.
    """
    current_state_input = Input(shape=(obs_space_size, ), name='currentStateInput')
    target_state_input = Input(shape=(obs_space_size, ), name='targetStateInput')
    x = tf.keras.layers.concatenate([current_state_input, target_state_input], axis=1)
    hidden_1 = Dense(layer_sizes[0], activation='tanh')(x)
    hidden_2 = Dense(layer_sizes[1], activation='tanh')(hidden_1)
    hidden_3 = Dense(32, activation='tanh')(hidden_2)
    action = Dense(time_distance*self.action_size, activation='sigmoid', name='outputAction')(hidden_3)
    model = tf.keras.Model(
        inputs=[current_state_input, target_state_input],
        outputs=action,
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(self.lr),
        loss=tf.keras.losses.MeanAbsoluteError()
    )
    return model

  def _resolve_distance(self, max_distance, randomize):
    """Return the window length: ``max_distance``, or a random value in [2, max_distance]."""
    if randomize and max_distance >= 2:
      return int(np.random.choice(range(2, max_distance + 1)))
    return max_distance

  def _has_enough_samples(self, distance):
    """Tell whether the buffer can supply ``batch_size`` windows of ``distance`` steps."""
    stored = min(self.memory.mem_cntr, self.memory.mem_size)
    return stored - distance - 1 >= self.batch_size

  @staticmethod
  def _as_batch(values):
    """Convert sampled data to a float32 array of shape ``(batch, features)``.

    The buffer stores states as column vectors ``(state_size, 1)``; the
    models take flat ``(state_size,)`` rows.
    """
    array = np.asarray(values, dtype=np.float32)
    return array.reshape(len(array), -1)

  def update_network_batched(self, max_distance, randomize):
    """Run one training update of the single-step action model.

    Does nothing until the buffer holds enough transitions for a full batch
    of windows.

    Args:
      max_distance: Window length, or its upper bound when ``randomize``.
      randomize: If true, draw the window length from [2, max_distance].
    """
    distance = self._resolve_distance(max_distance, randomize)
    if not self._has_enough_samples(distance):
      return
    states, actions, rewards, states_, done = self.memory.sample_buffer_with_distance(batch_size=self.batch_size, distance=distance)
    self.state_and_state_to_action_model.fit(
        {"currentStateInput": self._as_batch(states), "targetStateInput": self._as_batch(states_)},
        {"outputAction": self._as_batch(actions)},
        epochs=self.num_epochs,
        batch_size=self.batch_size,
        verbose=0
    )

  def update_network_batched_recurrent(self, max_distance, randomize):
    """Run one training update of the forward model and the multi-step model.

    Does nothing until the buffer holds enough transitions for a full batch
    of windows.

    Args:
      max_distance: Window length, or its upper bound when ``randomize``.
      randomize: If true, draw the window length from [2, max_distance].

    Raises:
      ValueError: if the resulting window length differs from the
        ``max_distance`` the multi-step model was built with; its output
        size is fixed, so such targets cannot be fitted.
    """
    distance = self._resolve_distance(max_distance, randomize)
    if distance != self.max_distance:
      raise ValueError(
          f"The recurrent model outputs {self.max_distance} actions per sample; "
          f"cannot train it on windows of {distance} steps")
    if not self._has_enough_samples(distance):
      return
    states, actions, rewards, states_, done, states_for_other_network, next_states_for_other_network, actions_for_other_network = self.memory.sample_buffer_with_distance_for_recurrent(batch_size=self.batch_size, distance=distance)

    self.action_and_state_to_state_model.fit(
        {"actionInput": self._as_batch(actions_for_other_network), "stateInput": self._as_batch(states_for_other_network)},
        {"outputState": self._as_batch(next_states_for_other_network)},
        epochs=self.num_epochs,
        batch_size=self.batch_size,
        verbose=0
    )

    self.state_and_state_to_action_recurrent_model.fit(
        {"currentStateInput": self._as_batch(states), "targetStateInput": self._as_batch(states_)},
        {"outputAction": self._as_batch(actions)},
        epochs=self.num_epochs,
        batch_size=self.batch_size,
        verbose=0
    )

  def store_transition(self, state, action, reward, new_state, done):
    """Store one transition; states are converted to column vectors for the buffer."""
    self.memory.store_transition(np.expand_dims(state, axis=1), action, reward, np.expand_dims(new_state, axis=1) , done)

  def _model_inputs(self, currentState, targetState):
    """Build the ``[current, target]`` input list (batch of one) for the action models."""
    # Honour the caller's target; fall back to the configured optimum.
    target = self.optimal_state if targetState is None else targetState
    current_batch = np.expand_dims(np.asarray(currentState, dtype=np.float32), axis=0)
    target_batch = np.expand_dims(np.asarray(target, dtype=np.float32), axis=0)
    # Positional list in the order the models declare their inputs.
    return [current_batch, target_batch]

  def choose_action_from_the_states(self, currentState, targetState):
    """Pick one action index epsilon-greedily.

    Args:
      currentState: Current observation.
      targetState: State to steer towards; ``None`` means ``self.optimal_state``.

    Returns:
      The chosen action index.
    """
    if np.random.random() < self.epsilon:
      action = np.random.choice(self.action_size)
    else:
      predicted_action = self.state_and_state_to_action_model(self._model_inputs(currentState, targetState))
      self.selected_action = predicted_action
      action = np.argmax(predicted_action[0])
    return action

  def choose_actions_from_the_states(self, currentState, targetState, distance):
    """Pick ``distance`` consecutive action indices epsilon-greedily.

    With probability ``epsilon`` all actions are random; otherwise the
    multi-step model's output is split into ``distance`` action vectors and
    the arg-max of each is taken.

    Args:
      currentState: Current observation.
      targetState: State to steer towards; ``None`` means ``self.optimal_state``.
      distance: Number of actions to return. For the model branch this must
        equal the ``max_distance`` the model was built with.

    Returns:
      A list of ``distance`` action indices.
    """
    if np.random.random() < self.epsilon:
      return [np.random.choice(self.action_size) for _ in range(distance)]
    predicted_actions = self.state_and_state_to_action_recurrent_model(self._model_inputs(currentState, targetState))
    per_step = np.reshape(np.array(predicted_actions), (distance, self.action_size))
    return list(np.argmax(per_step, axis=1))

  def to_one_hot(self, action_index):
    """Return a length-``action_size`` vector with a 1 at ``action_index``."""
    vec = np.zeros((self.action_size,))
    vec[action_index] = 1
    return vec

In [None]:
import math
# Experiment 1: train the multi-step ("recurrent") action model on CartPole.
# The agent plans max_distance actions at a time, executes them, and the
# networks are updated once after every episode.
#env = wrap_env(gym.make("CartPole-v1"))
env = gym.make("CartPole-v1")
#env = gym.make("MountainCar-v0")
#To tune: num_games, batch_size, activation functions, distance, epsilon

num_games = 500      # number of episodes to play
max_distance = 3     # actions planned per model call
batch_size = 16      # samples per training update
num_epochs = 1       # epochs per fit() call

#CartPole
state_size = 4
action_size = 2
optimal_state = [0.0, 0.0, 0.0, 0.0]  # target state passed to the agent


#MountainCar
# state_size = 2
# action_size = 3
# optimal_state = [-1.2, -0.07]

agent = Agent(env, batch_size=batch_size, state_size = state_size, action_size=action_size, optimal_state=optimal_state,
              num_epochs=num_epochs, lr=0.0001, epsilon=0.20, epsilon_dec=0.90, epsilon_end=0.10, layer_sizes=[128, 128], max_distance=max_distance)
# Per-episode accumulators: total reward and number of environment steps.
rewards = [0.0 for _ in range(num_games)]
steps = [0 for _ in range(num_games)]
# state -> array of 4: 0 cart position, 1 cart velocity, 2 angle, 3 velocity at tip
for index in range(num_games):
  current_state = env.reset()
  # if index < 50:
  #   env.state = np.array(optimal_state)
  #env.state = np.array([0.0, 0.0, 0.0, 0.0])
  should_reset = False  # set once the environment reports done
  while True:
      #env.render()
      agent.prev_state = current_state
      # Ask for a plan of max_distance action indices from the current state.
      selected_actions_array = agent.choose_actions_from_the_states(current_state, agent.optimal_state, max_distance)
      for selected_action_index in selected_actions_array:
        # The replay buffer stores actions one-hot encoded.
        agent.selected_action = agent.to_one_hot(selected_action_index)
        new_state, reward, done, info = env.step(selected_action_index)
        rewards[index] = rewards[index] + reward
        agent.current_state = new_state
        agent.store_transition( current_state, agent.selected_action, reward, new_state, done)
        current_state = new_state
        steps[index] += 1
        if done:
          # Abandon the rest of the plan and leave the episode loop.
          should_reset = True 
          break;
        # Multiplicative epsilon decay, applied per environment step.
        # NOTE(review): the bound is checked before multiplying, so epsilon
        # can finish slightly below epsilon_end.
        if agent.epsilon >= agent.epsilon_end:
          agent.epsilon = agent.epsilon * agent.epsilon_dec
      if should_reset:
        break
  
  # One training update per finished episode.
  agent.update_network_batched_recurrent(max_distance, randomize=False)
  

#print(sum(steps) / num_games)
#print(sum(rewards[math.ceil(len(rewards) / 2):]) / num_games)  
# Report the median and the raw values of the second half of the episodes.
print(np.median(rewards[math.ceil(len(rewards) / 2):]))   
print(rewards[math.ceil(len(rewards) / 2):])      
env.close()
#show_video()

48.5
[27.0, 29.0, 20.0, 79.0, 40.0, 55.0, 34.0, 12.0, 47.0, 57.0, 48.0, 46.0, 39.0, 96.0, 42.0, 63.0, 46.0, 54.0, 43.0, 41.0, 84.0, 63.0, 65.0, 30.0, 45.0, 65.0, 87.0, 34.0, 42.0, 47.0, 49.0, 64.0, 37.0, 42.0, 32.0, 65.0, 78.0, 45.0, 64.0, 59.0, 50.0, 31.0, 71.0, 47.0, 57.0, 52.0, 59.0, 41.0, 40.0, 110.0, 35.0, 40.0, 41.0, 22.0, 40.0, 23.0, 50.0, 113.0, 52.0, 66.0, 36.0, 61.0, 62.0, 48.0, 78.0, 29.0, 53.0, 58.0, 25.0, 35.0, 76.0, 47.0, 42.0, 28.0, 73.0, 71.0, 54.0, 56.0, 71.0, 102.0, 34.0, 80.0, 41.0, 43.0, 47.0, 30.0, 87.0, 40.0, 57.0, 83.0, 40.0, 70.0, 34.0, 60.0, 48.0, 49.0, 38.0, 24.0, 62.0, 28.0, 60.0, 34.0, 29.0, 28.0, 34.0, 18.0, 55.0, 60.0, 50.0, 31.0, 76.0, 38.0, 58.0, 55.0, 46.0, 45.0, 46.0, 51.0, 82.0, 40.0, 98.0, 38.0, 52.0, 72.0, 40.0, 77.0, 50.0, 27.0, 128.0, 46.0, 54.0, 55.0, 66.0, 40.0, 25.0, 66.0, 57.0, 70.0, 47.0, 60.0, 65.0, 35.0, 69.0, 127.0, 110.0, 38.0, 107.0, 51.0, 60.0, 40.0, 28.0, 55.0, 40.0, 55.0, 21.0, 30.0, 55.0, 86.0, 77.0, 102.0, 23.0, 47.0, 40.0, 61.0, 68

In [None]:
from scipy.spatial import distance
# Experiment 2: same task, but the plan is re-computed whenever the forward
# model's prediction disagrees with the observed next state (see step()).
#env = wrap_env(gym.make("CartPole-v1"))
env = gym.make("CartPole-v1")
#env = gym.make("MountainCar-v0")
#To tune: num_games, batch_size, activation functions, distance, epsilon

num_games = 100      # number of episodes to play
max_distance = 4     # actions planned per model call
batch_size = 16      # samples per training update
num_epochs = 1       # epochs per fit() call

#CartPole
state_size = 4
action_size = 2
optimal_state = [0.0, 0.0, 0.0, 0.0]  # target state passed to the agent

#MountainCar
# state_size = 2
# action_size = 3
# optimal_state = [-1.2, -0.07]

def step(agent, current_state, should_reset, current_index, rewards, steps, step_depth, similarity_threshold=0.85, max_replan_depth=2):
  """Execute one plan of actions, re-planning when the forward model is wrong.

  The agent proposes ``agent.max_distance`` actions from ``current_state``.
  Each action is executed in ``agent.env`` and the resulting transition is
  recorded exactly once (reward, step count, replay buffer, epsilon decay).
  When the observed next state and the forward model's prediction have a
  cosine similarity below ``similarity_threshold`` and the re-planning budget
  is not exhausted, a fresh plan is executed recursively from the new state
  before the remaining actions of the current plan continue.

  Args:
    agent: Object providing ``env``, ``max_distance``, ``optimal_state``,
      ``epsilon`` / ``epsilon_end`` / ``epsilon_dec``, the forward model
      ``action_and_state_to_state_model`` and the methods
      ``choose_actions_from_the_states``, ``to_one_hot``, ``store_transition``.
    current_state: Observation the plan starts from.
    should_reset: If true, nothing is executed and True is returned.
    current_index: Episode index into ``rewards`` and ``steps``.
    rewards: Per-episode reward totals, updated in place.
    steps: Per-episode step counts, updated in place.
    step_depth: Re-planning budget already used by the callers.
    similarity_threshold: Cosine similarity below which a prediction counts
      as wrong.
    max_replan_depth: No re-planning happens once ``step_depth`` reaches this.

  Returns:
    True when the episode ended (or ``should_reset`` was set), else False.
    After the call ``agent.current_state`` holds the latest observation.
  """
  if should_reset:
    return True
  agent.prev_state = current_state
  planned_actions = agent.choose_actions_from_the_states(current_state, agent.optimal_state, agent.max_distance)
  for selected_action_index in planned_actions:
    agent.selected_action = agent.to_one_hot(selected_action_index)
    # Predict from the state the action is actually taken in, not from the
    # state the plan was made in. Inputs follow the model's declared order.
    predicted_next_state = agent.action_and_state_to_state_model(
        [np.array([agent.selected_action]), np.array([current_state])])
    new_state, reward, done, info = agent.env.step(selected_action_index)

    # Record every transition once, terminal ones included, and before any
    # re-planning so the replay buffer stays in time order.
    rewards[current_index] = rewards[current_index] + reward
    steps[current_index] += 1
    agent.store_transition(current_state, agent.selected_action, reward, new_state, done)
    agent.current_state = new_state
    current_state = new_state
    if done:
      return True
    if agent.epsilon >= agent.epsilon_end:
      agent.epsilon = agent.epsilon * agent.epsilon_dec

    if step_depth < max_replan_depth and _prediction_diverged(new_state, predicted_next_state, similarity_threshold):
      step_depth += 1
      if step(agent, new_state, should_reset, current_index, rewards, steps, step_depth, similarity_threshold, max_replan_depth):
        return True
      # The nested plan advanced the environment; resume from where it ended.
      current_state = agent.current_state
  return False


def _prediction_diverged(actual_state, predicted_state, threshold=0.85):
  """Tell whether two state vectors have cosine similarity below ``threshold``.

  Both inputs are flattened first. If either vector has zero length the
  similarity is undefined and the prediction is not treated as diverged.
  """
  actual = np.asarray(actual_state, dtype=np.float64).ravel()
  predicted = np.asarray(predicted_state, dtype=np.float64).ravel()
  scale = np.linalg.norm(actual) * np.linalg.norm(predicted)
  if scale == 0.0:
    return False
  return float(np.dot(actual, predicted) / scale) < threshold


# NOTE(review): epsilon (0.05) starts below epsilon_end (0.10), so the decay
# condition `epsilon >= epsilon_end` is never true in this run.
agent = Agent(env, batch_size=batch_size, state_size = state_size, action_size=action_size, optimal_state=optimal_state,
              num_epochs=num_epochs, lr=0.00001, epsilon=0.05, epsilon_dec=0.90, epsilon_end=0.10, layer_sizes=[64, 64], max_distance=max_distance)
# Per-episode accumulators: total reward and number of environment steps.
rewards = [0.0 for _ in range(num_games)]
steps = [0 for _ in range(num_games)]
# state -> array of 4: 0 cart position, 1 cart velocity, 2 angle, 3 velocity at tip
for index in range(num_games):
  current_state = env.reset()
  # step() cannot rebind this cell's `current_state`; it publishes the latest
  # observation through agent.current_state instead. Seed that attribute so
  # every plan starts from the environment's real state rather than from the
  # episode's first observation.
  agent.current_state = current_state
  should_reset = False
  while True:
      #env.render()
      step_depth = 0
      should_break = step(agent, agent.current_state, should_reset, index, rewards, steps, step_depth)
      if should_break or should_reset:
        break

  # One training update per finished episode.
  agent.update_network_batched_recurrent(max_distance, randomize=False)


#print(sum(steps) / num_games)
# Mean, median and raw per-episode rewards over all episodes.
print(sum(rewards) / num_games)
print(np.median(rewards))
print(rewards)
env.close()
#show_video()

44.59
39.0
[97.0, 49.0, 112.0, 21.0, 27.0, 36.0, 19.0, 19.0, 49.0, 23.0, 22.0, 39.0, 50.0, 26.0, 81.0, 20.0, 22.0, 24.0, 31.0, 23.0, 14.0, 43.0, 22.0, 49.0, 39.0, 58.0, 60.0, 24.0, 26.0, 128.0, 84.0, 50.0, 40.0, 20.0, 23.0, 42.0, 58.0, 74.0, 34.0, 91.0, 52.0, 50.0, 49.0, 32.0, 51.0, 38.0, 18.0, 71.0, 22.0, 49.0, 45.0, 41.0, 77.0, 22.0, 26.0, 38.0, 30.0, 77.0, 61.0, 22.0, 71.0, 62.0, 47.0, 26.0, 39.0, 34.0, 62.0, 21.0, 46.0, 26.0, 99.0, 39.0, 58.0, 23.0, 47.0, 40.0, 22.0, 26.0, 144.0, 40.0, 117.0, 21.0, 20.0, 27.0, 23.0, 55.0, 23.0, 113.0, 22.0, 40.0, 12.0, 22.0, 94.0, 46.0, 22.0, 14.0, 75.0, 37.0, 24.0, 40.0]
