# **First project - Reinforcement Learning**

In this assignment i will get to know and experiment with OpenAI Gym, a testbed for reinforcement learning algorithms containing environments in different difficulty levels. \
I will implement 2 algorithms: \
1. tabular Q-learning model on a simple environment.
2. Neural Network function approximator of the Q-value, using the basic DQN algorithm.

# Section 1 - Q-learning Algorithm


In [24]:
import gym
import numpy as np
import random
import matplotlib as plt

## **The Q-Learning Agent**
The Q-learning agent class will have two functions: \
**action** - The agent selects the action with the highest value from whatever state it is in (exploitation), and with some probability chooses a new action (exploration). \
**update_QL** - Update the transitions probability matrix based on the Q-function.

In [25]:
class QL_Agent:
  def __init__(self, env, discount_factor, learning_rate, epsilon):
    self.g = discount_factor
    self.lr = learning_rate
    self.epsilon = epsilon
    self.env = env
    self.q_table = np.zeros((env.observation_space.n, env.action_space.n))
  
  def action(self, s):
    rand = random.uniform(0,1)
    if rand > self.epsilon:
      action = np.argmax(self.q_table[s,:])
    else:
      action = self.env.action_space.sample()
    return action
  
  def update_QL(self, state, action, reward, state_):
    self.q_table[state, action] = self.q_table[state, action] * (1- self.lr) + \
    self.lr * (reward + self.g * np.max(self.q_table[state_, :]))

## **Base Loop** - train the model

In [26]:
env = gym.make("FrozenLake-v0")

# Exporation parameters
max_epsilon = 1.0
min_epsilon = 0.1
decay_rate = 0.0005
agentQL = QL_Agent(env, discount_factor = 0.95, learning_rate = 0.1, epsilon = max_epsilon)

total_episodes = 50000
rewards = np.zeros((total_episodes))
for i in range(total_episodes):
  total_rewards = 0
  state = env.reset() #Initialize the Frozen-Lake MDP
  while True: # Full trajectory
    action = agentQL.action(state) # Apply epsilon-Greedy policy
    state_, reward, done, _ = env.step(action) # Observe the next state and rewrd

    agentQL.update_QL(state, action, reward, state_) # Value function update

    state = state_ # Next state

    total_rewards += reward

    # Adjust exploration rate
    agentQL.epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate*i)
    if done:  # The agent reached to Hole or Goal step
      break
  
  rewards[i] = total_rewards

## Q-table after training

In [27]:
print(agentQL.q_table)

[[0.18307622 0.13974704 0.15696406 0.14920029]
 [0.07412679 0.10290819 0.0816559  0.1288266 ]
 [0.12448889 0.1176815  0.11536705 0.11814416]
 [0.07051886 0.0774682  0.07091989 0.10667671]
 [0.24607347 0.14364659 0.13595893 0.10287033]
 [0.         0.         0.         0.        ]
 [0.15742876 0.07468276 0.12095522 0.04590568]
 [0.         0.         0.         0.        ]
 [0.15379458 0.19556729 0.14237735 0.28247631]
 [0.25715105 0.38964815 0.18201849 0.19872703]
 [0.32397831 0.27137658 0.24956924 0.17362351]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.28907252 0.3017422  0.57138967 0.37253722]
 [0.52111572 0.5627462  0.75900611 0.53094951]
 [0.         0.         0.         0.        ]]


## evaluate our Q-learning agent after training
Once we've finished the agent's training, let's see how well he's learned to get to the Goal-state.

In [28]:
env.reset()
total_episodes = 300
rewards = np.zeros((total_episodes))
agentQL.epsilon = 0.0 # Greedy policy

for i in range(total_episodes):
  total_rewards = 0
  state = env.reset() #Initialize the Frozen-Lake MDP
  while True: # Full trajectory
    action = agentQL.action(state) # Apply epsilon-Greedy policy
    state_, reward, done, _ = env.step(action) # Observe the next state and rewrd

    # agentQL.update_QL(state, action, reward, state_) # Value function update

    state = state_ # Next state

    total_rewards += reward

    if done:  # The agent reached to Hole or Goal step
      env.render()
      break
  
  rewards[i] = total_rewards
  print("Evaluation Episode: " + str(i) + " --> Reward: " + str(rewards[i]))
  print("Mean Reward: " + str(np.sum(rewards / total_episodes)))

  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Evaluation Episode: 0 --> Reward: 0.0
Mean Reward: 0.0
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Evaluation Episode: 1 --> Reward: 0.0
Mean Reward: 0.0
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Evaluation Episode: 2 --> Reward: 0.0
Mean Reward: 0.0
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Evaluation Episode: 3 --> Reward: 0.0
Mean Reward: 0.0
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Evaluation Episode: 4 --> Reward: 1.0
Mean Reward: 0.0033333333333333335
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Evaluation Episode: 5 --> Reward: 0.0
Mean Reward: 0.0033333333333333335
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Evaluation Episode: 6 --> Reward: 1.0
Mean Reward: 0.006666666666666667
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Evaluation Episode: 7 --> Reward: 0.0
Mean Reward: 0.006666666666666667
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Evaluation Episode: 8 --> Reward: 1.0
Mean Reward: 0.01
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Evaluation Episode: 9 --> Reward: 1.0
Mean Reward:

In conclusion, it can be seen that the agent's probability of getting from the initial state to the final state is around 64%, this can be understood for the reason that the environment is stochastic.

# Section 2 - DQN algorithm

In [7]:
import random
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

## Neural network class
1. 5 hidden layers
2. relu activation function
3. MSE loss function using ADAM optimizer

In [40]:
def build_network(lr, net_arch):
    """
    Build NN

    :param lr: Learning rate
    :param net_arch: Network architecture
    :return: NN - Get state as input and return Q(s,a)
    """
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=net_arch['state_dim'], kernel_initializer='he_uniform'))
    model.add(Dense(units=32, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=32, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=24, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=24, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=net_arch['action_dim'], activation='linear', kernel_initializer='he_uniform'))

    model.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    model.summary()
    return model

## Replay Buffer Class
use to store and sample random batches of experiences for training the net.\
It is used in the algorithm to sample a minibatch of experiences in random order for the optimization process


In [31]:
class ReplayBuffer:
    def __init__(self, capacity):
        """
        :param capacity: The number of experience batches to hold
        """
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def store(self, experience):
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.position % self.capacity] = experience
        self.position += 1

    def sample(self, batch_size):
        mini_batch = random.sample(self.memory, batch_size)
        states = np.array([exp[0] for exp in mini_batch])
        actions = np.array([exp[1] for exp in mini_batch])
        next_states = np.array([exp[2] for exp in mini_batch])
        rewards = np.array([exp[3] for exp in mini_batch])
        not_dones = np.array([exp[4] for exp in mini_batch])
        return states, actions, next_states, rewards, not_dones

    def __len__(self):
        return len(self.memory)

## DQN_Agent Class - 
Will contain two neural networks - target and online. \
The online network is the network that actually practices and performs the weaving. \
The target network is the network that we freeze for a number of training iterations to avoid a non-stationary target situation.

In [46]:
class DQN_Agent:
    def __init__(self, hp, net_arch):
        """
        The DQN agent class. use to store experience, choose action, decaying epsilon, update the online network
        and test the agent

        :param hp: Hyper-parameters
        :param env_dim: Environment dimensions
        """
        self.hp = hp
        self.net_arch = net_arch
        self.epsilon = self.hp['max_epsilon']
        self.memory = ReplayBuffer(capacity=self.hp['capacity'])
        self.online_net = build_network(lr=self.hp['lr'], net_arch=self.net_arch)
        self.target_net = build_network(lr=self.hp['lr'], net_arch=self.net_arch)
        self.target_net.set_weights(self.online_net.get_weights())

    def store_experience(self, experience):
        self.memory.store(experience=experience)

    def choose_action(self, observation):
        if random.uniform(0, 1) < self.epsilon:
            a = np.random.choice(self.net_arch['action_dim'])  # Explore
        else:
            a = np.argmax(self.online_net.predict(x=np.array([observation])))  # Exploit
        self.epsilon_decay()
        return a

    def epsilon_decay(self):
        self.epsilon = max(self.hp['min_epsilon'], self.epsilon * self.hp['epsilon_decay'])

    def update_target_network(self):
        self.target_net.set_weights(self.online_net.get_weights())

    def update_step(self):
        self.epsilon_decay()
        if len(self.memory) < self.hp['batch_size']: return 0

        # Sample minibatch
        states, actions, next_states, rewards, not_dones = self.memory.sample(batch_size=self.hp['batch_size'])

        predicted_q = self.online_net.predict(states)  # Q(states_batch,:) from the online network
        target_q_next = self.target_net.predict(next_states)  # Q(next_states_batch,:) from the target network

        q_target = np.copy(predicted_q)  # Copy of Q(s,:) from the online network
        batch_index = np.arange(self.hp['batch_size'])

        # Q target = Q(s,a) = r + gamma * MAX[Q_target_net(s',A')] * not_dones
        q_target[batch_index, actions] = rewards + self.hp['gamma'] * np.amax(target_q_next, axis=1) * not_dones
        
        # Update online network to give q values as output from the suit states inputs
        loss = self.online_net.train_on_batch(states, q_target)
        return loss

    def test_agent(self, env, visualize='True'):
        s = env.reset()
        total_reward = 0.0
        finish = False

        while not finish:
            if visualize:
                env.render()
            q_vals = self.online_net.predict(np.array([s]))
            a = np.argmax(q_vals)
            s, r, finish, _ = env.step(a)
            total_reward += reward
        print("Total reward in the test: %.2f" % total_reward)
        env.close()

## **Base Loop** - train the model
The loop(training) will continue as long as the model averages at least 475 operations until it drops.

In [None]:
env = gym.make('CartPole-v1')
# Parameters
env_dim = {
        'state_dim': env.observation_space.shape[0],
        'action_dim': env.action_space.n}

# Hyper parameters
hp = {
        'lr': 0.0001,
        'batch_size': 128,
        'capacity': 10000,
        'gamma': 0.99,
        'max_epsilon': 0.9,
        'min_epsilon': 0.01,
        'epsilon_decay': 0.999,
        'target_update_period': 100}

agent = DQN_Agent(hp=hp, net_arch=env_dim)  # Build an agent object

max_episodes = 1000
max_steps = 500
max_score = 475.0
total_steps = 0
episode_loss = []
average_score = []
for episode in range(max_episodes):
    state = env.reset()
    done = False
    episode_score = 0
    for step in range(max_steps):
        b = True
        total_steps += 1
        action = agent.choose_action(observation=state)
        next_state, reward, done, _ = env.step(action)
        episode_score += reward

        agent.store_experience((state, action, next_state, reward, not done))
        if done:
            b = False
            average_score.append(episode_score)
            print("Episode: {} | Score: {}".format(episode + 1, episode_score))
            break

        state = next_state
        loss = agent.update_step()
        episode_loss.append(loss)

        if (total_steps + 1) % hp['target_update_period'] == 0:
          agent.update_target_network()
    
    if b:
      print("Episode: {} | Score: {}".format(episode + 1, episode_score))

    if np.mean(average_score[-100:]) >= max_score:
        print(
            "\nGreat!! "
            "You win after: {} Episodes\n"
            "Average reward in the last 100 episodes: {}".format(episode + 1, np.mean(average_score[-100:])))
        break
        
# save the target net
agent.target_net.save('my_model1.h5')

In [1]:
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [24]:
new_model = tf.keras.models.load_model('my_model.h5')

In [26]:
env = wrap_env(gym.make('CartPole-v1'))
s = env.reset()

while True:
    env.render()
    # The agent
    q_vals = new_model.predict(np.array([s])) 
    a = np.argmax(q_vals)
    s, r, finish, _ = env.step(a)
     
    if finish: 
      break;
            
env.close()
show_video()