# Assignment 1 - CATCH
TODO:
- Experience Replay Buffer - Done
- Mini Batch Training
- A Target Network
- Reward Clipping
- Loss Function Implementation
- CNN

Sources: \
https://github.com/yxu1168/Reinforcement-Learning-DQN-for-ATARI-s-Pong-Game---TensorFlow-2.0-Keras/blob/master/Reinforcement_Learning_CNN_DQN_for_ATARI%E2%80%99s_Pong_Game.ipynb

https://github.com/PacktPublishing/Hands-on-Reinforcement-Learning-with-PyTorch/blob/master/Section%203/3.5%20DQN%20with%20Pong.ipynb



## Libraries

In [169]:
import random

import matplotlib.pyplot as plt
import numpy as np
from skimage.transform import resize

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Catch

In [170]:
class CatchEnv():
    def __init__(self):
        self.size = 21
        self.image = np.zeros((self.size, self.size))
        self.state = []
        self.fps = 4
        self.output_shape = (84, 84)

    def reset_random(self):
        self.image.fill(0)
        self.pos = np.random.randint(2, self.size-2)
        self.vx = np.random.randint(5) - 2
        self.vy = 1
        self.ballx, self.bally = np.random.randint(self.size), 4
        self.image[self.bally, self.ballx] = 1
        self.image[-5, self.pos - 2:self.pos + 3] = np.ones(5)

        return self.step(2)[0]


    def step(self, action):
        def left():
            if self.pos > 3:
                self.pos -= 2
        def right():
            if self.pos < 17:
                self.pos += 2
        def noop():
            pass
        {0: left, 1: right, 2: noop}[action]()


        self.image[self.bally, self.ballx] = 0
        self.ballx += self.vx
        self.bally += self.vy
        if self.ballx > self.size - 1:
            self.ballx -= 2 * (self.ballx - (self.size-1))
            self.vx *= -1
        elif self.ballx < 0:
            self.ballx += 2 * (0 - self.ballx)
            self.vx *= -1
        self.image[self.bally, self.ballx] = 1

        self.image[-5].fill(0)
        self.image[-5, self.pos-2:self.pos+3] = np.ones(5)
    
        terminal = self.bally == self.size - 1 - 4
        reward = int(self.pos - 2 <= self.ballx <= self.pos + 2) if terminal else 0

        [self.state.append(resize(self.image, (84, 84))) for _ in range(self.fps - len(self.state) + 1)]
        self.state = self.state[-self.fps:]

        return np.transpose(self.state, [1, 2, 0]), reward, terminal

    def get_num_actions(self):
        return 3

    def reset(self):
        return self.reset_random()

    def state_shape(self):
        return (self.fps,) + self.output_shape

## Experience Replay Buffer

In [171]:
class ReplayBuffer():
    def __init__(self, max_size = 1e6):
        self.buffer = []
        self.max_size = max_size
        self.overFlow = 0
    
    def add(self, data):
        if len(self.buffer) == self.max_size:
            self.buffer[self.overFlow] = data
            self.overFlow = (self.overFlow + 1) % self.max_size
        else:
            self.buffer.append(data)

    def sample(self, batch_size):
        indices = np.random.randint(0, len(self.buffer), size=batch_size)
        states, actions, rewards, states_next = [], [], [], []


        for i in indices:
            states, actions, rewards, states_next = self.buffer[i]

        return states, actions, rewards, states_next

## Deep Q Network

In [172]:
input_shape = (84, 84, 4)
action_size = 3

model = keras.Sequential()

model.add(keras.layers.Conv2D(32, kernel_size=8, strides=4, input_shape = input_shape, activation="relu"))
model.add(keras.layers.Conv2D(64, kernel_size=4, strides=2, activation="relu"))
model.add(keras.layers.Conv2D(64, kernel_size=2, strides=1, activation="relu"))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(64, activation="relu"))
model.add(keras.layers.Dense(64, activation="relu"))
model.add(keras.layers.Dense(action_size))

optimizer = keras.optimizers.Adam(learning_rate=1e-3)
mse_loss = keras.losses.mean_squared_error

model.compile(optimizer=optimizer, loss='mse')

model.summary()

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_72 (Conv2D)          (None, 20, 20, 32)        8224      
                                                                 
 conv2d_73 (Conv2D)          (None, 9, 9, 64)          32832     
                                                                 
 conv2d_74 (Conv2D)          (None, 8, 8, 64)          16448     
                                                                 
 flatten_24 (Flatten)        (None, 4096)              0         
                                                                 
 dense_72 (Dense)            (None, 64)                262208    
                                                                 
 dense_73 (Dense)            (None, 64)                4160      
                                                                 
 dense_74 (Dense)            (None, 3)               

## Agent

In [173]:
class Agent():
    def __init__(self):
        self.action_size = 3
        self.train_net = model
        self.target_net = model
        self.batch_size = 4

        self.gamma = 0.95

    # select an action based on epsilon greedy method
    def action(self, state, epsilon):
        
        if np.random.rand() <= epsilon:
            return random.randint(0, 2)
        else:
            return np.argmax(self.train_net.predict(state))

    # train the model
    def train(self, replay_buffer):

        # sample from the experience replay buffer
        states, actions, rewards, states_next = replay_buffer.sample(self.batch_size)

        # train Q
        train_q = self.train_net(states)

        # target Q
        target_q = rewards + self.gamma * tf.reduce_max(self.target_net(states_next), axis=-1)

        # loss
        loss = tf.reduce_mean(mse_loss(train_q, target_q))

        # update weights of train_net
        self.train_net.optimizer.minimize(loss, self.train_net.trainable_variables)

        return loss


    # update the target network
    def update():
        pass

## Main Loop

In [174]:
def run_environment():
    env = CatchEnv()
    agent = Agent()
    replay_buffer = ReplayBuffer()

    # hyperparameters
    episodes = 10
    epsilon = 0.1

    for episode in range(episodes):
        env.reset()
        state, reward, terminal = env.step(2)

        while not terminal:
            state = np.reshape(state, (1, 84, 84, 4))  
            
            action = agent.action(state, epsilon)
            state_next, reward, terminal = env.step(action)

            state_next = np.reshape(state_next, (1, 84, 84, 4)) 

            # add experience to replay buffer
            replay_buffer.add((state, reward, action, state_next))

            # train the agent
            loss = agent.train(replay_buffer)


            print("Loss: {}".format(loss))
            print("Reward obtained by the agent: {}".format(reward))
            state = np.squeeze(state)

            # plt.matshow(state)
            # plt.show()

            state = state_next

        print("End of the episode")

# Run

In [175]:
run_environment()



ValueError: `tape` is required when a `Tensor` loss is passed. Received: loss=4.041056156158447, tape=None.