# Imports

https://hallab.cs.dal.ca/images/0/00/Minh2015.pdf

In [1]:
import gym
from gym import wrappers
import numpy as np
import cv2
import tensorflow as tf
import os
import collections
import datetime
from statistics import mean
from ipywidgets import widgets
from IPython.display import display
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import animation
from pathlib import Path
# Implementation
import model

# Helper functions

In [2]:
def play_game(env, train_net, target_net, epsilon, copy_step):
    """
    Training algorithm for training two networks from scratch
    """
    rewards = 0
    iter = 0
    done = False
    observations = env.reset()
    losses = list()
    while not done:
        action = train_net.get_action(observations, epsilon)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        if done:
            reward = -200 # penalty for finishing before time
            env.reset()

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        train_net.add_experience(exp)
        loss = train_net.train(target_net)
        if isinstance(loss, int):
            losses.append(loss)
        else:
            losses.append(loss.numpy())
        iter += 1
        if iter % copy_step == 0:
            target_net.copy_weights(train_net)
    return rewards, mean(losses)

In [3]:
def make_video(env, TrainNet):
    env = wrappers.Monitor(env, os.path.join(os.getcwd(), "videos"), force=True)
    rewards = 0
    steps = 0
    done = False
    observation = env.reset()
    while not done:
        action = TrainNet.get_action(observation, 0)
        observation, reward, done, _ = env.step(action)
        steps += 1
        rewards += reward
    print("Testing steps: {} rewards {}: ".format(steps, rewards))

In [48]:
# Taken from https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py
class FireResetEnv(gym.Wrapper):
    def __init__(self, env=None):
        super(FireResetEnv, self).__init__(env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def step(self, action):
        return self.env.step(action)

    def reset(self):
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset()
        obs, _, done, _ = self.env.step(2)
        if done:
            self.env.reset()
        return obs

class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env=None, skip=4):
        super(MaxAndSkipEnv, self).__init__(env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = collections.deque(maxlen=2)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            self._obs_buffer.append(obs)
            total_reward += reward
            if done:
                break
        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
        return max_frame, total_reward, done, info

    def reset(self):
        self._obs_buffer.clear()
        obs = self.env.reset()
        self._obs_buffer.append(obs)
        return obs


class ProcessFrame84(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        if frame.size == 210 * 160 * 3:
            img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
        elif frame.size == 250 * 160 * 3:
            img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
        else:
            assert False, "Unknown resolution."
        img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
        resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
        x_t = resized_screen[18:102, :]
        x_t = np.reshape(x_t, [84, 84, 1])
        return x_t.astype(np.uint8)


class BufferWrapper(gym.ObservationWrapper):
    def __init__(self, env, n_steps, dtype=np.float32):
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        old_space = env.observation_space
        self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
                                                old_space.high.repeat(n_steps, axis=0), dtype=dtype)

    def reset(self):
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset())

    def observation(self, observation):
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer


class ImageProcessing(gym.ObservationWrapper):
    def __init__(self, env):
        super(ImageProcessing, self).__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0,
                                                shape=(old_shape[-1], old_shape[0], old_shape[1]),
                                                dtype=np.float32)
        #print(self.observation_space.shape)

    def observation(self, observation):
        #return observation
        # return np.moveaxis(observation, 2, 0)
        #obs = np.moveaxis(observation, 2, 0)
        return np.moveaxis(observation, 2, 0)
        #return np.moveaxis(obs, -2, 0)

class ChannelLast(gym.ObservationWrapper):
    def __init__(self, env):
        super(ChannelLast, self).__init__(env)
        
        
    def observation(self, observation):
        old_shape = self.observation_space.shape
        self.observation_space = np.moveaxis(observation, -1, 0)
        self.observation_space = np.moveaxis(self.observation_space, 0, 1)
        # np.reshape(self.observation_space, (old_shape[1], old_shape[2], old_shape[0]))
        print(old_shape)
        print(self.observation_space.shape)
    
class ScaledFloatFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0

def make_env(env_name):
    env = gym.make(env_name)
    env = MaxAndSkipEnv(env)
    env = FireResetEnv(env)
    env = ProcessFrame84(env)
    env = ImageProcessing(env)
    env = BufferWrapper(env, 4)
    return ChannelLast(ScaledFloatFrame(env))

In [58]:
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class ExperienceReplay:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])
        return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.uint8), np.array(next_states)

In [59]:
class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = env.reset()
        self.total_reward = 0.0

    def play_step(self, model, epsilon=0.0):

        done_reward = None
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            #state_v = torch.tensor(state_a).to(device)
            
            q_vals_v = model(state_v)
            #_, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

# Hyperparameter definition

## File system

In [49]:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = Path('logs/dqn/' + current_time)
output_dir = Path("./output")
output_dir.mkdir(parents=True, exist_ok=True)
summary_writer = tf.summary.create_file_writer(str(log_dir))

In [50]:
env = make_env('Pong-v4')

In [51]:
# Environment parameters
#env = gym.make('CartPole-v0')
#env = gym.make('Pong-v4')
num_states = len(env.observation_space.sample())
num_actions = env.action_space.n
print(f"Number of states: {num_states}")
print(f"Observation space shape: {env.observation_space.shape}")
print(f"Number of actions: {num_actions}")
print(f"Action meanings: {env.unwrapped.get_action_meanings()}")


Number of states: 4
Observation space shape: (4, 84, 84)
Number of actions: 6
Action meanings: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']


In [71]:
# DQN hyperparameters
gamma = 0.99
copy_step = 45
hidden_units = [250, 250]
max_experiences = 10000
min_experiences = 100
batch_size = 64
lr = 1e-3 #1e-2
epochs = 500 #iterations
total_rewards = np.empty(epochs)
epsilon = 0.99
decay = 0.9999
replay_size=100
eps_min = 0.1

In [53]:
exp_name_train = "train_test_1"
exp_name_target = "target_test_1"

In [54]:
input_shape = (484, 84, 4)

In [55]:
# Model definition
model_train = model.CNN_Model(input_shape, num_states, hidden_units, num_actions)
model_target = model.CNN_Model(input_shape, num_states, hidden_units, num_actions)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24 (Conv2D)           (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 10, 10, 64)        32832     
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 8, 8, 64)          36928     
_________________________________________________________________
dense_16 (Dense)             (None, 8, 8, 512)         33280     
_________________________________________________________________
re_lu_8 (ReLU)               (None, 8, 8, 512)         0         
_________________________________________________________________
dense_17 (Dense)             (None, 8, 8, 6)           3078      
Total params: 114,342
Trainable params: 114,342
Non-trainable params: 0
________________________________________________

In [56]:
# Model instantiation
train_net = model.DQN(exp_name_train, output_dir, model_train, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
target_net = model.DQN(exp_name_target, output_dir, model_target, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)

# New training

In [70]:
buffer = ExperienceReplay(replay_size)
agent = Agent(env, buffer)


optimizer = tf.keras.optimizers.Adam(lr=lr)
total_rewards = []
frame_idx = 0  
best_mean_reward = None

(4, 84, 84)
(4, 84, 84)


In [None]:
while True:
        frame_idx += 1
        epsilon = max(epsilon*eps_decay, eps_min)

        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None:
            total_rewards.append(reward)

            mean_reward = np.mean(total_rewards[-100:])

            print("%d:  %d games, mean reward %.3f, (epsilon %.2f)" % (
                frame_idx, len(total_rewards), mean_reward, epsilon))
            
            print("epsilon", epsilon, frame_idx)
            print("reward_100", mean_reward, frame_idx)
            print("reward", reward, frame_idx)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                model.save_model(frame_idx)
                best_mean_reward = mean_reward
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f" % (best_mean_reward))

            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        if len(buffer) < replay_start_size:
            continue

        batch = buffer.sample(batch_size)
        states, actions, rewards, dones, next_states = batch

        states_v = torch.tensor(states).to(device)
        next_states_v = torch.tensor(next_states).to(device)
        actions_v = torch.tensor(actions).to(device)
        rewards_v = torch.tensor(rewards).to(device)
        done_mask = torch.ByteTensor(dones).to(device)

        state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

        next_state_values = target_net(next_states_v).max(1)[0]

        next_state_values[done_mask] = 0.0

        next_state_values = next_state_values.detach()

        expected_state_action_values = next_state_values * gamma + rewards_v

        loss_t = nn.MSELoss()(state_action_values, expected_state_action_values)

        optimizer.zero_grad()
        loss_t.backward()
        optimizer.step()

        if frame_idx % sync_target_frames == 0:
            model_target.load_state_dict(net.state_dict())

# Training

In [57]:
for n in range(epochs):
    epsilon = max(min_epsilon, epsilon * decay)
    total_reward, losses = play_game(env, train_net, target_net, epsilon, copy_step)
    total_rewards[n] = total_reward
    avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
    with summary_writer.as_default():
        tf.summary.scalar('episode reward', total_reward, step=n)
        tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
        tf.summary.scalar('average loss)', losses, step=n)
    if n % 100 == 0:
        print("episode:", n, "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):", avg_rewards,
              "episode loss: ", losses)
print("avg reward for last 100 episodes:", avg_rewards)

(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 84, 84)
(4, 

ValueError: Input 0 of layer sequential_8 is incompatible with the layer: : expected min_ndim=4, found ndim=2. Full shape received: (1, 1)

# Make video

In [None]:
env = gym.make('CartPole-v0')
make_video(env, target_net)
env.close()

# Tests