# Imports

In [1]:
import gym
from gym import wrappers
import numpy as np
import cv2
import tensorflow as tf
import os
import datetime
from statistics import mean
from ipywidgets import widgets
from IPython.display import display
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import animation
from pathlib import Path
# Implementation
import model

# Helper functions

In [2]:
def play_game(env, train_net, target_net, epsilon, copy_step):
    rewards = 0
    iter = 0
    done = False
    observations = env.reset()
    losses = list()
    while not done:
        action = train_net.get_action(observations, epsilon)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        if done:
            reward = -200 # penalty for finishing before time
            env.reset()

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        train_net.add_experience(exp)
        loss = train_net.train(target_net)
        if isinstance(loss, int):
            losses.append(loss)
        else:
            losses.append(loss.numpy())
        iter += 1
        if iter % copy_step == 0:
            target_net.copy_weights(train_net)
    return rewards, mean(losses)

In [3]:
def make_video(env, TrainNet):
    env = wrappers.Monitor(env, os.path.join(os.getcwd(), "videos"), force=True)
    rewards = 0
    steps = 0
    done = False
    observation = env.reset()
    while not done:
        action = TrainNet.get_action(observation, 0)
        observation, reward, done, _ = env.step(action)
        steps += 1
        rewards += reward
    print("Testing steps: {} rewards {}: ".format(steps, rewards))

# Hyperparameter definition

## File system

In [4]:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = Path('logs/dqn/' + current_time)
output_dir = Path("./output")
output_dir.mkdir(parents=True, exist_ok=True)
summary_writer = tf.summary.create_file_writer(str(log_dir))

In [6]:
# Environment parameters
env = gym.make('CartPole-v0')
num_states = len(env.observation_space.sample())
num_actions = env.action_space.n
print(f"Number of states: {num_states}")
print(f"Number of actions: {num_actions}")
#print(f"{env.unwrapped.get_action_meanings()}")

Number of states: 4
Number of actions: 2


In [62]:
# DQN hyperparameters
gamma = 0.99
copy_step = 45
hidden_units = [250, 250]
max_experiences = 10000
min_experiences = 100
batch_size = 64
lr = 1e-3 #1e-2
epochs = 500 #iterations
total_rewards = np.empty(epochs)
epsilon = 0.99
decay = 0.9999
min_epsilon = 0.1

In [63]:
exp_name_train = "train_test_0"
exp_name_target = "target_test_0"

In [64]:
# Model instantiation
train_net = model.DQN(exp_name_train, output_dir, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
target_net = model.DQN(exp_name_target, output_dir, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)

# Training

In [65]:
for n in range(epochs):
    epsilon = max(min_epsilon, epsilon * decay)
    total_reward, losses = play_game(env, train_net, target_net, epsilon, copy_step)
    total_rewards[n] = total_reward
    avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
    with summary_writer.as_default():
        tf.summary.scalar('episode reward', total_reward, step=n)
        tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
        tf.summary.scalar('average loss)', losses, step=n)
    if n % 100 == 0:
        print("episode:", n, "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):", avg_rewards,
              "episode loss: ", losses)
print("avg reward for last 100 episodes:", avg_rewards)

episode: 0 episode reward: 27.0 eps: 0.989901 avg reward (last 100): 27.0 episode loss:  0
episode: 100 episode reward: 14.0 eps: 0.980050830419928 avg reward (last 100): 21.02970297029703 episode loss:  944.9674
episode: 200 episode reward: 28.0 eps: 0.9702986765411791 avg reward (last 100): 20.683168316831683 episode loss:  731.7122
episode: 300 episode reward: 21.0 eps: 0.960643563042708 avg reward (last 100): 23.247524752475247 episode loss:  77.14855
episode: 400 episode reward: 12.0 eps: 0.9510845243085565 avg reward (last 100): 25.504950495049506 episode loss:  103.82956
avg reward for last 100 episodes: 25.306930693069308


# Make video

In [66]:
env = gym.make('CartPole-v0')
make_video(env, target_net)
env.close()

Testing steps: 200 rewards 200.0: 


# Tests

In [None]:
env = gym.make('SpaceInvaders-v4')
# env = gym.make('Pong-v4')
print(env.action_space)
print(env.observation_space.shape)
print()
for e in range(20):
    observation = env.reset()
    for t in range(1000):
        env.render()
        action = np.random.choice([0,1,2,3,4,5]) #env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print(f"Episode {e} finished after {t+1} timesteps")
            break
env.close()