In [1]:
import torchvision
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import argparse
from gym.wrappers import Monitor
import datetime
import os
import torch
import numpy as np

from model import Model
from environment import Environment
from utils import Memory, EpsilonScheduler, make_log_dir, save_gif

In [2]:
# Sanity check: show the index of the currently selected CUDA device
# (the recorded output for this cell is 0).
# NOTE(review): presumably this call needs a working CUDA setup - confirm;
# the evaluation in run_net() uses the 'cpu' device for computation.
torch.cuda.current_device()

0

In [3]:
def test(q_func,DEVICE,NUM_TEST,env,save=False):
    print("[TESTING]")
    total_reward = 0
    unclipped_reward = 0

    for i in range(NUM_TEST):
        if i == 0 and save:
            frames = []

        env.reset(eval=True) # performs random actions to start
        state, _, done, _ = env.step(env.action_space.sample())
        frame = 0

        while not done:
            if i == 0 and save:
                frames.append(state[0,0])
            
            # env.render()
            q_values = q_func(state.to(DEVICE))
            if np.random.random() > 0.01: # small epsilon-greedy, sometimes 0.05
                action = torch.argmax(q_values, dim=1).item()
            else:
                action = env.action_space.sample()

            lives = env.ale.lives()
            next_state, reward, done, info = env.step(action)
            if env.ale.lives() != lives: # lost life
                pass
                # plt.imshow(next_state[0,0])
                # plt.savefig(f"frame-{frame}.png")
                # print("LOST LIFE")

            unclipped_reward += info['unclipped_reward']
            total_reward += reward
            state = next_state
            frame += 1
            # print(f"[TESTING {frame}] Action: {action}, Q-Values: {np.array(q_values.cpu().detach())}, Reward: {reward}, Total Reward: {total_reward}, Terminal: {done}")
            # plt.imshow(state[0,0])
            # plt.savefig("frame-{}.png".format(frame))

        if i == 0 and save:
            frames.append(state[0,0])
            save_gif(frames, "{}.gif".format(os.path.join(video_dir, str(scheduler.step_count()))))

    total_reward /= NUM_TEST
    unclipped_reward /= NUM_TEST
    print(f"[TESTING] Total Reward: {total_reward}, Unclipped Reward: {unclipped_reward}")

    return total_reward

In [4]:
def run_net(weights_path='/home/maria/Documents/pytorch-dqn/weights/breakout/good.pt',
            game='breakout', device='cpu', num_test=1):
    """Load saved Q-network weights and evaluate them on a game.

    Builds the environment for ``game``, creates a ``Model`` with one output
    per action, loads the state dict stored at ``weights_path`` and runs
    ``test`` on it.

    Args:
        weights_path: Path to the saved state dict. The default is the
            machine-specific location used when this notebook was recorded;
            pass your own path on another machine.
        game: Game name handed to ``Environment``.
        device: Device used both for loading the weights and for computation.
        num_test: Number of evaluation episodes passed to ``test``.
    """
    env = Environment(game=game)

    q_func = Model(env.action_space.n).to(device)
    # Map the checkpoint onto the compute device rather than a fixed
    # 'cuda:0', so the load target always matches where the model lives.
    state_dict = torch.load(weights_path, map_location=device)
    q_func.load_state_dict(state_dict)

    test(q_func, device, num_test, env)

In [5]:
# Load the saved Breakout weights and run the evaluation; the averaged
# rewards are printed by the evaluation itself (see the output below).
run_net()

[TESTING]
[TESTING] Total Reward: 96.0, Unclipped Reward: 384.0
