## Watch a Smart Agent!

### 1. Start the Environment for the Trained Agent

In [17]:
import numpy as np
import torch
import gym
import os
import time

from agent import Agent

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def rgb2gray(rgb, norm=True):
    """Collapse the last (channel) axis of an RGB array into one gray value.

    Each pixel becomes the weighted sum 0.299*R + 0.587*G + 0.114*B.

    Args:
        rgb: array whose last axis holds the three colour channels.
        norm: when true, rescale the result with ``gray / 128 - 1``. For
            inputs in 0..255 this yields values in roughly [-1, 1), not
            [0, 1].

    Returns:
        Array with the channel axis removed (float).
    """
    luma_weights = [0.299, 0.587, 0.114]
    luminance = np.dot(rgb[..., :], luma_weights)
    if not norm:
        return luminance
    # Centre around zero: 0 -> -1, 128 -> 0, 255 -> just below +1.
    return luminance / 128. - 1.

# --- Playback configuration -------------------------------------------------
seed = 0            # RNG seed applied below to the environment and to torch
img_stack = 4       # number of gray frames stacked into one observation
action_repeat = 10  # number of env steps each selected action is repeated for

env = gym.make('CarRacing-v0', verbose=0)

# Apply the seed before the first reset so the run is reproducible from a
# fresh kernel; without this `seed` was declared but never used.
env.seed(seed)
torch.manual_seed(seed)

state = env.reset()
reward_threshold = env.spec.reward_threshold

In [18]:
class Wrapper():
    """
    Environment wrapper for CarRacing.

    Adds three things on top of the wrapped environment:
      * observations are a stack of the last `img_stack` grayscale frames,
      * every action is repeated for up to `action_repeat` env steps,
      * the reward is shaped (see `step`) and the episode is cut short when
        the recent average reward becomes too low.

    Relies on the module-level names `rgb2gray`, `img_stack` and
    `action_repeat`.
    """

    def __init__(self, env):
        # The environment every reset/step call is forwarded to.
        self.env = env

    def reset(self):
        """Reset the wrapped env and return the initial frame stack.

        Returns:
            np.ndarray made of `img_stack` copies of the first gray frame.
        """
        # `counter` and `die` are initialised here but not read anywhere else
        # in this class.
        self.counter = 0
        # Fresh reward history for the new episode.
        self.av_r = self.reward_memory()

        self.die = False
        # Use the wrapped env (not a module-level global) so the wrapper
        # really drives the environment it was constructed with.
        img_rgb = self.env.reset()
        img_gray = rgb2gray(img_rgb)
        self.stack = [img_gray] * img_stack  # same frame repeated for the first decision
        return np.array(self.stack)

    def step(self, action):
        """Repeat `action` for up to `action_repeat` env steps.

        Args:
            action: action forwarded unchanged to the wrapped env's `step`.

        Returns:
            Tuple `(stack, total_reward, done, die)` where
              * `stack` is the updated frame stack as an np.ndarray,
              * `total_reward` is the sum of the shaped rewards,
              * `done` is True when the mean of the reward history dropped
                to -0.1 or below,
              * `die` is the third value returned by the wrapped env's
                `step` (presumably gym's `done` flag).
        """
        total_reward = 0
        for i in range(action_repeat):
            img_rgb, reward, die, _ = self.env.step(action)
            # don't penalize "die state"
            # NOTE(review): the +100 is added whenever the env reports
            # termination, whatever the cause -- confirm this is also wanted
            # when the episode ends normally.
            if die:
                reward += 100
            # green penalty: the mean of channel 1 over the whole frame is high
            if np.mean(img_rgb[:, :, 1]) > 185.0:
                reward -= 0.05
            total_reward += reward
            # if no reward recently, end the episode
            done = bool(self.av_r(reward) <= -0.1)
            if done or die:
                break
        # Only the last frame of the repeated action enters the stack.
        img_gray = rgb2gray(img_rgb)
        self.stack.pop(0)
        self.stack.append(img_gray)
        assert len(self.stack) == img_stack
        return np.array(self.stack), total_reward, done, die


    @staticmethod
    def reward_memory():
        """Create a closure that tracks the mean of the last 100 rewards.

        The history starts as 100 zeros and is overwritten in ring-buffer
        order, so the returned mean always covers all 100 slots, including
        slots that have not been written yet.

        Returns:
            Function `memory(reward) -> float` that records `reward` and
            returns the current mean of the history.
        """
        # record reward for last 100 steps
        count = 0
        length = 100
        history = np.zeros(length)

        def memory(reward):
            nonlocal count
            history[count] = reward
            count = (count + 1) % length
            return np.mean(history)

        return memory
    
# Build the agent (from the local `agent` module) on the selected device.
agent = Agent(device)

# Wrap the raw environment with the frame-stacking / action-repeat wrapper.
env_wrap = Wrapper(env)    

### 2. Prepare Load

In [19]:
def load(agent, directory, filename):
    """Restore the weights of `agent.net` from a saved state dict.

    Args:
        agent: object exposing a `net` with `load_state_dict`.
        directory: folder that contains the checkpoint.
        filename: checkpoint file name inside `directory`.
    """
    checkpoint_path = os.path.join(directory, filename)
    # Deserialize onto the CPU so a checkpoint saved on a GPU also loads on a
    # CPU-only machine; `load_state_dict` then copies the tensors onto
    # whatever device the network's parameters live on.
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    agent.net.load_state_dict(state_dict)

### 3. Prepare Player

In [20]:
from collections import deque
import os

def play(env, agent, n_episodes):
    """Let `agent` drive `env` for `n_episodes` rendered episodes.

    After each episode one line is printed with the episode number, the mean
    score of the last (up to 100) episodes, the episode score and the elapsed
    wall-clock time.

    Args:
        env: raw environment; it is rendered directly and stepped through a
            `Wrapper` built around it.
        agent: object whose `select_action(state)` returns
            `(action, log_prob)`.
        n_episodes: number of episodes to play.

    Returns:
        None. Results are only printed, so a notebook cell ending in
        `play(...)` does not echo a value.
    """
    # Step the environment that was passed in instead of a notebook-level
    # global, so rendering and stepping always refer to the same env.
    wrapped_env = Wrapper(env)

    scores_deque = deque(maxlen=100)

    # Affine map applied to the agent's output before it reaches the env:
    # the first component is rescaled with 2*x - 1, the others are unchanged.
    # Presumably this turns a [0, 1] steering output into [-1, 1] -- confirm
    # against the Agent implementation.
    action_scale = np.array([2., 1., 1.])
    action_shift = np.array([-1., 0., 0.])

    for i_episode in range(1, n_episodes + 1):
        # One reset per episode; an extra reset before the loop would only
        # throw away a freshly generated episode start.
        state = wrapped_env.reset()
        score = 0

        time_start = time.time()

        while True:
            action, _ = agent.select_action(state)  # log-prob is not needed for playback
            env.render()
            state, reward, done, die = wrapped_env.step(
                action * action_scale + action_shift)
            score += reward

            if done or die:
                break

        s = int(time.time() - time_start)

        scores_deque.append(score)

        print('Episode {}\tAverage Score: {:.2f},\tScore: {:.2f} \tTime: {:02}:{:02}:{:02}'
              .format(i_episode, np.mean(scores_deque), score, s//3600, s%3600//60, s%60))


### 4. Load and Play

In [21]:
# Load checkpoint 'model_weights_0.pth' into the agent and play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_0.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 49.17,	Score: 49.17 	Time: 00:00:03
Episode 2	Average Score: 242.92,	Score: 436.67 	Time: 00:00:12
Episode 3	Average Score: 288.93,	Score: 380.95 	Time: 00:00:11
Episode 4	Average Score: 282.13,	Score: 261.75 	Time: 00:00:10
Episode 5	Average Score: 249.01,	Score: 116.51 	Time: 00:00:07


In [22]:
# Load checkpoint 'model_weights_400.pth' into the agent and play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_400.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 364.38,	Score: 364.38 	Time: 00:00:10
Episode 2	Average Score: 478.91,	Score: 593.44 	Time: 00:00:11
Episode 3	Average Score: 343.40,	Score: 72.38 	Time: 00:00:03
Episode 4	Average Score: 421.84,	Score: 657.14 	Time: 00:00:11
Episode 5	Average Score: 387.87,	Score: 252.00 	Time: 00:00:07


In [23]:
# Load checkpoint 'model_weights_600.pth' into the agent and play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_600.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 727.27,	Score: 727.27 	Time: 00:00:11
Episode 2	Average Score: 735.01,	Score: 742.75 	Time: 00:00:11
Episode 3	Average Score: 605.58,	Score: 346.72 	Time: 00:00:09
Episode 4	Average Score: 622.98,	Score: 675.16 	Time: 00:00:12
Episode 5	Average Score: 641.44,	Score: 715.30 	Time: 00:00:11


In [24]:
# Load checkpoint 'model_weights_800.pth' into the agent and play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_800.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 946.97,	Score: 946.97 	Time: 00:00:11
Episode 2	Average Score: 883.74,	Score: 820.51 	Time: 00:00:11
Episode 3	Average Score: 729.25,	Score: 420.27 	Time: 00:00:07
Episode 4	Average Score: 799.59,	Score: 1010.60 	Time: 00:00:09
Episode 5	Average Score: 830.48,	Score: 954.02 	Time: 00:00:11


In [25]:
# Load checkpoint 'model_weights_830.pth' into the agent and play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_830.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 975.52,	Score: 975.52 	Time: 00:00:11
Episode 2	Average Score: 980.57,	Score: 985.61 	Time: 00:00:11
Episode 3	Average Score: 708.28,	Score: 163.71 	Time: 00:00:09
Episode 4	Average Score: 733.39,	Score: 808.72 	Time: 00:00:11
Episode 5	Average Score: 779.57,	Score: 964.29 	Time: 00:00:11


In [26]:
# Load checkpoint 'model_weights_870.pth' into the agent and play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_870.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 988.81,	Score: 988.81 	Time: 00:00:11
Episode 2	Average Score: 994.75,	Score: 1000.70 	Time: 00:00:11
Episode 3	Average Score: 996.60,	Score: 1000.30 	Time: 00:00:11
Episode 4	Average Score: 837.21,	Score: 359.05 	Time: 00:00:09
Episode 5	Average Score: 831.76,	Score: 809.94 	Time: 00:00:12


In [27]:
# Load checkpoint 'model_weights_900.pth' into the agent and play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_900.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 863.87,	Score: 863.87 	Time: 00:00:10
Episode 2	Average Score: 882.57,	Score: 901.27 	Time: 00:00:12
Episode 3	Average Score: 858.32,	Score: 809.82 	Time: 00:00:11
Episode 4	Average Score: 822.73,	Score: 715.98 	Time: 00:00:12
Episode 5	Average Score: 856.83,	Score: 993.24 	Time: 00:00:11


In [28]:
# Shut the environment down now that all playback runs are finished.
env.close()