## Imports

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')

## DQN Algo Outline

In [3]:
from collections import namedtuple
import random
import time

import torch
from torch import nn
from torch import optim
import numpy as np

import gym
from src import agent, train, utils

In [4]:
# Seed every RNG this notebook draws from so a fresh-kernel run is repeatable:
# `random` picks the random actions / benchmark frames, NumPy and PyTorch are
# used further down for dummy transitions and tensor creation.
# (The gym environment's own RNG is not seeded in this cell.)
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

## Explore

### Collect 10 random frames

In [5]:
# Capacity handed to utils.ReplayMemory by initialize().
replay_mem_size = int(1e6)
# Mini-batch size handed to utils.ReplayMemory by initialize().
batch_size = 32
# Number of training episodes. NOTE: a later cell reassigns this to 10000
# before the training loop runs, so that later value is the one in effect.
num_episodes = int(1e3)

In [6]:
def frames_to_phi(frames):
    """Turn a sequence of raw frames into one concatenated tensor.

    Every frame is passed through ``utils.process_frame`` and the results are
    joined, in order, with ``torch.cat`` (default dimension 0).

    Args:
        frames: iterable of raw frames accepted by ``utils.process_frame``.

    Returns:
        The tensor produced by concatenating the processed frames.
    """
    processed = []
    for raw_frame in frames:
        processed.append(utils.process_frame(raw_frame))

    return torch.cat(processed)

def get_rand_phis(k, n):
    """Sample ``k`` stacks of 4 consecutive frames from ``n`` random Pong steps.

    A fresh ``Pong-v4`` environment is stepped ``n`` times with actions drawn
    from ``random``; every returned observation is recorded. ``k`` distinct end
    indices are then sampled from ``range(3, n)`` and, for each, the four
    frames ending at that index are converted with ``frames_to_phi``.

    Note: frames keep accumulating across episode resets, so a stack may
    contain frames from two different episodes.

    Args:
        k: number of frame stacks to return.
        n: number of environment steps to play.

    Returns:
        List of ``k`` tensors as produced by ``frames_to_phi``.

    Raises:
        ValueError: from ``random.sample`` if ``k`` exceeds ``n - 3``.
    """
    frames = []
    env = gym.envs.make('Pong-v4')

    try:
        env.reset()

        for _ in range(n):
            a = random.randrange(env.action_space.n)
            s_t1, _, done, _ = env.step(a)
            frames.append(s_t1)

            if done:
                # Start a new episode; the reset observation is not recorded.
                env.reset()
    finally:
        # Release the emulator even if stepping fails part-way through.
        env.close()

    idxs = random.sample(range(3, n), k)
    phis = [frames_to_phi(frames[i-3:i+1]) for i in idxs]

    return phis

def get_frames_avg_qval(phis, agt):
    """Return the mean of the agent's best values over a list of inputs.

    Args:
        phis: list of equally-shaped tensors; they are stacked into a batch.
        agt: object exposing ``get_best_values(batch)`` that returns a tensor.

    Returns:
        Python float: the mean of ``agt.get_best_values`` over the batch.
    """
    batch = torch.stack(phis)
    best_values = agt.get_best_values(batch)
    return best_values.mean().item()

In [15]:
def initialize(replay_mem_size, batch_size):
    """Create the objects the training loop needs.

    Args:
        replay_mem_size: first argument forwarded to ``utils.ReplayMemory``.
        batch_size: second argument forwarded to ``utils.ReplayMemory``.

    Returns:
        Tuple ``(agt, replay_mem, obs_history, env, train_stats)``: a
        ``DQNAgent``, a ``ReplayMemory``, an ``ObsHistory``, a ``Pong-v4``
        environment and a ``TrainingStats`` instance, built in that order.
    """
    # Tuple elements are evaluated left to right, so the construction order
    # (and therefore any RNG consumption) matches the returned order.
    return (
        agent.DQNAgent(),
        utils.ReplayMemory(replay_mem_size, batch_size),
        utils.ObsHistory(),
        gym.envs.make('Pong-v4'),
        TrainingStats(),
    )

def act_step(obs_history, agt, env):
    """Let the agent choose an action and apply it to the environment.

    Args:
        obs_history: object whose ``get_phi()`` returns the agent's input.
        agt: object whose ``act(phi)`` returns an action.
        env: environment whose ``step(action)`` returns a 4-tuple
            ``(observation, reward, done, info)``.

    Returns:
        Tuple ``(a_t, s_t1, r_t, done)``; the ``info`` element is dropped.
    """
    action = agt.act(obs_history.get_phi())
    next_obs, reward, finished, _info = env.step(action)
    return action, next_obs, reward, finished

def store_step(s_t, a_t, r_t, done, s_t1, obs_history, replay_mem):
    """Record the outcome of one environment step.

    The new observation ``s_t1`` is pushed into ``obs_history`` first; then
    ``(s_t, a_t, r_t, done)`` is stored in ``replay_mem``.

    Args:
        s_t: observation the action was taken from.
        a_t: action taken.
        r_t: reward received.
        done: episode-finished flag returned by the environment.
        s_t1: observation returned by the environment after the action.
        obs_history: object with a ``store(observation)`` method.
        replay_mem: object with a ``store(s, a, r, done)`` method.
    """
    obs_history.store(s_t1)

    step_record = (s_t, a_t, r_t, done)
    replay_mem.store(*step_record)

def gradient_step(replay_mem, agt):
    """Run one optimisation step on a sampled mini-batch, if possible.

    Nothing happens until ``replay_mem.size`` is strictly greater than
    ``replay_mem.sample_size + 3``.

    Args:
        replay_mem: object exposing ``size``, ``sample_size`` and ``sample()``.
        agt: agent exposing ``optimizer``; it is also passed to
            ``train.mini_batch_loss``.

    Returns:
        The loss as a Python float, or ``None`` when no step was taken.
    """
    has_enough = replay_mem.size > replay_mem.sample_size + 3
    if not has_enough:
        return None

    batch = replay_mem.sample()

    agt.optimizer.zero_grad()
    batch_loss = train.mini_batch_loss(batch, agt)
    batch_loss.backward()
    agt.optimizer.step()

    return batch_loss.item()

def save_params(agt, episodes, save_path):
    """Write a training checkpoint with ``torch.save``.

    Args:
        agt: agent exposing ``qnet.state_dict()`` and
            ``optimizer.state_dict()``.
        episodes: value stored under the ``'episodes'`` key.
        save_path: destination passed straight to ``torch.save``.
    """
    checkpoint = {
        'model_state_dict': agt.qnet.state_dict(),
        'optimizer_state_dict': agt.optimizer.state_dict(),
        'episodes': episodes
    }
    torch.save(checkpoint, save_path)

def load_params(agt, load_path):
    """Restore an agent from a checkpoint written by ``save_params``.

    Args:
        agt: agent exposing ``qnet.load_state_dict`` and
            ``optimizer.load_state_dict``; both are updated in place.
        load_path: checkpoint location passed straight to ``torch.load``.

    Returns:
        The value stored under the checkpoint's ``'episodes'`` key.
    """
    # Previously read the undefined name `checkpoint_path` instead of the
    # `load_path` argument.
    checkpoint = torch.load(load_path)

    agt.qnet.load_state_dict(checkpoint['model_state_dict'])
    # Previously misspelled as `agt.opimizer`.
    agt.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Previously misspelled as `chekpoint`.
    return checkpoint['episodes']

def reset_episode(env, obs_history):
    """Begin a new episode.

    Resets the environment, re-initialises ``obs_history`` with the first
    observation, and reports the episode as not finished.

    Args:
        env: environment whose ``reset()`` returns the initial observation.
        obs_history: object with a ``reset(observation)`` method.

    Returns:
        Tuple ``(s_t, done)`` where ``s_t`` is the initial observation and
        ``done`` is always ``False``.
    """
    first_obs = env.reset()
    obs_history.reset(first_obs)
    return first_obs, False

In [16]:
def frames_to_phi(frames):
    """Process each raw frame and concatenate the results into one tensor.

    NOTE: a function with the same name and the same behaviour is defined in
    an earlier cell of this notebook; this later definition shadows it.

    Args:
        frames: iterable of raw frames accepted by ``utils.process_frame``.

    Returns:
        ``torch.cat`` of the processed frames, in input order.
    """
    return torch.cat(list(map(utils.process_frame, frames)))

def get_rand_phis(k, n):
    """Sample ``k`` stacks of 4 consecutive frames from ``n`` random Pong steps.

    NOTE: a function with the same name is defined in an earlier cell of this
    notebook; this later definition shadows it.

    A fresh ``Pong-v4`` environment is stepped ``n`` times with actions drawn
    from ``random`` and every returned observation is recorded. ``k`` distinct
    end indices are sampled from ``range(3, n)``; for each, the four frames
    ending at that index are converted with ``frames_to_phi``. Frames keep
    accumulating across episode resets, so a stack may span two episodes.

    Args:
        k: number of frame stacks to return.
        n: number of environment steps to play.

    Returns:
        List of ``k`` tensors as produced by ``frames_to_phi``.

    Raises:
        ValueError: from ``random.sample`` if ``k`` exceeds ``n - 3``.
    """
    frames = []
    env = gym.envs.make('Pong-v4')

    try:
        env.reset()

        for _ in range(n):
            a = random.randrange(env.action_space.n)
            s_t1, _, done, _ = env.step(a)
            frames.append(s_t1)

            if done:
                # Start a new episode; the reset observation is not recorded.
                env.reset()
    finally:
        # Release the emulator even if stepping fails part-way through.
        env.close()

    idxs = random.sample(range(3, n), k)
    phis = [frames_to_phi(frames[i-3:i+1]) for i in idxs]

    return phis

In [28]:
class TrainingStats:
    """Per-episode training metrics plus a fixed Q-value benchmark.

    On construction, a batch of benchmark inputs is sampled once with
    ``get_rand_phis(10, 10000)``. After every episode ``store`` appends the
    episode's reward, step count, average loss and the agent's mean best
    value on that fixed batch, and prints a short summary.
    """

    def __init__(self):
        # One entry per episode in each of these lists.
        self.ep_rewards = []
        self.ep_avg_train_losses = []
        self.steps_per_ep = []
        self.benchmark_qvals = []
        # Fixed batch the agent is re-evaluated on after every episode.
        self.benchmark_frames = torch.stack(get_rand_phis(10, 10000))

    def store(self, agt, ep_reward, ep_steps, ep_loss, episode_num):
        """Record and print the statistics of one finished episode.

        Args:
            agt: agent exposing ``get_best_values(batch)``.
            ep_reward: total reward collected in the episode.
            ep_steps: number of environment steps in the episode.
            ep_loss: sum of the training losses over the episode.
            episode_num: episode index, used only in the printed summary.
        """
        self.ep_rewards.append(ep_reward)
        self.steps_per_ep.append(ep_steps)

        # Guard against a zero-step episode instead of raising
        # ZeroDivisionError; the average is reported as 0.0 in that case.
        avg_ep_loss = ep_loss / ep_steps if ep_steps else 0.0
        self.ep_avg_train_losses.append(avg_ep_loss)

        avg_qvals = self.get_frames_avg_qval(agt)
        self.benchmark_qvals.append(avg_qvals)

        print('Episode {}:'.format(episode_num))
        print('Reward: {}'.format(ep_reward))
        print('Steps: {}'.format(ep_steps))
        print('Avg loss: {:.5f}'.format(avg_ep_loss))
        print('Avg qvals: {:.5f}'.format(avg_qvals))
        print('===========================================')

    def get_frames_avg_qval(self, agt):
        """Return the agent's mean best value on the benchmark batch.

        Args:
            agt: agent exposing ``get_best_values(batch)`` returning a tensor.

        Returns:
            Python float: mean of the returned values.
        """
        # Pure evaluation: only a scalar is extracted, so no autograd graph
        # needs to be recorded.
        with torch.no_grad():
            qvals = agt.get_best_values(self.benchmark_frames)

        return torch.mean(qvals).item()

In [29]:
# Overrides the earlier `num_episodes = int(1e3)`; this is the value the
# training loop below iterates over.
num_episodes = 10000

In [30]:
# Module-level history lists, one per tracked metric.
# NOTE(review): the training loop below records its statistics on the
# TrainingStats instance instead, and nothing visible in this notebook appends
# to these lists — possibly leftovers; confirm before removing.
ep_rewards = []
ep_avg_train_losses = []
steps_per_ep = []
benchmark_qvals = []

In [31]:
# Build the agent, replay memory, observation history, environment and stats
# tracker used by the loop below.
agt, replay_mem, obs_history, env, train_stats = \
    initialize(replay_mem_size, batch_size)

for episode in range(num_episodes):  # loop over episodes
    s_t, done = reset_episode(env, obs_history)
    
    # Per-episode accumulators, reported through train_stats.store() below.
    ep_reward = 0
    ep_train_loss = 0
    ep_steps = 0
    
    while not done:  # loop over steps in episode
        # Act from the current observation history, then record the result.
        # `s_t` passed to store_step is the observation from *before* the action.
        a_t, s_t1, r_t, done = act_step(obs_history, agt, env)
        store_step(s_t, a_t, r_t, done, s_t1, obs_history, replay_mem)
        
        s_t = s_t1
        
        # gradient_step returns None until the replay memory holds enough
        # samples, hence the `is not None` check before accumulating the loss.
        loss_val = gradient_step(replay_mem, agt)
        
        ep_reward += r_t
        ep_steps += 1
        if loss_val is not None:
            ep_train_loss += loss_val 
    
    # ep_train_loss is a sum; TrainingStats.store divides it by ep_steps.
    train_stats.store(agt, ep_reward, ep_steps, ep_train_loss, episode)
    
    # Checkpoint after every 10th episode (episode indices 9, 19, 29, ...).
    # The relative file name means the file lands in the working directory.
    if episode % 10 == 9:
        checkpoint_name = 'dqn_agt_{}.pt'.format(episode)
        save_params(agt, episode, checkpoint_name)
        print('Model saved.')
    

Episode 0:
Reward: -21.0
Steps: 1166
Avg loss: 0.01169
Avg qvals: -0.67752
Episode 1:
Reward: -21.0
Steps: 1163
Avg loss: 0.01747
Avg qvals: -1.73713
Episode 2:
Reward: -21.0
Steps: 1084
Avg loss: 0.01702
Avg qvals: -2.48010
Episode 3:
Reward: -21.0
Steps: 1005
Avg loss: 0.01802
Avg qvals: -1.26754
Episode 4:
Reward: -21.0
Steps: 1146
Avg loss: 0.01635
Avg qvals: -1.31768
Episode 5:
Reward: -19.0
Steps: 1249
Avg loss: 0.01854
Avg qvals: -1.03654
Episode 6:
Reward: -21.0
Steps: 1182
Avg loss: 0.01664
Avg qvals: -1.44374
Episode 7:
Reward: -20.0
Steps: 1199
Avg loss: 0.01664
Avg qvals: -1.61514
Episode 8:
Reward: -21.0
Steps: 1019
Avg loss: 0.01696
Avg qvals: -1.65590
Episode 9:
Reward: -21.0
Steps: 1093
Avg loss: 0.01862
Avg qvals: -2.41460
Model saved.
Episode 10:
Reward: -20.0
Steps: 1308
Avg loss: 0.01718
Avg qvals: -1.54633
Episode 11:
Reward: -21.0
Steps: 1227
Avg loss: 0.01787
Avg qvals: -1.73531
Episode 12:
Reward: -20.0
Steps: 1252
Avg loss: 0.01768
Avg qvals: -2.01598
Episode 1

Episode 68:
Reward: -21.0
Steps: 1035
Avg loss: 0.01713
Avg qvals: -1.61855
Episode 69:
Reward: -21.0
Steps: 1076
Avg loss: 0.01767
Avg qvals: -1.27528
Model saved.
Episode 70:
Reward: -21.0
Steps: 1164
Avg loss: 0.01743
Avg qvals: -1.48893
Episode 71:
Reward: -21.0
Steps: 1039
Avg loss: 0.01780
Avg qvals: -1.79250
Episode 72:
Reward: -21.0
Steps: 1061
Avg loss: 0.01779
Avg qvals: -1.55489
Episode 73:
Reward: -20.0
Steps: 1557
Avg loss: 0.01821
Avg qvals: -1.76433
Episode 74:
Reward: -21.0
Steps: 1131
Avg loss: 0.01727
Avg qvals: -1.71705
Episode 75:
Reward: -19.0
Steps: 1363
Avg loss: 0.01661
Avg qvals: -1.68427
Episode 76:
Reward: -19.0
Steps: 1348
Avg loss: 0.01803
Avg qvals: -1.59744
Episode 77:
Reward: -21.0
Steps: 1024
Avg loss: 0.01760
Avg qvals: -1.44396
Episode 78:
Reward: -20.0
Steps: 1451
Avg loss: 0.01795
Avg qvals: -1.55679
Episode 79:
Reward: -21.0
Steps: 1272
Avg loss: 0.01777
Avg qvals: -1.46793
Model saved.
Episode 80:
Reward: -21.0
Steps: 1048
Avg loss: 0.01723
Avg qv

Episode 136:
Reward: -21.0
Steps: 1022
Avg loss: 0.01722
Avg qvals: -1.34071
Episode 137:
Reward: -21.0
Steps: 1011
Avg loss: 0.01679
Avg qvals: -1.42551
Episode 138:
Reward: -21.0
Steps: 1026
Avg loss: 0.01667
Avg qvals: -1.53372
Episode 139:
Reward: -21.0
Steps: 1030
Avg loss: 0.01776
Avg qvals: -1.75186
Model saved.
Episode 140:
Reward: -21.0
Steps: 1019
Avg loss: 0.01871
Avg qvals: -1.58880
Episode 141:
Reward: -21.0
Steps: 1171
Avg loss: 0.01793
Avg qvals: -1.68215
Episode 142:
Reward: -21.0
Steps: 1105
Avg loss: 0.01841
Avg qvals: -1.57791
Episode 143:
Reward: -21.0
Steps: 1342
Avg loss: 0.01846
Avg qvals: -1.57415
Episode 144:
Reward: -20.0
Steps: 1250
Avg loss: 0.01700
Avg qvals: -1.62093
Episode 145:
Reward: -21.0
Steps: 1136
Avg loss: 0.01826
Avg qvals: -1.60971
Episode 146:
Reward: -20.0
Steps: 1171
Avg loss: 0.01764
Avg qvals: -1.79169
Episode 147:
Reward: -21.0
Steps: 1218
Avg loss: 0.01914
Avg qvals: -1.81584
Episode 148:
Reward: -18.0
Steps: 1385
Avg loss: 0.01733
Avg qv

Episode 203:
Reward: -20.0
Steps: 1112
Avg loss: 0.01763
Avg qvals: -1.47908
Episode 204:
Reward: -20.0
Steps: 1128
Avg loss: 0.01588
Avg qvals: -1.54764
Episode 205:
Reward: -21.0
Steps: 1195
Avg loss: 0.01782
Avg qvals: -1.65002
Episode 206:
Reward: -21.0
Steps: 1092
Avg loss: 0.01706
Avg qvals: -1.67360
Episode 207:
Reward: -21.0
Steps: 1088
Avg loss: 0.01744
Avg qvals: -1.56666
Episode 208:
Reward: -20.0
Steps: 1219
Avg loss: 0.01744
Avg qvals: -1.56931
Episode 209:
Reward: -20.0
Steps: 1139
Avg loss: 0.01779
Avg qvals: -1.71384
Model saved.
Episode 210:
Reward: -21.0
Steps: 1103
Avg loss: 0.01608
Avg qvals: -1.48564
Episode 211:
Reward: -20.0
Steps: 1121
Avg loss: 0.01656
Avg qvals: -1.63207
Episode 212:
Reward: -21.0
Steps: 1094
Avg loss: 0.01877
Avg qvals: -1.72821
Episode 213:
Reward: -20.0
Steps: 1127
Avg loss: 0.01676
Avg qvals: -1.50630
Episode 214:
Reward: -21.0
Steps: 1174
Avg loss: 0.01710
Avg qvals: -1.44992
Episode 215:
Reward: -21.0
Steps: 1126
Avg loss: 0.01888
Avg qv

Episode 270:
Reward: -20.0
Steps: 1122
Avg loss: 0.01805
Avg qvals: -1.47548
Episode 271:
Reward: -20.0
Steps: 1387
Avg loss: 0.01758
Avg qvals: -1.53755
Episode 272:
Reward: -20.0
Steps: 1360
Avg loss: 0.01753
Avg qvals: -1.49485
Episode 273:
Reward: -20.0
Steps: 1352
Avg loss: 0.01745
Avg qvals: -1.53560
Episode 274:
Reward: -20.0
Steps: 1119
Avg loss: 0.01843
Avg qvals: -1.68706
Episode 275:
Reward: -21.0
Steps: 1097
Avg loss: 0.01811
Avg qvals: -1.76823
Episode 276:
Reward: -20.0
Steps: 1175
Avg loss: 0.01735
Avg qvals: -1.82115
Episode 277:
Reward: -19.0
Steps: 1261
Avg loss: 0.01776
Avg qvals: -1.51873
Episode 278:
Reward: -20.0
Steps: 1259
Avg loss: 0.01806
Avg qvals: -1.70976
Episode 279:
Reward: -21.0
Steps: 1107
Avg loss: 0.01796
Avg qvals: -1.50011
Model saved.
Episode 280:
Reward: -20.0
Steps: 1277
Avg loss: 0.01760
Avg qvals: -1.60981
Episode 281:
Reward: -18.0
Steps: 1359
Avg loss: 0.01765
Avg qvals: -1.72821
Episode 282:
Reward: -20.0
Steps: 1397
Avg loss: 0.01802
Avg qv

Episode 338:
Reward: -20.0
Steps: 1226
Avg loss: 0.01658
Avg qvals: -1.50799
Episode 339:
Reward: -20.0
Steps: 1128
Avg loss: 0.01700
Avg qvals: -1.54687
Model saved.
Episode 340:
Reward: -21.0
Steps: 1175
Avg loss: 0.01849
Avg qvals: -1.80079
Episode 341:
Reward: -20.0
Steps: 1127
Avg loss: 0.01768
Avg qvals: -1.62542
Episode 342:
Reward: -18.0
Steps: 1453
Avg loss: 0.01750
Avg qvals: -1.70388
Episode 343:
Reward: -21.0
Steps: 1050
Avg loss: 0.01827
Avg qvals: -1.75215
Episode 344:
Reward: -21.0
Steps: 1082
Avg loss: 0.01754
Avg qvals: -1.56492
Episode 345:
Reward: -21.0
Steps: 1055
Avg loss: 0.01913
Avg qvals: -1.80896
Episode 346:
Reward: -21.0
Steps: 1129
Avg loss: 0.01842
Avg qvals: -1.68906
Episode 347:
Reward: -19.0
Steps: 1357
Avg loss: 0.01647
Avg qvals: -1.52050
Episode 348:
Reward: -21.0
Steps: 1099
Avg loss: 0.01694
Avg qvals: -1.58179
Episode 349:
Reward: -21.0
Steps: 1177
Avg loss: 0.01860
Avg qvals: -1.70238
Model saved.
Episode 350:
Reward: -20.0
Steps: 1232
Avg loss: 0

Episode 405:
Reward: -21.0
Steps: 1015
Avg loss: 0.01875
Avg qvals: -1.87985
Episode 406:
Reward: -21.0
Steps: 1018
Avg loss: 0.01808
Avg qvals: -1.65883
Episode 407:
Reward: -21.0
Steps: 1125
Avg loss: 0.01676
Avg qvals: -1.50316
Episode 408:
Reward: -21.0
Steps: 1027
Avg loss: 0.01738
Avg qvals: -1.67359
Episode 409:
Reward: -20.0
Steps: 1202
Avg loss: 0.01787
Avg qvals: -1.71049
Model saved.
Episode 410:
Reward: -20.0
Steps: 1183
Avg loss: 0.01727
Avg qvals: -1.56065
Episode 411:
Reward: -21.0
Steps: 1155
Avg loss: 0.01758
Avg qvals: -1.62897
Episode 412:
Reward: -21.0
Steps: 1224
Avg loss: 0.01845
Avg qvals: -1.55316
Episode 413:
Reward: -20.0
Steps: 1227
Avg loss: 0.01744
Avg qvals: -1.58188
Episode 414:
Reward: -21.0
Steps: 1142
Avg loss: 0.01842
Avg qvals: -1.83179
Episode 415:
Reward: -20.0
Steps: 1164
Avg loss: 0.01784
Avg qvals: -1.52078
Episode 416:
Reward: -19.0
Steps: 1210
Avg loss: 0.01664
Avg qvals: -1.47969
Episode 417:
Reward: -21.0
Steps: 1049
Avg loss: 0.01774
Avg qv

Episode 472:
Reward: -20.0
Steps: 1225
Avg loss: 0.01860
Avg qvals: -1.62356
Episode 473:
Reward: -20.0
Steps: 1200
Avg loss: 0.01601
Avg qvals: -1.42880
Episode 474:
Reward: -21.0
Steps: 1181
Avg loss: 0.01721
Avg qvals: -1.55026
Episode 475:
Reward: -20.0
Steps: 1163
Avg loss: 0.01743
Avg qvals: -1.55295
Episode 476:
Reward: -21.0
Steps: 1025
Avg loss: 0.01860
Avg qvals: -1.75678
Episode 477:
Reward: -20.0
Steps: 1132
Avg loss: 0.01733
Avg qvals: -1.57479
Episode 478:
Reward: -20.0
Steps: 1332
Avg loss: 0.01729
Avg qvals: -1.65608
Episode 479:
Reward: -20.0
Steps: 1279
Avg loss: 0.01796
Avg qvals: -1.75801
Model saved.
Episode 480:
Reward: -21.0
Steps: 1061
Avg loss: 0.01777
Avg qvals: -1.58509
Episode 481:
Reward: -21.0
Steps: 1146
Avg loss: 0.01827
Avg qvals: -1.66157
Episode 482:
Reward: -20.0
Steps: 1133
Avg loss: 0.01697
Avg qvals: -1.60994
Episode 483:
Reward: -20.0
Steps: 1160
Avg loss: 0.01784
Avg qvals: -1.71887
Episode 484:
Reward: -20.0
Steps: 1174
Avg loss: 0.01785
Avg qv

Episode 540:
Reward: -21.0
Steps: 1008
Avg loss: 0.01894
Avg qvals: -1.86918
Episode 541:
Reward: -20.0
Steps: 1116
Avg loss: 0.01713
Avg qvals: -1.52135
Episode 542:
Reward: -20.0
Steps: 1168
Avg loss: 0.01746
Avg qvals: -1.54582
Episode 543:
Reward: -21.0
Steps: 1046
Avg loss: 0.01703
Avg qvals: -1.55187
Episode 544:
Reward: -21.0
Steps: 1108
Avg loss: 0.01688
Avg qvals: -1.53449
Episode 545:
Reward: -20.0
Steps: 1138
Avg loss: 0.01676
Avg qvals: -1.63531
Episode 546:
Reward: -20.0
Steps: 1159
Avg loss: 0.01721
Avg qvals: -1.73066
Episode 547:
Reward: -20.0
Steps: 1116
Avg loss: 0.01773
Avg qvals: -1.63765
Episode 548:
Reward: -18.0
Steps: 1513
Avg loss: 0.01721
Avg qvals: -1.51638
Episode 549:
Reward: -21.0
Steps: 1174
Avg loss: 0.01719
Avg qvals: -1.56676
Model saved.
Episode 550:
Reward: -20.0
Steps: 1192
Avg loss: 0.01594
Avg qvals: -1.46168
Episode 551:
Reward: -20.0
Steps: 1216
Avg loss: 0.01863
Avg qvals: -1.77602
Episode 552:
Reward: -20.0
Steps: 1186
Avg loss: 0.01813
Avg qv

Episode 608:
Reward: -20.0
Steps: 1211
Avg loss: 0.01740
Avg qvals: -1.55346
Episode 609:
Reward: -21.0
Steps: 1072
Avg loss: 0.01711
Avg qvals: -1.65059
Model saved.
Episode 610:
Reward: -21.0
Steps: 1169
Avg loss: 0.01666
Avg qvals: -1.53726
Episode 611:
Reward: -21.0
Steps: 1106
Avg loss: 0.01728
Avg qvals: -1.54913
Episode 612:
Reward: -20.0
Steps: 1167
Avg loss: 0.01702
Avg qvals: -1.53392
Episode 613:
Reward: -20.0
Steps: 1124
Avg loss: 0.01897
Avg qvals: -1.71038
Episode 614:
Reward: -20.0
Steps: 1133
Avg loss: 0.01897
Avg qvals: -1.69055
Episode 615:
Reward: -20.0
Steps: 1165
Avg loss: 0.01703
Avg qvals: -1.49065
Episode 616:
Reward: -21.0
Steps: 1126
Avg loss: 0.01708
Avg qvals: -1.57371
Episode 617:
Reward: -20.0
Steps: 1117
Avg loss: 0.01689
Avg qvals: -1.65468
Episode 618:
Reward: -21.0
Steps: 1026
Avg loss: 0.01687
Avg qvals: -1.56187
Episode 619:
Reward: -21.0
Steps: 1062
Avg loss: 0.01838
Avg qvals: -1.64035
Model saved.
Episode 620:
Reward: -19.0
Steps: 1410
Avg loss: 0

Episode 675:
Reward: -21.0
Steps: 1070
Avg loss: 0.01798
Avg qvals: -1.47466
Episode 676:
Reward: -21.0
Steps: 1201
Avg loss: 0.01786
Avg qvals: -1.44616
Episode 677:
Reward: -20.0
Steps: 1161
Avg loss: 0.01703
Avg qvals: -1.40467
Episode 678:
Reward: -20.0
Steps: 1201
Avg loss: 0.01719
Avg qvals: -1.49762
Episode 679:
Reward: -21.0
Steps: 1015
Avg loss: 0.01848
Avg qvals: -1.54907
Model saved.
Episode 680:
Reward: -21.0
Steps: 1072
Avg loss: 0.01615
Avg qvals: -1.46126
Episode 681:
Reward: -21.0
Steps: 1031
Avg loss: 0.01790
Avg qvals: -1.78459
Episode 682:
Reward: -20.0
Steps: 1273
Avg loss: 0.01855
Avg qvals: -1.71539
Episode 683:
Reward: -19.0
Steps: 1428
Avg loss: 0.01771
Avg qvals: -1.72647
Episode 684:
Reward: -20.0
Steps: 1120
Avg loss: 0.01831
Avg qvals: -1.90290
Episode 685:
Reward: -21.0
Steps: 1028
Avg loss: 0.01788
Avg qvals: -1.53448
Episode 686:
Reward: -21.0
Steps: 1019
Avg loss: 0.01769
Avg qvals: -1.55146
Episode 687:
Reward: -19.0
Steps: 1488
Avg loss: 0.01790
Avg qv

Episode 742:
Reward: -21.0
Steps: 1016
Avg loss: 0.01616
Avg qvals: -1.59388
Episode 743:
Reward: -21.0
Steps: 1006
Avg loss: 0.01829
Avg qvals: -1.71668
Episode 744:
Reward: -21.0
Steps: 1026
Avg loss: 0.01855
Avg qvals: -1.70285
Episode 745:
Reward: -21.0
Steps: 1021
Avg loss: 0.01739
Avg qvals: -1.61068
Episode 746:
Reward: -21.0
Steps: 1019
Avg loss: 0.01808
Avg qvals: -1.72059
Episode 747:
Reward: -21.0
Steps: 1012
Avg loss: 0.01651
Avg qvals: -1.47273
Episode 748:
Reward: -21.0
Steps: 1026
Avg loss: 0.01678
Avg qvals: -1.42543
Episode 749:
Reward: -21.0
Steps: 1018
Avg loss: 0.01659
Avg qvals: -1.63931
Model saved.
Episode 750:
Reward: -21.0
Steps: 1021
Avg loss: 0.01875
Avg qvals: -1.54414
Episode 751:
Reward: -21.0
Steps: 1018
Avg loss: 0.01814
Avg qvals: -1.54810
Episode 752:
Reward: -21.0
Steps: 1016
Avg loss: 0.01806
Avg qvals: -1.52662
Episode 753:
Reward: -21.0
Steps: 1020
Avg loss: 0.01698
Avg qvals: -1.65775
Episode 754:
Reward: -21.0
Steps: 1015
Avg loss: 0.01614
Avg qv

Episode 810:
Reward: -20.0
Steps: 1143
Avg loss: 0.01764
Avg qvals: -1.52223
Episode 811:
Reward: -21.0
Steps: 1027
Avg loss: 0.01858
Avg qvals: -1.61020
Episode 812:
Reward: -21.0
Steps: 1058
Avg loss: 0.01866
Avg qvals: -1.82545
Episode 813:
Reward: -20.0
Steps: 1281
Avg loss: 0.01779
Avg qvals: -1.59912
Episode 814:
Reward: -21.0
Steps: 1047
Avg loss: 0.01784
Avg qvals: -1.60557
Episode 815:
Reward: -20.0
Steps: 1097
Avg loss: 0.01797
Avg qvals: -1.73432
Episode 816:
Reward: -21.0
Steps: 1030
Avg loss: 0.01795
Avg qvals: -1.54169
Episode 817:
Reward: -21.0
Steps: 1146
Avg loss: 0.01742
Avg qvals: -1.57579
Episode 818:
Reward: -21.0
Steps: 1053
Avg loss: 0.01817
Avg qvals: -1.78967
Episode 819:
Reward: -21.0
Steps: 1017
Avg loss: 0.01964
Avg qvals: -1.71724
Model saved.
Episode 820:
Reward: -20.0
Steps: 1118
Avg loss: 0.01763
Avg qvals: -1.64526
Episode 821:
Reward: -20.0
Steps: 1127
Avg loss: 0.01863
Avg qvals: -1.73861
Episode 822:
Reward: -20.0
Steps: 1248
Avg loss: 0.01774
Avg qv

Episode 878:
Reward: -21.0
Steps: 1011
Avg loss: 0.01771
Avg qvals: -1.59725
Episode 879:
Reward: -21.0
Steps: 1021
Avg loss: 0.01857
Avg qvals: -1.50607
Model saved.
Episode 880:
Reward: -21.0
Steps: 1019
Avg loss: 0.01614
Avg qvals: -1.44113
Episode 881:
Reward: -21.0
Steps: 1015
Avg loss: 0.01839
Avg qvals: -1.76491
Episode 882:
Reward: -21.0
Steps: 1026
Avg loss: 0.01816
Avg qvals: -1.69848
Episode 883:
Reward: -21.0
Steps: 1010
Avg loss: 0.01649
Avg qvals: -1.55783
Episode 884:
Reward: -21.0
Steps: 1027
Avg loss: 0.01723
Avg qvals: -1.50333
Episode 885:
Reward: -21.0
Steps: 1024
Avg loss: 0.01826
Avg qvals: -1.81065
Episode 886:
Reward: -21.0
Steps: 1024
Avg loss: 0.01657
Avg qvals: -1.51084
Episode 887:
Reward: -21.0
Steps: 1020
Avg loss: 0.01947
Avg qvals: -1.73527
Episode 888:
Reward: -21.0
Steps: 1031
Avg loss: 0.01992
Avg qvals: -1.91400
Episode 889:
Reward: -21.0
Steps: 1025
Avg loss: 0.02000
Avg qvals: -1.74036
Model saved.
Episode 890:
Reward: -21.0
Steps: 1024
Avg loss: 0

Episode 945:
Reward: -21.0
Steps: 1133
Avg loss: 0.01795
Avg qvals: -1.66337
Episode 946:
Reward: -21.0
Steps: 1031
Avg loss: 0.01794
Avg qvals: -1.53986
Episode 947:
Reward: -21.0
Steps: 1021
Avg loss: 0.01769
Avg qvals: -1.63663
Episode 948:
Reward: -20.0
Steps: 1188
Avg loss: 0.01829
Avg qvals: -1.64623
Episode 949:
Reward: -21.0
Steps: 1064
Avg loss: 0.01726
Avg qvals: -1.37411
Model saved.
Episode 950:
Reward: -20.0
Steps: 1116
Avg loss: 0.01817
Avg qvals: -1.66777
Episode 951:
Reward: -21.0
Steps: 1050
Avg loss: 0.01839
Avg qvals: -1.63709
Episode 952:
Reward: -20.0
Steps: 1123
Avg loss: 0.01962
Avg qvals: -1.70805
Episode 953:
Reward: -21.0
Steps: 1058
Avg loss: 0.01832
Avg qvals: -1.71273
Episode 954:
Reward: -21.0
Steps: 1132
Avg loss: 0.01698
Avg qvals: -1.51854
Episode 955:
Reward: -21.0
Steps: 1052
Avg loss: 0.01731
Avg qvals: -1.67418
Episode 956:
Reward: -21.0
Steps: 1056
Avg loss: 0.01682
Avg qvals: -1.52519
Episode 957:
Reward: -20.0
Steps: 1178
Avg loss: 0.01941
Avg qv

Episode 1012:
Reward: -21.0
Steps: 1042
Avg loss: 0.01935
Avg qvals: -1.75156
Episode 1013:
Reward: -20.0
Steps: 1121
Avg loss: 0.02085
Avg qvals: -1.91180
Episode 1014:
Reward: -21.0
Steps: 1021
Avg loss: 0.01763
Avg qvals: -1.60124
Episode 1015:
Reward: -20.0
Steps: 1203
Avg loss: 0.01817
Avg qvals: -1.71300
Episode 1016:
Reward: -21.0
Steps: 1165
Avg loss: 0.01783
Avg qvals: -1.70365
Episode 1017:
Reward: -21.0
Steps: 1005
Avg loss: 0.01842
Avg qvals: -1.83917
Episode 1018:
Reward: -21.0
Steps: 1130
Avg loss: 0.01879
Avg qvals: -1.72760
Episode 1019:
Reward: -21.0
Steps: 1067
Avg loss: 0.01823
Avg qvals: -1.70728
Model saved.
Episode 1020:
Reward: -21.0
Steps: 1172
Avg loss: 0.01671
Avg qvals: -1.56387
Episode 1021:
Reward: -21.0
Steps: 1081
Avg loss: 0.01783
Avg qvals: -1.71465
Episode 1022:
Reward: -20.0
Steps: 1296
Avg loss: 0.01784
Avg qvals: -1.65424
Episode 1023:
Reward: -20.0
Steps: 1130
Avg loss: 0.01751
Avg qvals: -1.58262
Episode 1024:
Reward: -21.0
Steps: 1103
Avg loss: 0

Episode 1079:
Reward: -21.0
Steps: 1012
Avg loss: 0.01699
Avg qvals: -1.56555
Model saved.
Episode 1080:
Reward: -21.0
Steps: 1092
Avg loss: 0.01830
Avg qvals: -1.59929
Episode 1081:
Reward: -21.0
Steps: 1023
Avg loss: 0.01726
Avg qvals: -1.76158
Episode 1082:
Reward: -21.0
Steps: 1174
Avg loss: 0.01927
Avg qvals: -1.60310
Episode 1083:
Reward: -20.0
Steps: 1260
Avg loss: 0.01793
Avg qvals: -1.78146
Episode 1084:
Reward: -21.0
Steps: 1008
Avg loss: 0.01909
Avg qvals: -1.68068
Episode 1085:
Reward: -20.0
Steps: 1235
Avg loss: 0.01809
Avg qvals: -1.67443
Episode 1086:
Reward: -20.0
Steps: 1126
Avg loss: 0.01715
Avg qvals: -1.47094
Episode 1087:
Reward: -21.0
Steps: 1026
Avg loss: 0.01795
Avg qvals: -1.66529
Episode 1088:
Reward: -20.0
Steps: 1122
Avg loss: 0.01811
Avg qvals: -1.67736
Episode 1089:
Reward: -21.0
Steps: 1186
Avg loss: 0.01855
Avg qvals: -1.55053
Model saved.
Episode 1090:
Reward: -21.0
Steps: 1012
Avg loss: 0.01610
Avg qvals: -1.50710
Episode 1091:
Reward: -21.0
Steps: 103

Episode 1146:
Reward: -19.0
Steps: 1286
Avg loss: 0.01678
Avg qvals: -1.62470
Episode 1147:
Reward: -21.0
Steps: 1157
Avg loss: 0.01873
Avg qvals: -1.50555
Episode 1148:
Reward: -21.0
Steps: 1097
Avg loss: 0.01766
Avg qvals: -1.65822
Episode 1149:
Reward: -21.0
Steps: 1077
Avg loss: 0.01892
Avg qvals: -1.84310
Model saved.
Episode 1150:
Reward: -21.0
Steps: 1186
Avg loss: 0.01689
Avg qvals: -1.48789
Episode 1151:
Reward: -21.0
Steps: 1083
Avg loss: 0.01942
Avg qvals: -1.75080
Episode 1152:
Reward: -20.0
Steps: 1163
Avg loss: 0.01859
Avg qvals: -1.75351
Episode 1153:
Reward: -20.0
Steps: 1157
Avg loss: 0.01832
Avg qvals: -1.80610
Episode 1154:
Reward: -21.0
Steps: 1101
Avg loss: 0.01771
Avg qvals: -1.54438
Episode 1155:
Reward: -21.0
Steps: 1042
Avg loss: 0.01875
Avg qvals: -1.76561
Episode 1156:
Reward: -21.0
Steps: 1038
Avg loss: 0.01871
Avg qvals: -1.59188
Episode 1157:
Reward: -21.0
Steps: 1025
Avg loss: 0.01841
Avg qvals: -1.80421
Episode 1158:
Reward: -20.0
Steps: 1123
Avg loss: 0

Episode 1213:
Reward: -19.0
Steps: 1274
Avg loss: 0.01716
Avg qvals: -1.67258
Episode 1214:
Reward: -21.0
Steps: 1110
Avg loss: 0.01785
Avg qvals: -1.54365
Episode 1215:
Reward: -20.0
Steps: 1237
Avg loss: 0.01876
Avg qvals: -1.69830
Episode 1216:
Reward: -20.0
Steps: 1165
Avg loss: 0.01876
Avg qvals: -1.76362
Episode 1217:
Reward: -20.0
Steps: 1202
Avg loss: 0.01951
Avg qvals: -1.86966
Episode 1218:
Reward: -21.0
Steps: 1010
Avg loss: 0.01869
Avg qvals: -1.74902
Episode 1219:
Reward: -21.0
Steps: 1025
Avg loss: 0.01721
Avg qvals: -1.68789
Model saved.
Episode 1220:
Reward: -21.0
Steps: 1007
Avg loss: 0.01904
Avg qvals: -1.80555
Episode 1221:
Reward: -21.0
Steps: 1096
Avg loss: 0.01928
Avg qvals: -1.68514
Episode 1222:
Reward: -21.0
Steps: 1007
Avg loss: 0.01762
Avg qvals: -1.56899
Episode 1223:
Reward: -21.0
Steps: 1001
Avg loss: 0.01931
Avg qvals: -1.81585
Episode 1224:
Reward: -20.0
Steps: 1197
Avg loss: 0.01753
Avg qvals: -1.71453
Episode 1225:
Reward: -21.0
Steps: 1006
Avg loss: 0

Episode 1280:
Reward: -21.0
Steps: 1145
Avg loss: 0.01978
Avg qvals: -1.78576
Episode 1281:
Reward: -21.0
Steps: 1014
Avg loss: 0.01893
Avg qvals: -1.64835
Episode 1282:
Reward: -21.0
Steps: 1062
Avg loss: 0.01806
Avg qvals: -1.76483
Episode 1283:
Reward: -21.0
Steps: 1049
Avg loss: 0.01805
Avg qvals: -1.50967
Episode 1284:
Reward: -18.0
Steps: 1339
Avg loss: 0.01904
Avg qvals: -1.93343
Episode 1285:
Reward: -20.0
Steps: 1161
Avg loss: 0.01750
Avg qvals: -1.54005
Episode 1286:
Reward: -21.0
Steps: 1085
Avg loss: 0.01711
Avg qvals: -1.46661
Episode 1287:
Reward: -21.0
Steps: 1094
Avg loss: 0.02015
Avg qvals: -1.85831
Episode 1288:
Reward: -21.0
Steps: 1017
Avg loss: 0.01912
Avg qvals: -1.62444
Episode 1289:
Reward: -18.0
Steps: 1401
Avg loss: 0.01786
Avg qvals: -1.73574
Model saved.
Episode 1290:
Reward: -21.0
Steps: 1127
Avg loss: 0.01768
Avg qvals: -1.57248
Episode 1291:
Reward: -20.0
Steps: 1187
Avg loss: 0.01981
Avg qvals: -1.80797
Episode 1292:
Reward: -21.0
Steps: 1051
Avg loss: 0

Episode 1347:
Reward: -20.0
Steps: 1164
Avg loss: 0.01865
Avg qvals: -1.68758
Episode 1348:
Reward: -20.0
Steps: 1141
Avg loss: 0.01820
Avg qvals: -1.59451
Episode 1349:
Reward: -20.0
Steps: 1150
Avg loss: 0.01889
Avg qvals: -1.83157
Model saved.
Episode 1350:
Reward: -21.0
Steps: 1024
Avg loss: 0.01750
Avg qvals: -1.51452
Episode 1351:
Reward: -21.0
Steps: 1005
Avg loss: 0.01875
Avg qvals: -1.73966
Episode 1352:
Reward: -21.0
Steps: 1164
Avg loss: 0.01827
Avg qvals: -1.46176
Episode 1353:
Reward: -20.0
Steps: 1107
Avg loss: 0.01815
Avg qvals: -1.65653
Episode 1354:
Reward: -21.0
Steps: 1083
Avg loss: 0.01901
Avg qvals: -1.78708
Episode 1355:
Reward: -21.0
Steps: 1060
Avg loss: 0.01831
Avg qvals: -1.82218
Episode 1356:
Reward: -20.0
Steps: 1127
Avg loss: 0.01969
Avg qvals: -1.85580
Episode 1357:
Reward: -21.0
Steps: 1017
Avg loss: 0.01909
Avg qvals: -1.91006
Episode 1358:
Reward: -21.0
Steps: 1043
Avg loss: 0.01797
Avg qvals: -1.80359
Episode 1359:
Reward: -21.0
Steps: 1065
Avg loss: 0

Episode 1414:
Reward: -20.0
Steps: 1143
Avg loss: 0.01914
Avg qvals: -1.75235
Episode 1415:
Reward: -20.0
Steps: 1159
Avg loss: 0.01798
Avg qvals: -1.65576
Episode 1416:
Reward: -21.0
Steps: 1092
Avg loss: 0.01962
Avg qvals: -1.87114
Episode 1417:
Reward: -21.0
Steps: 1011
Avg loss: 0.02019
Avg qvals: -1.81749
Episode 1418:
Reward: -21.0
Steps: 1028
Avg loss: 0.01809
Avg qvals: -1.66692
Episode 1419:
Reward: -21.0
Steps: 1136
Avg loss: 0.01787
Avg qvals: -1.62437
Model saved.
Episode 1420:
Reward: -21.0
Steps: 1049
Avg loss: 0.01997
Avg qvals: -1.74695
Episode 1421:
Reward: -21.0
Steps: 1026
Avg loss: 0.02045
Avg qvals: -1.83599
Episode 1422:
Reward: -21.0
Steps: 1017
Avg loss: 0.01780
Avg qvals: -1.55712
Episode 1423:
Reward: -21.0
Steps: 1178
Avg loss: 0.01861
Avg qvals: -1.73912
Episode 1424:
Reward: -21.0
Steps: 1009
Avg loss: 0.01852
Avg qvals: -1.75277
Episode 1425:
Reward: -20.0
Steps: 1135
Avg loss: 0.01865
Avg qvals: -1.89768
Episode 1426:
Reward: -20.0
Steps: 1278
Avg loss: 0

Episode 1481:
Reward: -21.0
Steps: 1019
Avg loss: 0.01914
Avg qvals: -1.76436
Episode 1482:
Reward: -21.0
Steps: 1041
Avg loss: 0.01741
Avg qvals: -1.53654
Episode 1483:
Reward: -21.0
Steps: 1004
Avg loss: 0.01851
Avg qvals: -1.67582
Episode 1484:
Reward: -21.0
Steps: 1021
Avg loss: 0.01854
Avg qvals: -1.56474
Episode 1485:
Reward: -21.0
Steps: 1020
Avg loss: 0.01874
Avg qvals: -2.02387
Episode 1486:
Reward: -21.0
Steps: 1016
Avg loss: 0.01871
Avg qvals: -1.69595
Episode 1487:
Reward: -21.0
Steps: 1020
Avg loss: 0.01778
Avg qvals: -1.52573
Episode 1488:
Reward: -21.0
Steps: 1015
Avg loss: 0.01822
Avg qvals: -1.61536
Episode 1489:
Reward: -21.0
Steps: 1023
Avg loss: 0.01770
Avg qvals: -1.68665
Model saved.
Episode 1490:
Reward: -21.0
Steps: 1012
Avg loss: 0.01911
Avg qvals: -1.66508
Episode 1491:
Reward: -21.0
Steps: 1024
Avg loss: 0.01904
Avg qvals: -1.75541
Episode 1492:
Reward: -21.0
Steps: 1029
Avg loss: 0.01909
Avg qvals: -1.54582
Episode 1493:
Reward: -21.0
Steps: 1020
Avg loss: 0

KeyboardInterrupt: 

## Manually Go Through One Iteration

In [None]:
import numpy as np
import torch

import gym
from src import agent, dqn, train, utils

#### Create Gym Environment

In [None]:
env = gym.envs.make('Pong-v4')

In [None]:
init_obs = env.reset()

# Play 20000 uniformly random actions, rendering after every step and
# restarting the environment whenever an episode ends.
# Relies on `random` and `time` imported in the earlier import cell.
for i in range(20000):
    a = random.randrange(env.action_space.n)
    _, _, done, _ = env.step(a)
    time.sleep(.01)  # short pause per step — presumably to keep the rendering watchable
    env.render()
    
    if done:
        env.reset()

In [None]:
init_obs = env.reset()
init_obs.shape

In [None]:
s1, r, done, _ = env.step(0)

In [None]:
s1.shape

#### Initialize DQN objects

In [None]:
replay_mem_size = int(1e6)
mini_batch_size = 32

In [None]:
agt = agent.DQNAgent()
replay_memory = utils.ReplayMemory(replay_mem_size, mini_batch_size)
obs_history = utils.ObsHistory()

#### Begin new episode

In [None]:
obs_init = env.reset()  # reset environment to start new episode
obs_history.reset(obs_init)  # reset observations for new episode
done = False

#### Choose action

In [None]:
# One interaction step: read the current agent input, let the agent pick an
# action, apply it to the environment and record the new observation.
# NOTE(review): act_step() in the training section obtains the same input via
# obs_history.get_phi(); here it is read as the attribute `.phi` — confirm
# which form utils.ObsHistory actually exposes.
phi = obs_history.phi
a = agt.act(phi)
obs, rew, done, _ = env.step(a)
obs_history.store(obs)

#### Store Transition

In [None]:
# Read the agent input after the step and store the full transition.
# NOTE(review): store_step() in the training section calls
# replay_mem.store(s_t, a_t, r_t, done) with four positional arguments, while
# this cell passes a single 5-tuple — confirm the signature of
# utils.ReplayMemory.store.
phi_1 = obs_history.phi
replay_memory.store((phi, a, rew, phi_1, done))

#### Gradient Step

[x] Dummy transitions function for testing.

[x] Make `r` reward vector from transitions.

[x] Make $\max_{a'} Q(s', a')$ vector from transitions.

[x] Make `y` target vector from transitions.

In [None]:
from collections import namedtuple
import torch
from torch import nn
import numpy as np

# Record type for one transition used by the prototype functions below:
# input before the step (`phi`), action (`a`), reward (`r`), input after the
# step (`phi_1`) and the episode-finished flag (`done`).
Transition = namedtuple('Transition', 
                        ['phi', 'a', 'r', 'phi_1', 'done'])

In [None]:
def dummy_transitions(n):
    """Build ``n`` randomly filled ``Transition`` records for testing.

    Each record holds two ``(4, 84, 84)`` tensors filled in place by
    ``random_(0, 255)``, an action from ``np.random.randint(0, 6)``, a reward
    from ``np.random.randint(0, 2)`` and a boolean ``done`` flag.

    Args:
        n: number of transitions to create.

    Returns:
        List of ``n`` ``Transition`` tuples.
    """
    def _one_transition():
        # Draw order (phi, phi_1, a, r, done) is kept fixed so the RNG
        # streams are consumed in the same sequence for every record.
        state = torch.empty(4, 84, 84).random_(0, 255)
        next_state = torch.empty(4, 84, 84).random_(0, 255)
        action = np.random.randint(0, 6)
        reward = np.random.randint(0, 2)
        finished = bool(np.random.randint(0, 2))
        return Transition(state, action, reward, next_state, finished)

    return [_one_transition() for _ in range(n)]

In [None]:
transitions = dummy_transitions(3)
    
phi, a, r, phi_1, done = zip(*transitions)

In [None]:
def make_y(transitions, agt, gamma=.99):
    """Build the target vector ``y`` for a batch of transitions.

    For a terminal transition the target is the reward ``r``; otherwise it is
    ``r + gamma * agt.get_best_values(phi_1)``, i.e. the reward plus the
    discounted best value of the *successor* input.

    Args:
        transitions: iterable of records with ``r``, ``phi_1`` and ``done``
            fields (e.g. ``Transition`` tuples).
        agt: agent exposing ``get_best_values(batch)``; called with a batch of
            one and expected to return a single-element tensor.
        gamma: discount factor applied to the bootstrap term (default 0.99).

    Returns:
        1-D float32 tensor with one target per transition.
    """
    y = []

    for tr in transitions:
        if tr.done:
            y.append(tr.r)
        else:
            # Bootstrap from the successor input phi_1. The previous version
            # evaluated tr.phi (the input *before* the action) here, which does
            # not match the max_{a'} Q(s', a') target.
            x = tr.phi_1.unsqueeze(0)
            # .item() yields a plain float, so no gradient flows into the target.
            y.append(tr.r + gamma * agt.get_best_values(x).item())

    return torch.tensor(y, dtype=torch.float32)

In [None]:
y = make_y(transitions, agt)

In [None]:
def get_max_vals(transitions, agt):
    """Evaluate the agent's best values on the ``phi`` of every transition.

    Args:
        transitions: iterable of records with a ``phi`` tensor field.
        agt: agent exposing ``get_best_values(batch)``.

    Returns:
        Whatever ``agt.get_best_values`` returns for the stacked ``phi`` batch.
    """
    batch = torch.stack([tr.phi for tr in transitions])
    return agt.get_best_values(batch)

In [None]:
qmax = get_max_vals(transitions, agt)

In [None]:
loss = nn.MSELoss(reduction='mean')

In [None]:
loss(y, qmax)

In [None]:
def mini_batch_loss(transitions, agt):
    """Mean-squared error between the targets and the agent's best values.

    Prototype defined in this notebook; the training loop above calls
    ``train.mini_batch_loss`` from the ``src`` package instead.

    NOTE(review): the values compared against the targets come from
    ``get_max_vals`` (best value per ``phi``), not the value of the action
    actually taken — confirm this is the intended loss.

    Args:
        transitions: batch of transition records accepted by ``make_y`` and
            ``get_max_vals``.
        agt: agent forwarded to both helpers.

    Returns:
        Scalar loss tensor (mean reduction).
    """
    targets = make_y(transitions, agt)
    best_values = get_max_vals(transitions, agt)

    return nn.functional.mse_loss(targets, best_values, reduction='mean')

In [None]:
mb_loss = mini_batch_loss(transitions, agt)

#### Optimizer and gradient step

In [None]:
from torch import optim

optimizer = optim.RMSprop(agt.qnet.parameters())

In [None]:
optimizer.zero_grad()

In [None]:
mb_loss = mini_batch_loss(transitions, agt)
mb_loss

In [None]:
mb_loss.backward()

In [None]:
optimizer.step()

### Test random agent on Pong

In [None]:
s0 = env.reset()

In [None]:
# Collect observations from 1000 uniformly random actions, starting with the
# initial observation returned by reset().
obss = []
obss.append(env.reset())

for _ in range(1000):
    a = np.random.choice(env.action_space.n)
    obs, rew, done, _ = env.step(a)
    obss.append(obs)
    
    if done:
        # NOTE: the observation returned by this reset is assigned to `obs`
        # but never appended, so the first frame of each new episode is
        # missing from `obss`.
        obs = env.reset()

In [None]:
env.action_space.n