In [1]:
# !python -m pip install "gymnasium[atari]"
# !python -m pip install "gymnasium[accept-rom-license, atari]"
# !pip install shimmy

In [2]:
import matplotlib.pyplot as plt
import gymnasium as gym
import seaborn as sns
import numpy as np

import warnings
import ale_py
import shimmy
import joblib
import os

from gym import wrappers

| **Value** | **Meaning** |
|:---------:|:-----------:|
| 0 | NOOP |
| 1 | FIRE |
| 2 | RIGHT |
| 3 | LEFT |
| 4 | RIGHTFIRE |
| 5 | LEFTFIRE |

# General Functions

In [3]:
def show_obs(obs):
    """ 
    Simple display of image observation 
    
    Args:
    `obs` : np.ndarray
    - Observation from the environment
    """
    plt.figure(figsize=(16,10))
    plt.imshow(obs)
    plt.show()
    return

# Policy Functions

In [4]:
def discount_rewards(rewards, gamma=.99):
    """ 
    Take 1D array of rewards and compute discounted version
    Most recent action has the greatest weight 
    
    Args:
    `rewards` : np.ndarray
    - Observed rewards over time
    - ndim : 1
    """
    discounted_rewards = torch.zeros(len(rewards)).double()
    running_add = 0
    for t in reversed(range(0, len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted_rewards[t] = running_add
    return discounted_rewards

# Run Constants

In [5]:
# Config flags - video output and res
resume = True # resume training from previous checkpoint (from save.p  file)?
render = False # render video output?
print_ = False # print each observation
show = False
no_grad = False
corner_correct = True

timer_i = 1000 # number of iterations without reward before noise is intentionally greater than signal

record_probs = True
record_rewards = True
record_eps_iters = True
save_path = 'model.pt'

# Model Instantiation

In [6]:
OBS_SHAPE = (210, 160)
XMIN = 26
XMAX = 196
YMIN = 14
YMAX = 144
SHAPE = (XMAX - XMIN, YMAX - YMIN)
DOWNSAMPLE = False
DIM = np.prod(SHAPE) // 4 if DOWNSAMPLE else np.prod(SHAPE)

action_dict = {
    0 : 'NOOP',
    1 : 'FIRE',
    2 : 'RIGHT',
    3 : 'LEFT',
    4 : 'RIGHTFIRE',
    5 : 'LEFTFIRE'
}
ACTIONS = [0,1,2,3,4,5] # modify to limit available actions
N_CLASSES = len(ACTIONS)

print('Input Shape:', SHAPE)
print('Input Dimensionality:', DIM)

def preprocess(obs, downsample=True, xmin=26, xmax=196, ymin=10, ymax=144):
    assert obs.shape == (210, 160)
    I = obs[xmin:xmax,ymin:ymax] # crop - remove 35px from start & 25px from end of image in x, to reduce redundant parts of image (i.e. after ball passes paddle)
    if downsample:
        I = I[::2,:]
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else to 1
    return I.astype(np.float32).ravel() # ravel flattens an array and collapses it into a column vector

Input Shape: (170, 130)
Input Dimensionality: 22100


In [7]:
prev_x = None # used in computing the difference frame
# xs, hs, dlogps, drs = list(), list(), list(), list()
dlogps, drs = list(), list()
running_reward = None

reward_sum = 0
adj_reward_sum = 0

In [8]:
def add_noise(probs, i, i_since_r, timer_i, buffer=None, print_=False):
    if buffer is None:
        buffer = timer_i // 2
    n = len(probs)
    sigma = 2 / n
    noise = np.random.normal(0, sigma, size=n)
    noise = noise - np.mean(noise)
    
    scale = i_since_r / (timer_i - buffer)
    noise = noise * scale
    assert not round(np.mean(noise), 3), noise
    if print_ and not i % 100:
        print(probs)
        print(noise)
    new_probs = probs.detach().numpy() + noise
    pmin = np.amin(new_probs)
    if pmin < 0:
        new_probs -= pmin
        new_probs /= np.sum(new_probs)
    new_probs = torch.from_numpy(new_probs)
    return new_probs

def balance_lr(probs, i_since_r, timer_i, buffer=None):
#    ACTIONS = [NOOP,1,2,3,4,5]
    if i_since_r < timer_i // 4:
        pass
    elif i_since_r < timer_i // 2:
        equal_n = (probs[2] + probs[3]) / 2
        equal_y = (probs[4] + probs[5]) / 2
        probs[2] = equal_n
        probs[3] = equal_n
        probs[4] = equal_y
        probs[5] = equal_y
    elif i_since_r < 3 * timer_i // 4:
        probs[2], probs[3] = probs[3], probs[2]
        probs[4], probs[5] = probs[5], probs[4]
    return probs
    
def modify_reward(action, reward, info, prev_lives):
    if info['lives'] < prev_lives:
        reward -= 15
    if reward <= 0 and action in [1,4,5]:
        reward -= 1
    return reward

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

class AtariReward(nn.Module):
    def __init__(self):
        super(AtariReward, self).__init__()

    def forward(self, reward):
        if isinstance(reward, np.ndarray):
            reward = torch.from_numpy(reward)
        loss = -reward       
        return loss

class TwoLayerReinforcement(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TwoLayerReinforcement, self).__init__()
        
        self.log_probss = list()
        self.rewards = list()
        
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        return
        
    def forward(self, x):
        x = torch.from_numpy(x).double()
        x = self.layer1(x)
        x = self.activation(x)
        x = self.layer2(x)
#         probs = self.softmax(x)
        print(x)
        probs = torch.nn.functional.softmax(x, dim=0)
        print(probs)
        return probs

In [13]:

n_episodes = 100
last_i = 0
env = gym.make(
    'ALE/DemonAttack-v5', # alternate games can be chosen here 
    obs_type='grayscale', # saves RGB preprocessing reduction
    render_mode='human' if render else None, # rendering shows popup but limits training speed
)
        
if record_rewards:
    reward_list = list()
if record_probs:
    prob_list = list()
if record_eps_iters:
    eps_iters_list = list()

model = TwoLayerReinforcement(DIM, 64, 6).double()
criterion = AtariReward()
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

def process_probs(probs, i, last_i, timer_i=1000, corner_correct=True):
    initial_shape = probs.shape
    i_since_r = i - last_i
    if i_since_r > timer_i:
        terminated = True
        truncated = False
        print('Timer causing reset               ')
    else:
        terminated = False
        truncated = False
        
    if corner_correct: # heavily biases agent from getting 'stuck' in corner
        probs = add_noise(probs, i, i_since_r, timer_i)
        probs = balance_lr(probs, i_since_r, timer_i)

    if torch.round(torch.sum(probs), decimals=4) != 1:
        warnings.warn(str(probs) + ' | ' + str(torch.sum(probs)) + ' != 1')
        probs /= torch.sum(probs)
        
    assert probs.shape == initial_shape
    return probs, i, terminated, truncated

eps = np.finfo(np.float32).eps.item()
episode_number = 0
prev_lives = 0
i = 0
obs, info = env.reset()
while episode_number <= n_episodes:
    curr_x = preprocess(obs, downsample=DOWNSAMPLE, xmin=XMIN, xmax=XMAX, ymin=YMIN, ymax=YMAX)
    x = curr_x - prev_x if prev_x is not None else np.zeros(DIM) # only monitor change between frames
#     xs.append(x)

    prev_x = curr_x

    model_probs = model(x) # autograd performed here
    probs, last_i, terminated, truncated = process_probs(model_probs, i, last_i, timer_i=timer_i, corner_correct=corner_correct)
    
    m = torch.distributions.Categorical(probs)
    action = m.sample()
    log_probs = m.log_prob(action)
    model.log_probss.append(log_probs)
    action = action.item()
    
    ######################################################################################
    # fold this into model
    # fix log probs here
    # https://github.com/pytorch/examples/blob/main/reinforcement_learning/reinforce.py
    # log_prob = prob.clamp(min=1e-6).log()
    # entropy = - (probs * probs.clamp(min=1e-6).log()).sum()
    # https://discuss.pytorch.org/t/policy-gradient-using-loss-as-reward/13877
    ######################################################################################
    
    prev_lives = info['lives'] # lives not available through general step return
    obs, reward, terminated, truncated, info = env.step(action) # step returns all other relevant information 
    if reward > 0: # reset the iterations since last reward if reward is accrued
        last_i = i

    reward_sum += reward # total round reward incremented
    adj_reward = modify_reward(action, reward, info, prev_lives) # adjusted reward may better lead agent toward short term optimums
    adj_reward_sum += adj_reward

    model.rewards.append(adj_reward)
    print(len(model.log_probss))
    print(len(model.rewards))
    ######################################################

    if terminated: # an episode finished
        episode_number += 1
#         print(f'Episode: {episode_number}              ')

        if record_rewards:
            reward_list.append(reward_sum)
        if record_eps_iters:
            eps_iters_list.append(i)
        
        # stack together all inputs, hidden states, action gradients, and rewards for this episode
        if not no_grad:
            print('$$', model.log_probss)
            print('%%', model.rewards)

#             epdlogp = -torch.cat(model.log_probss, dim=0)
#             epr = torch.cat(model.rewards, dim=0)

#             discounted_epr = standardize(discount_rewards(epr))
            def standardize(returns):
                returns -= returns.mean()
                returns /= (returns.std() + eps)
                return returns

            returns = torch.tensor(discount_rewards(model.rewards))
            returns = standardize(returns)
            episode_losss = [(-log_prob * r).unsqueeze(dim=0) for log_prob, r in zip(model.log_probss, returns)]
            print(episode_losss)
            episode_losss = torch.cat(episode_losss)
    
            optimizer.zero_grad()
            loss = episode_losss.sum()
            loss.backward()
            optimizer.step()
            
            model.log_probss.clear()
            model.rewards.clear()

        reward_sum = 0 # reset all totals
        adj_reward_sum = 0
        
        obs, info = env.reset() # reset env
        prev_x = None
    elif truncated: # an episode terminated unexpectedly, shouldn't maintain results
        model.log_probss.clear()
        model.rewards.clear()
        
        reward_sum = 0
        adj_reward_sum = 0
        
        obs, info = env.reset()
        prev_x = None
    

        
    if not i % 100:
        print(f'Episode {episode_number} of {n_episodes} episodes                ', end='\r')
        torch.save(model, save_path)
    i += 1

env.close()
prob_list = model.log_probss

tensor([-0.1026,  0.1316,  0.0839,  0.0355,  0.0153,  0.0877],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1439, 0.1818, 0.1733, 0.1651, 0.1619, 0.1740], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1
1
Episode 0 of 100 episodes                tensor([-0.0988,  0.1389,  0.0636,  0.0234,  0.0628,  0.0795],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1440, 0.1826, 0.1694, 0.1627, 0.1692, 0.1721], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
2
2
tensor([-0.1066,  0.1261,  0.0540,  0.0224,  0.0720,  0.0877],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1432, 0.1807, 0.1681, 0.1629, 0.1712, 0.1739], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
3
3
tensor([-0.0949,  0.1326,  0.0547,  0.0230,  0.0710,  0.0890],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1444, 0.1813, 0.1677, 0.1625, 0.1705, 0.1736], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
4
4
tensor([-0.0925,  0.1304,  0.0

70
70
tensor([-0.0839,  0.1359,  0.0506,  0.0239,  0.0676,  0.0975],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1456, 0.1815, 0.1666, 0.1622, 0.1695, 0.1746], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
71
71
tensor([-0.1085,  0.1519,  0.0689,  0.0434,  0.0569,  0.0790],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1420, 0.1843, 0.1696, 0.1653, 0.1676, 0.1713], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
72
72
tensor([-0.0824,  0.1169,  0.0465,  0.0287,  0.0607,  0.1279],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1457, 0.1778, 0.1657, 0.1628, 0.1681, 0.1798], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
73
73
tensor([-0.0979,  0.1405,  0.0517,  0.0654,  0.0546,  0.0935],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1432, 0.1817, 0.1663, 0.1686, 0.1668, 0.1734], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
74
74
tensor([-0.1225,  0.1431,  0.0820,  0.0213,  0.0421,  0.07

tensor([-0.1131,  0.1209,  0.0488,  0.0293,  0.0756,  0.0611],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1431, 0.1808, 0.1682, 0.1650, 0.1728, 0.1703], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
140
140
tensor([-0.0732,  0.1475,  0.0364, -0.0024,  0.0566,  0.0943],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1480, 0.1845, 0.1651, 0.1588, 0.1685, 0.1750], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
141
141
tensor([-0.0790,  0.1511,  0.0437,  0.0334,  0.0791,  0.1299],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1447, 0.1821, 0.1636, 0.1619, 0.1695, 0.1783], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
142
142
tensor([-0.0989,  0.1118,  0.0678,  0.0449,  0.0750,  0.1012],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1432, 0.1768, 0.1692, 0.1654, 0.1704, 0.1749], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
143
143
tensor([-0.0906,  0.1352,  0.0584, -0.0067,  0.0513,  0.

208
208
tensor([-0.1025,  0.1421,  0.0748,  0.0231,  0.0926,  0.0712],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1427, 0.1822, 0.1703, 0.1617, 0.1734, 0.1697], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
209
209
tensor([-0.0907,  0.1106,  0.0384,  0.0236,  0.0520,  0.0886],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1464, 0.1790, 0.1666, 0.1641, 0.1688, 0.1751], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
210
210
tensor([-0.1092,  0.1395,  0.0683,  0.0156,  0.0418,  0.0775],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1433, 0.1838, 0.1711, 0.1624, 0.1667, 0.1727], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
211
211
tensor([-0.0697,  0.1008,  0.0492,  0.0215,  0.0718,  0.1065],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1481, 0.1756, 0.1668, 0.1622, 0.1706, 0.1766], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
212
212
tensor([-0.0999,  0.1347,  0.0455,  0.0346,  0.0

tensor([-0.0962,  0.1290,  0.0837,  0.0417,  0.0404,  0.1001],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1437, 0.1799, 0.1720, 0.1649, 0.1647, 0.1748], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
281
281
tensor([-0.1050,  0.1346,  0.0651,  0.0058,  0.0716,  0.0942],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1431, 0.1819, 0.1697, 0.1599, 0.1708, 0.1747], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
282
282
tensor([-0.1028,  0.1097,  0.0485,  0.0284,  0.0362,  0.0706],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1454, 0.1798, 0.1691, 0.1658, 0.1670, 0.1729], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
283
283
tensor([-0.1043,  0.1285,  0.0559,  0.0349,  0.0763,  0.0837],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1431, 0.1806, 0.1679, 0.1644, 0.1714, 0.1727], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
284
284
tensor([-0.1080,  0.1344,  0.0533,  0.0177,  0.0494,  0.

tensor([-0.0781,  0.1405,  0.0629,  0.0171,  0.0743,  0.1146],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1455, 0.1811, 0.1675, 0.1600, 0.1695, 0.1764], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
345
345
tensor([-0.1089,  0.1223,  0.0619,  0.0173,  0.0667,  0.0695],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1435, 0.1808, 0.1702, 0.1628, 0.1711, 0.1715], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
346
346
tensor([-0.1191,  0.1304,  0.0527,  0.0226,  0.0582,  0.1042],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1415, 0.1816, 0.1680, 0.1630, 0.1689, 0.1769], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
347
347
tensor([-0.0943,  0.1456,  0.0437,  0.0351,  0.0691,  0.0938],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1440, 0.1831, 0.1654, 0.1640, 0.1696, 0.1739], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
348
348
tensor([-0.0881,  0.1389,  0.0781,  0.0156,  0.0698,  0.

409
409
tensor([-0.0994,  0.1375,  0.0606,  0.0185,  0.0516,  0.1156],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1435, 0.1819, 0.1684, 0.1614, 0.1669, 0.1779], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
410
410
tensor([-0.0794,  0.1239,  0.0746,  0.0245,  0.0570,  0.0788],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1466, 0.1797, 0.1711, 0.1627, 0.1681, 0.1718], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
411
411
tensor([-0.0844,  0.1593,  0.0271,  0.0351,  0.0485,  0.0818],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1461, 0.1864, 0.1634, 0.1647, 0.1669, 0.1725], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
412
412
tensor([-0.0730,  0.1371,  0.0763,  0.0251,  0.0643,  0.0731],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1470, 0.1814, 0.1707, 0.1622, 0.1686, 0.1701], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
413
413
tensor([-0.1120,  0.1442,  0.0456,  0.0359,  0.0

483
483
tensor([-0.1047,  0.1360,  0.0476,  0.0273,  0.0566,  0.0769],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1438, 0.1830, 0.1675, 0.1641, 0.1690, 0.1725], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
484
484
tensor([-0.0899,  0.1425,  0.0845,  0.0312,  0.0413,  0.0967],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1444, 0.1822, 0.1719, 0.1630, 0.1646, 0.1740], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
485
485
tensor([-0.1102,  0.1239,  0.0362,  0.0271,  0.0611,  0.0693],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1438, 0.1818, 0.1665, 0.1650, 0.1707, 0.1721], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
486
486
tensor([-0.0861,  0.1378,  0.0629,  0.0252,  0.0600,  0.0899],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1454, 0.1818, 0.1687, 0.1625, 0.1682, 0.1733], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
487
487
tensor([-0.0758,  0.1532,  0.0629,  0.0207,  0.0

tensor([-0.0633,  0.1349,  0.0861,  0.0271,  0.0620,  0.0870],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1477, 0.1801, 0.1715, 0.1617, 0.1674, 0.1716], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
547
547
tensor([-0.0964,  0.1331,  0.0502,  0.0316,  0.0446,  0.0887],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1448, 0.1821, 0.1676, 0.1645, 0.1667, 0.1742], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
548
548
tensor([-0.1046,  0.1542,  0.0737,  0.0348,  0.0898,  0.1136],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1409, 0.1825, 0.1684, 0.1619, 0.1711, 0.1752], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
549
549
tensor([-0.1056,  0.1331,  0.0746,  0.0030,  0.0516,  0.0917],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1435, 0.1821, 0.1718, 0.1599, 0.1679, 0.1748], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
550
550
tensor([-0.0915,  0.1475,  0.0369,  0.0727,  0.0603,  0.

tensor([-0.0936,  0.1279,  0.0709,  0.0240,  0.0628,  0.1050],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1441, 0.1798, 0.1698, 0.1621, 0.1685, 0.1757], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
607
607
tensor([-0.1344,  0.1499,  0.0533,  0.0252,  0.0482,  0.0924],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1396, 0.1855, 0.1684, 0.1638, 0.1676, 0.1751], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
608
608
tensor([-0.0821,  0.1423,  0.0684,  0.0388,  0.0690,  0.0841],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1452, 0.1817, 0.1688, 0.1639, 0.1689, 0.1715], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
609
609
tensor([-0.0706,  0.1504,  0.0485,  0.0141,  0.0719,  0.1178],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1466, 0.1828, 0.1651, 0.1595, 0.1690, 0.1769], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
610
610
tensor([-0.0947,  0.1173,  0.0635,  0.0371,  0.0813,  0.

tensor([-0.0886,  0.1564,  0.0797,  0.0394,  0.0630,  0.1152],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1431, 0.1829, 0.1694, 0.1627, 0.1665, 0.1755], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
662
662
tensor([-0.1265,  0.1363,  0.0383,  0.0266,  0.0373,  0.0866],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1416, 0.1842, 0.1670, 0.1651, 0.1668, 0.1753], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
663
663
tensor([-0.0873,  0.1390,  0.0704,  0.0721,  0.0770,  0.0906],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1434, 0.1799, 0.1680, 0.1682, 0.1691, 0.1714], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
664
664
tensor([-0.1128,  0.1387,  0.0406,  0.0391,  0.0421,  0.0786],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1430, 0.1839, 0.1667, 0.1664, 0.1669, 0.1731], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
665
665
tensor([-0.1037,  0.1566,  0.0400,  0.0384,  0.0611,  0.

tensor([-0.0923,  0.1473,  0.1050,  0.0157,  0.0472,  0.1157],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1432, 0.1820, 0.1744, 0.1595, 0.1646, 0.1763], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
710
710
tensor([-0.0737,  0.1180,  0.0371,  0.0546,  0.0362,  0.0753],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1483, 0.1797, 0.1657, 0.1686, 0.1655, 0.1722], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
711
711
tensor([-0.0967,  0.1455,  0.0407,  0.0535,  0.0851,  0.0946],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1430, 0.1822, 0.1640, 0.1662, 0.1715, 0.1731], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
712
712
tensor([-0.1145,  0.1087,  0.0353,  0.0051,  0.0650,  0.0829],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1438, 0.1798, 0.1670, 0.1621, 0.1721, 0.1752], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
713
713
tensor([-0.0716,  0.1540,  0.0621,  0.0474,  0.0761,  0.

777
777
tensor([-0.0963,  0.1200,  0.0766,  0.0032,  0.0492,  0.1004],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1447, 0.1797, 0.1721, 0.1599, 0.1674, 0.1762], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
778
778
tensor([-0.0850,  0.1371,  0.0856,  0.0400,  0.0946,  0.1267],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1429, 0.1784, 0.1694, 0.1619, 0.1710, 0.1765], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
779
779
tensor([-0.1076,  0.1096,  0.0399,  0.0641,  0.0734,  0.0975],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1426, 0.1771, 0.1652, 0.1693, 0.1708, 0.1750], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
780
780
tensor([-0.1192,  0.1274,  0.0413,  0.0187,  0.0502,  0.1207],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1417, 0.1813, 0.1664, 0.1626, 0.1679, 0.1801], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
781
781
tensor([-0.0790,  0.1324,  0.0473,  0.0048,  0.0

tensor([-0.0787,  0.1132,  0.0776,  0.0327,  0.0429,  0.1217],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1460, 0.1769, 0.1707, 0.1632, 0.1649, 0.1784], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
839
839
tensor([-0.1086,  0.1458,  0.0513,  0.0516,  0.0319,  0.0730],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1431, 0.1846, 0.1679, 0.1680, 0.1647, 0.1716], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
840
840
tensor([-0.0903,  0.1279,  0.0327,  0.0115,  0.0848,  0.0739],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1459, 0.1815, 0.1650, 0.1616, 0.1739, 0.1720], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
841
841
tensor([-0.0996,  0.1412,  0.0562,  0.0237,  0.0073,  0.1117],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1445, 0.1839, 0.1689, 0.1635, 0.1608, 0.1785], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
842
842
tensor([-0.1007,  0.1368,  0.0505,  0.0458,  0.1104,  0.

Episode 0 of 100 episodes                tensor([-0.1088,  0.1356,  0.0697, -0.0020,  0.0472,  0.0895],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1434, 0.1831, 0.1714, 0.1596, 0.1676, 0.1749], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
902
902
tensor([-0.1355,  0.1299,  0.0307,  0.0586,  0.0509,  0.1192],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1390, 0.1812, 0.1641, 0.1688, 0.1675, 0.1793], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
903
903
tensor([-0.0305,  0.1576,  0.0304,  0.0804,  0.0872,  0.0922],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1505, 0.1817, 0.1600, 0.1682, 0.1694, 0.1702], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
904
904
tensor([-0.1330,  0.1497,  0.0439,  0.0199,  0.0558,  0.1087],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1395, 0.1851, 0.1665, 0.1626, 0.1685, 0.1777], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
905
905
tensor([-0.110

975
975
tensor([-0.0762,  0.1535,  0.0617,  0.0241,  0.0509,  0.0982],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1462, 0.1840, 0.1679, 0.1617, 0.1661, 0.1741], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
976
976
tensor([-0.0961,  0.1440,  0.0621,  0.0321,  0.0595,  0.1315],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1428, 0.1815, 0.1673, 0.1623, 0.1668, 0.1793], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
977
977
tensor([-0.0965,  0.1221,  0.0571,  0.0680,  0.0647,  0.0741],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1439, 0.1790, 0.1678, 0.1696, 0.1690, 0.1706], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
978
978
tensor([-0.1095,  0.1287,  0.0424,  0.0084,  0.0441,  0.0950],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1439, 0.1826, 0.1675, 0.1619, 0.1677, 0.1765], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
979
979
tensor([-0.1149,  0.1172,  0.0597,  0.0622,  0.0

tensor([-0.0491,  0.1242,  0.0701,  0.0407,  0.0563,  0.0577],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1507, 0.1793, 0.1698, 0.1649, 0.1675, 0.1677], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1035
1035
tensor([-0.1230,  0.1678,  0.0520,  0.0316,  0.0693,  0.1022],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1397, 0.1868, 0.1664, 0.1630, 0.1693, 0.1749], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1036
1036
tensor([-0.0842,  0.1268,  0.0740,  0.0131,  0.0370,  0.1034],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1461, 0.1804, 0.1712, 0.1611, 0.1649, 0.1763], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1037
1037
tensor([-0.1010,  0.1234,  0.0440,  0.0281,  0.0583,  0.0940],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1442, 0.1805, 0.1667, 0.1641, 0.1691, 0.1753], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1038
1038
tensor([-0.0618,  0.1484,  0.0477,  0.0342,  0.0

Episode 0 of 100 episodes                tensor([-0.1122,  0.1166,  0.0554,  0.0239,  0.0726,  0.1019],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1423, 0.1789, 0.1683, 0.1631, 0.1712, 0.1763], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1102
1102
tensor([-0.1206,  0.1411,  0.0471,  0.0575,  0.0338,  0.1085],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1408, 0.1829, 0.1665, 0.1683, 0.1643, 0.1771], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1103
1103
tensor([-0.0860,  0.1320,  0.0542,  0.0257,  0.0644,  0.0646],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1463, 0.1819, 0.1683, 0.1635, 0.1700, 0.1700], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1104
1104
tensor([-0.0996,  0.1379,  0.0279,  0.0558,  0.0467,  0.1067],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1437, 0.1822, 0.1632, 0.1679, 0.1663, 0.1766], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1105
1105
tensor

tensor([-0.1084,  0.1485,  0.0318,  0.0362,  0.0761,  0.0904],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1424, 0.1841, 0.1639, 0.1646, 0.1713, 0.1737], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1175
1175
tensor([-0.1207,  0.1540,  0.0791, -0.0010,  0.0671,  0.1183],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1400, 0.1843, 0.1710, 0.1578, 0.1690, 0.1778], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1176
1176
tensor([-0.0882,  0.1245,  0.0483, -0.0089,  0.0551,  0.0786],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1470, 0.1819, 0.1685, 0.1592, 0.1697, 0.1737], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1177
1177
tensor([-0.1015,  0.1671,  0.0642,  0.0541,  0.0113,  0.1162],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([0.1425, 0.1864, 0.1681, 0.1664, 0.1595, 0.1771], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
1178
1178
tensor([-0.0939,  0.1513,  0.0491,  0.0121,  0.0

  returns = torch.tensor(discount_rewards(model.rewards))


[tensor([0.9981], dtype=torch.float64), tensor([0.9369], dtype=torch.float64), tensor([1.0547], dtype=torch.float64), tensor([0.9822], dtype=torch.float64), tensor([1.0936], dtype=torch.float64), tensor([1.1714], dtype=torch.float64), tensor([1.2561], dtype=torch.float64), tensor([1.0891], dtype=torch.float64), tensor([1.1864], dtype=torch.float64), tensor([1.2162], dtype=torch.float64), tensor([1.3406], dtype=torch.float64), tensor([1.5362], dtype=torch.float64), tensor([1.5097], dtype=torch.float64), tensor([1.3520], dtype=torch.float64), tensor([1.4058], dtype=torch.float64), tensor([1.6536], dtype=torch.float64), tensor([1.4967], dtype=torch.float64), tensor([1.7033], dtype=torch.float64), tensor([1.5119], dtype=torch.float64), tensor([1.7658], dtype=torch.float64), tensor([1.5348], dtype=torch.float64), tensor([1.6780], dtype=torch.float64), tensor([1.7495], dtype=torch.float64), tensor([1.7326], dtype=torch.float64), tensor([1.7330], dtype=torch.float64), tensor([1.7061], dtype=t

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
def plot_probs(prob_list, eps_iters_list, batch_size=64, step=1):
    probs_arr = torch.vstack(prob_list).detach().numpy()
    fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,12), dpi=200, sharex=True, sharey=True)
    fig.suptitle('Single Episode Action Probabilities')
    colors = sns.color_palette('Spectral', 7)
    colors = colors[:3] + colors[4:]
    for i, (ax, color) in enumerate(zip(axs.flatten(), colors)):
        sns.lineplot(data=probs_arr[::step,i], color=color, label=action_dict[i], alpha=.7, dashes=False, ax=ax)
    plt.xlabel('Iterations')
    plt.ylabel('Probability')
    plt.tight_layout()
    plt.show()
    return

if record_probs:
    plot_probs(prob_list, eps_iters_list)

In [None]:
def moving_average(a, window_size) :
    ret = np.cumsum(a, dtype=float)
    ret[window_size:] = ret[window_size:] - ret[:-window_size]
    return ret[window_size - 1:] / window_size

def plot_rewards(reward_list, window_size=10):
    plt.figure(figsize=(16,5))
    plt.title('Rewards Over Time')
    plt.ylabel('Total Reward')
    plt.xlabel('Episode Number')
    x = np.arange(0, len(reward_list), 1)
    assert len(x) == len(reward_list)
    plt.plot(x, reward_list, color='black', linestyle='dashed', label='Reward Per Episode')
    plt.plot(x[window_size-1:], moving_average(reward_list, window_size), color='red', label='Reward Moving Average')
    plt.legend()
    plt.show()
    return

print(reward_list)
if record_rewards:
    plot_rewards(reward_list, window_size=200)