In [131]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [132]:
env = gym.make('CartPole-v1')

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if gpu is to be used
device = torch.device("cpu")

  deprecation(
  deprecation(


In [133]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'terminated'))

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [134]:
class DQN(nn.Module):

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(DQN, self).__init__()
        self.layers = []
        self.layers.append(nn.Linear(input_dim, hidden_dim))
        
        for i in range(num_layers -1): 
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Linear(hidden_dim,hidden_dim))
        self.layers.append(nn.ReLU())
        self.layers.append(nn.Linear(hidden_dim, output_dim ))
        self.layers = nn.Sequential(*self.layers)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        return self.layers(torch.tensor(x, device=device, dtype=torch.float))

In [135]:
batchSize = 128
gamma = torch.tensor(0.99)
gamma.to(device)
epsilon = 1
EPS_END = 0.11
EPS_DECAY = 0.99
TARGET_UPDATE = 100

In [136]:
n_actions = env.action_space.n

policy_net = DQN(4, 128, 2, 2).to(device)
target_net = DQN(4, 128, 2, 2).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

DQN(
  (layers): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [137]:
optimizer = optim.Adam(policy_net.parameters(), lr=0.0001)
memory = ReplayMemory(100000)
mse = torch.nn.MSELoss()

In [138]:
def policy(x, epsilon):
    if torch.rand(1) < epsilon:
        return torch.tensor(random.choice([0,1]))
    else:
        return x.max(0)[-1]

In [139]:
observation = env.reset()
policy_net(observation)

tensor([0.1543, 0.0776], grad_fn=<AddBackward0>)

In [140]:
def estimatePerformance():  
    counters = []
    epsilon = -1
    trials = 10
    for i in range(trials):
        counter = 0
        terminated = False
        observation = env.reset()
        while not terminated:
            action = policy_net(observation).max(0)[-1].item()
            observation, reward, terminated,  info = env.step(action)
            counter += reward
        counters.append(counter)

    summ =  0
    for i in range(trials):
        summ += counters[i]
    return summ/trials

In [141]:
from copy import deepcopy
def updateTargetNet():
    target_net = deepcopy(policy_net)
    for param in target_net.parameters():
        param.requires_grad = False

In [142]:
counter = 0
epi_count = 0
cum_reward = 0.0
upd_count = 0
loss = "untrained"
terminated = False
while True:
    if counter % TARGET_UPDATE == 0:
        updateTargetNet()
    while not terminated:
        with torch.no_grad():
            x = policy_net(observation)
        action = policy(x, epsilon)
        next_state, reward, terminated, _ = env.step(action.item())
        if reward != 1:
            print(reward)
        cum_reward += reward
        memory.push(observation, action, next_state, reward, terminated)
        observation = deepcopy(next_state)
        

    if len(memory) >= 10*batchSize:
        transitions = memory.sample(batchSize)
        tup = np.stack([transition for transition in transitions])
        obs = np.stack(tup[:, 0])
        action = np.stack(tup[:, 1])
        obs_next = np.stack(tup[:, 2])
        rew = torch.tensor(np.stack(tup[:, 3]), dtype=torch.float)
        term = torch.tensor(np.stack(tup[:, 4]), dtype=torch.float)
        
        with torch.no_grad():
            q_prime = target_net(obs_next).max(dim=1)[0]
        target = rew + gamma * q_prime * (1 - term)

        q_pred = policy_net(obs)[np.arange(obs.shape[0]), action]
        loss = ((q_pred - target)** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        upd_count += 1
    if len(memory) >=batchSize and (counter % 200 == 0):
        epsilon = max(EPS_END, epsilon*EPS_DECAY)
    if terminated:
        epi_count += 1
        observation = env.reset()
        terminated = False
        print(f"Episode={epi_count}, reward={cum_reward}, loss={loss}, epsilon={epsilon} iter={upd_count}")
        cum_reward = 0.0
    counter += 1

Episode=1, reward=14.0, loss=untrained, epsilon=1 iter=0
Episode=2, reward=14.0, loss=untrained, epsilon=1 iter=0
Episode=3, reward=14.0, loss=untrained, epsilon=1 iter=0
Episode=4, reward=22.0, loss=untrained, epsilon=1 iter=0
Episode=5, reward=18.0, loss=untrained, epsilon=1 iter=0
Episode=6, reward=37.0, loss=untrained, epsilon=1 iter=0
Episode=7, reward=11.0, loss=untrained, epsilon=1 iter=0
Episode=8, reward=55.0, loss=untrained, epsilon=1 iter=0
Episode=9, reward=15.0, loss=untrained, epsilon=1 iter=0
Episode=10, reward=35.0, loss=untrained, epsilon=1 iter=0
Episode=11, reward=23.0, loss=untrained, epsilon=1 iter=0
Episode=12, reward=37.0, loss=untrained, epsilon=1 iter=0
Episode=13, reward=20.0, loss=untrained, epsilon=1 iter=0
Episode=14, reward=41.0, loss=untrained, epsilon=1 iter=0
Episode=15, reward=20.0, loss=untrained, epsilon=1 iter=0
Episode=16, reward=13.0, loss=untrained, epsilon=1 iter=0
Episode=17, reward=15.0, loss=untrained, epsilon=1 iter=0
Episode=18, reward=16.0

  arrays = [asanyarray(arr) for arr in arrays]


Episode=59, reward=58.0, loss=1.008838415145874, epsilon=1 iter=4
Episode=60, reward=10.0, loss=1.0016754865646362, epsilon=1 iter=5
Episode=61, reward=11.0, loss=0.9935786128044128, epsilon=1 iter=6
Episode=62, reward=26.0, loss=0.9838550686836243, epsilon=1 iter=7
Episode=63, reward=21.0, loss=0.9730299711227417, epsilon=1 iter=8
Episode=64, reward=22.0, loss=0.9534358382225037, epsilon=1 iter=9
Episode=65, reward=13.0, loss=0.9442352056503296, epsilon=1 iter=10
Episode=66, reward=17.0, loss=0.93184894323349, epsilon=1 iter=11
Episode=67, reward=48.0, loss=0.9101875424385071, epsilon=1 iter=12
Episode=68, reward=18.0, loss=0.908754825592041, epsilon=1 iter=13
Episode=69, reward=18.0, loss=0.9003610610961914, epsilon=1 iter=14
Episode=70, reward=41.0, loss=0.8973332643508911, epsilon=1 iter=15
Episode=71, reward=14.0, loss=0.8785155415534973, epsilon=1 iter=16
Episode=72, reward=10.0, loss=0.8644495606422424, epsilon=1 iter=17
Episode=73, reward=24.0, loss=0.8506808876991272, epsilon=

In [None]:
transitions = memory.sample(30)
tup = np.stack([transition for transition in transitions])
obs = np.stack(tup[:, 0])
action = np.stack(tup[:, 1])
obs_next = np.stack(tup[:, 2])
rew = torch.tensor(np.stack(tup[:, 3]), dtype=torch.float)
term = torch.tensor(np.stack(tup[:, 4]), dtype=torch.float)

  arrays = [asanyarray(arr) for arr in arrays]


In [None]:
term

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])