<a href="https://colab.research.google.com/github/ninja18/policy_gradients/blob/master/VPGepisode1000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch
!pip install gym



In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class MLPAgent(nn.Module):
    def __init__(self, output_size,
                 input_size,
                 layer_sizes=[128, 128],
                 dropout_rate=0.5):
        super(MLPAgent, self).__init__()
        self.input_size = input_size
        layer_sizes.insert(0, self.input_size)
        self.layers = nn.ModuleList()

        for i in range(len(layer_sizes) - 1):
            self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1], bias=False))
            self.layers.append(nn.ReLU(inplace=True))
        self.layers.append(nn.Dropout(p=dropout_rate))
        self.layers.append(nn.Linear(layer_sizes[-1], output_size))

    def forward(self, x):
        x = x.view(-1, self.input_size)
        for layer in self.layers:
            x = layer(x)
        return F.softmax(x)


In [15]:
torch.cuda.is_available()

True

In [0]:
import os

import gym
import torch
from torch import optim
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter
from google.colab import files

# from src.VanillaPolicyGradient.agent import MLPAgent


class Trainer:

    def __init__(self, env, obs_dims,
                 agent, lr, epoch,
                 gamma=0.95, batch_size=1,
                 custom_actions=None,
                 model_file="model.pt", save_per_epoch=1,
                 render=False):
        self.env = env
        self.dev = torch.device("cuda:0")
        self.obs_dims = obs_dims
        self.obs_transform = None  # Todo get transform
        # if len(self.obs_dims) > 1:
        #     self.obs_transform = transforms.Compose([
        #         transforms.ToPILImage(),
        #         transforms.Grayscale(num_output_channels=self.obs_dims[2]),
        #         transforms.Resize(self.obs_dims[:2]),
        #         transforms.ToTensor(),
        #     ])
        self.agent = agent
        self.agent.to(self.dev)
        self.optimizer = optim.Adam(self.agent.parameters(), lr=lr)
        self.custom_actions = custom_actions
        self.epoch = epoch
        self.current_epoch = 1
        self.gamma = gamma
        self.batch_size = batch_size
        self.model_file = model_file
        self.render = render
        self.save_per_epoch = save_per_epoch
        self.writer = SummaryWriter()
        self.load_model()
        self.log_probs = []
        self.rewards = []

    def load_model(self):
        if os.path.exists(self.model_file):
            checkpoint = torch.load(self.model_file)
            self.agent.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            self.current_epoch = checkpoint['epoch']
            if self.current_epoch == self.epoch:
                self.epoch += self.epoch
            print(f"Loaded existing model continuing from epoch {self.current_epoch}")

    def act(self, obs):
        if len(obs.shape) == 1:
            probs = self.agent(torch.from_numpy(obs).type(torch.FloatTensor))
        else:
            probs = self.agent(self.pre_process(obs))

        dis = Categorical(probs)
        action = dis.sample()
        self.log_probs.append(dis.log_prob(action))
        if self.custom_actions is not None:
            action = self.custom_actions[action.item()]
        return self.env.step(action.item())

    def pre_process(self, obs):  # Todo preprocess in torch
        # return self.obs_transform(obs).unsqueeze(0)
        obs = obs[35:195]  # crop
        obs = obs[::2, ::2, 0]  # downsample by factor of 2
        obs[obs == 144] = 0  # erase background (background type 1)
        obs[obs == 109] = 0  # erase background (background type 2)
        obs[obs != 0] = 1  # everything else (paddles, ball) just set to 1
        return torch.tensor(obs).type(torch.FloatTensor).to(self.dev)

    def train(self):
        self.agent.train()
        for i in range(self.current_epoch, (self.epoch * self.batch_size) + 1):
            obs = self.env.reset()
            while True:
                obs, rew, done, _ = self.act(obs)
                self.rewards.append(rew)
                if self.render:
                    self.env.render()
                if done:
                    if i % self.batch_size == 0:
                        loss = self.update_policy()
                        total_reward = sum(self.rewards) / self.batch_size
                        epoch = i // self.batch_size
                        print(f"Epoch: {epoch}"
                              f"  Loss: {loss.item()}"
                              f"  Total Average Reward: {total_reward}")
                        self.writer.add_scalar("Loss", loss, epoch)
                        self.writer.add_scalar("reward_per_episode", total_reward, epoch)
                        self.writer.flush()
                        self.rewards.clear()
                        self.log_probs = []
                    break

            if i % self.save_per_epoch == 0:
                torch.save({
                    'epoch': i,
                    'model_state_dict': self.agent.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                }, self.model_file)
            if i % 1000 == 0:
                files.download("model.pt")

        self.env.close()

    def update_policy(self):
        norm_rewards = self.discounted_rewards()
        cumulative_reward = - torch.cat(self.log_probs).to(self.dev) * norm_rewards
        loss = torch.sum(cumulative_reward, -1)
        # viz = make_dot(loss)
        # viz.view()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def discounted_rewards(self):
        reward_accumulator = 0
        discounted_rewards = []
        for r in reversed(self.rewards):
            if r != 0:
                reward_accumulator = 0
            reward_accumulator = r + reward_accumulator * self.gamma
            discounted_rewards.insert(0, reward_accumulator)
        discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32).to(self.dev)
        norm_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-06)
        return norm_rewards



In [35]:
import time
env = gym.make("Pong-v0")
agent = MLPAgent(2, 6400, layer_sizes=[200])
trainer = Trainer(env, [80, 80, 1], agent, 0.0005, 1,
                      custom_actions=torch.tensor([2, 3], dtype=torch.int16),
                      save_per_epoch=1,
                      gamma=0.99,
                      batch_size=10, render=False)
start = time.time()
trainer.train()
print(f"One iteration takes : {time.time() - start} seconds")

!ls -al

Loaded existing model continuing from epoch 10




Epoch: 1  Loss: -1.1816844940185547  Total Average Reward: -2.1
One iteration takes : 37.68473505973816 seconds
total 15028
drwxr-xr-x 1 root root     4096 Dec 25 07:22 .
drwxr-xr-x 1 root root     4096 Dec 25 06:29 ..
drwxr-xr-x 1 root root     4096 Dec 18 16:52 .config
-rw-r--r-- 1 root root 15366560 Dec 25 07:23 model.pt
drwxr-xr-x 4 root root     4096 Dec 25 07:23 runs
drwxr-xr-x 1 root root     4096 Dec 18 16:52 sample_data


In [38]:
  env = gym.make("Pong-v0")
  # torch.set_default_dtype(torch.float16)
  agent = MLPAgent(2, 6400, layer_sizes=[200])
  trainer = Trainer(env, [80, 80, 1], agent, 0.0005, 2000,
                    custom_actions=torch.tensor([2, 3], dtype=torch.int8),
                    save_per_epoch=100,
                    gamma=0.99,
                    batch_size=5, render=False)
  trainer.train()



Epoch: 1  Loss: 0.8826713562011719  Total Average Reward: -20.2
Epoch: 2  Loss: -2.1915283203125  Total Average Reward: -20.6
Epoch: 3  Loss: -0.6884956359863281  Total Average Reward: -20.0
Epoch: 4  Loss: -5.493862152099609  Total Average Reward: -19.8
Epoch: 5  Loss: -1.8027667999267578  Total Average Reward: -20.6
Epoch: 6  Loss: -5.0925140380859375  Total Average Reward: -20.4
Epoch: 7  Loss: -4.151176452636719  Total Average Reward: -20.4
Epoch: 8  Loss: 6.0616135597229  Total Average Reward: -20.4
Epoch: 9  Loss: -6.976550102233887  Total Average Reward: -20.6
Epoch: 10  Loss: -5.052742004394531  Total Average Reward: -20.2
Epoch: 11  Loss: -5.033237457275391  Total Average Reward: -20.2
Epoch: 12  Loss: -15.842080116271973  Total Average Reward: -19.8
Epoch: 13  Loss: -7.010899543762207  Total Average Reward: -20.2
Epoch: 14  Loss: -19.20419692993164  Total Average Reward: -20.0
Epoch: 15  Loss: -13.464981079101562  Total Average Reward: -20.0
Epoch: 16  Loss: -12.6760063171386

MessageError: ignored

In [48]:
!ls -al

total 15036
drwxr-xr-x 1 root root     4096 Dec 25 08:44 .
drwxr-xr-x 1 root root     4096 Dec 25 06:29 ..
drwxr-xr-x 1 root root     4096 Dec 18 16:52 .config
-rw-r--r-- 1 root root 15366555 Dec 25 08:40 model.pt
drwxr-xr-x 3 root root     4096 Dec 25 07:26 runs
-rw-r--r-- 1 root root     6869 Dec 25 08:44 runs.zip
drwxr-xr-x 1 root root     4096 Dec 18 16:52 sample_data
