<a href="https://colab.research.google.com/github/ozakiryota/cart_pole/blob/main/cartpole_dqn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cart-Pole


Install OpenAI Gym (https://gym.openai.com)

In [None]:
!pip install gym



Install the packages for visualizing Gym

In [None]:
!apt update
!apt install xvfb
!pip install pyvirtualdisplay

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u[0m[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)[0m                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)[0m                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)[0m                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit

## DQN

Import Gym

In [None]:
import gym

Import the packages for visualizing Gym

In [None]:
import base64
import io
from gym.wrappers import Monitor
from IPython import display
from pyvirtualdisplay import Display

Import required packages

In [None]:
import numpy as np

import torch
from torch import nn
import torch.optim as optim

Net class

In [None]:
class Net(nn.Module):
    """Fully connected network producing one output value per action.

    Layout: Linear(num_states, dim_mid) -> ReLU -> Linear(dim_mid, dim_mid)
    -> ReLU -> Linear(dim_mid, num_actions).
    """

    def __init__(self, num_states, dim_mid, num_actions):
        super().__init__()

        layers = [
            nn.Linear(num_states, dim_mid),
            nn.ReLU(),
            nn.Linear(dim_mid, dim_mid),
            nn.ReLU(),
            nn.Linear(dim_mid, num_actions),
        ]
        ## stored under the attribute name `fc`, so parameters are named `fc.<index>.*`
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the layer stack for input `x`."""
        return self.fc(x)

Brain class

In [None]:
class Brain:
    """Q-network wrapper: epsilon-greedy action choice plus a single-transition Q update.

    Attributes set in __init__:
        num_states, num_actions: passed to Net as its input / output widths.
        device: "cuda:0" when CUDA is available, otherwise "cpu".
        net: Net(num_states, 64, num_actions), moved to `device`.
        criterion: nn.MSELoss instance.
        optimizer: Adam over the net's parameters with learning rate `lr`.
        eps: exploration probability, starts at 1.0.
        gamma: factor applied to the max next-state Q-value in the update target.
        r: factor `eps` is multiplied by on each training-time getAction call
            while `eps` is above 0.1.
    """

    def __init__(self, num_states, num_actions, gamma, r, lr):
        self.num_states = num_states
        self.num_actions = num_actions

        if torch.cuda.is_available():
            self.device = torch.device("cuda:0")
        else:
            self.device = torch.device("cpu")
        print("self.device = ", self.device)

        self.net = Net(num_states, 64, num_actions)
        self.net.to(self.device)
        self.criterion = nn.MSELoss()
        ## Adam is the optimizer in use (see the notebook's Note section regarding RMSprop)
        self.optimizer = optim.Adam(self.net.parameters(), lr=lr)

        self.gamma = gamma
        self.r = r
        self.eps = 1.0  # for epsilon greedy algorithm

    def _asBatch(self, obs_numpy):
        """Turn a NumPy observation into a float tensor with a leading axis of size 1 on self.device."""
        return torch.from_numpy(obs_numpy).float().unsqueeze(0).to(self.device)

    def updateQnet(self, obs_numpy, action, reward, next_obs_numpy):
        """Take one optimizer step on the transition (obs, action, reward, next_obs).

        The target equals the net's current output for `obs`, except that the
        entry for `action` is replaced by reward + gamma * max(net(next_obs)).
        NOTE(review): the signature carries no episode-end flag, so the
        next-state term is added for every transition -- confirm that is intended.
        """
        state = self._asBatch(obs_numpy)
        next_state = self._asBatch(next_obs_numpy)

        self.optimizer.zero_grad()

        self.net.train()
        predicted = self.net(state)

        ## the target is built without gradient tracking
        with torch.no_grad():
            self.net.eval()
            target = self.net(state)
            next_values = self.net(next_state).cpu().detach().numpy()
            best_next = np.max(next_values, axis=1)[0]
            target[:, action] = reward + self.gamma*best_next

        self.criterion(predicted, target).backward()
        self.optimizer.step()

    def getAction(self, obs_numpy, is_training):
        """Pick an action for `obs_numpy`.

        When `is_training` is true, a uniformly random action is returned with
        probability `eps`; otherwise the index of the largest net output is
        returned. Each training-time call also multiplies `eps` by `r` while
        `eps` is above 0.1.
        """
        ## the random draw happens only in training mode (short-circuit)
        explore = is_training and np.random.rand() < self.eps
        if explore:
            action = np.random.randint(self.num_actions)
        else:
            batch = self._asBatch(obs_numpy)
            with torch.no_grad():
                self.net.eval()
                q_values = self.net(batch)
            action = np.argmax(q_values.cpu().detach().numpy(), axis=1)[0]
        ## update eps
        if is_training and self.eps > 0.1:
            self.eps *= self.r
        return action

Agent class

In [None]:
class Agent:
    """Thin facade over Brain: forwards learning updates and action queries."""

    def __init__(self, num_states, num_actions, gamma, r, lr):
        ## every argument is handed to Brain unchanged
        self.brain = Brain(
            num_states,
            num_actions,
            gamma,
            r,
            lr,
        )

    def updateQnet(self, obs, action, reward, next_obs):
        """Forward one transition to Brain.updateQnet."""
        self.brain.updateQnet(obs, action, reward, next_obs)

    def getAction(self, obs, is_training):
        """Return the action Brain.getAction picks for `obs`."""
        return self.brain.getAction(obs, is_training)

Environment class

In [None]:
class Environment:
    """Owns the monitored CartPole-v0 environment and the agent; runs training and evaluation.

    Args:
        num_episodes: upper bound on the number of training episodes.
        max_consecutive_completion: training stops once the count of
            consecutive completed episodes exceeds this value.
        max_step: per-episode step cap; an episode counts as completed only
            when it reports done on the last allowed step.
        gamma, r, lr: forwarded unchanged to Agent.
    """

    def __init__(self, num_episodes, max_consecutive_completion, max_step, gamma, r, lr):
        ## parameters
        self.num_episodes = num_episodes
        self.max_consecutive_completion = max_consecutive_completion
        self.max_step = max_step
        ## environment (video_callable returns True for episode ids that are multiples of 100)
        self.env = Monitor(gym.make('CartPole-v0'), './videos/', video_callable=(lambda ep: ep % 100 == 0), force=True)
        ## agent
        num_states = self.env.observation_space.shape[0]    # position, velocity, angle, angular velocity
        num_actions = self.env.action_space.n
        self.agent = Agent(num_states, num_actions, gamma, r, lr)

    def train(self):
        """Run up to `num_episodes` episodes, updating the agent after every step.

        Reward per step: 0 while the episode is running, -1 when it ends before
        the last allowed step, +1 when it ends on the last allowed step. The
        environment's own reward is ignored.
        """
        consecutive_completion = 0
        ## index of the final step an episode is allowed to reach; taken from the
        ## instance so training does not depend on a notebook-level variable
        last_step = self.max_step - 1

        for episode in range(self.num_episodes):
            obs = self.env.reset()
            episode_reward = 0

            for step in range(self.max_step):
                ## get action
                action = self.agent.getAction(obs, is_training=True)
                ## observe next step
                next_obs, _, is_done, _ = self.env.step(action)
                ## get reward
                if is_done:
                    if step < last_step:
                        reward = -1
                        consecutive_completion = 0
                    else:
                        reward = 1
                        consecutive_completion += 1
                else:
                    reward = 0
                episode_reward += reward
                ## update
                self.agent.updateQnet(obs, action, reward, next_obs)
                ## to next step
                obs = next_obs

                if is_done:
                    print('{0} Episode: Finished after {1} time steps with reward {2}'.format(episode, step+1, episode_reward))
                    break
            ## strict comparison: stops only after more than max_consecutive_completion completions in a row
            if consecutive_completion > self.max_consecutive_completion:
                print("It has completed {} consecutive episodes".format(consecutive_completion))
                break

    def evaluate(self):
        """Play one episode of at most `max_step` steps with exploration turned off."""
        obs = self.env.reset()

        for step in range(self.max_step):
            ## get action
            action = self.agent.getAction(obs, is_training=False)
            ## observe next step
            next_obs, _, is_done, _ = self.env.step(action)
            ## to next step
            obs = next_obs

            if is_done:
                print('Evaluation: Finished after {} time steps'.format(step+1))
                break

Prepare a helper for showing videos of the results

In [None]:
def show_video(env):
    """Embed every video listed in `env.videos` into the notebook output.

    Args:
        env: object exposing `reset()` and a `videos` iterable whose items
            hold the video file path at index 0.
    """
    ## NOTE(review): reset() is presumably what makes the recorder finish the
    ## file currently being written -- confirm against the Monitor wrapper.
    env.reset()
    for frame in env.videos:
        print("frame = ", frame)
        ## read-only binary mode inside `with`, so the handle is always closed
        ## and no write permission on the file is needed
        with open(frame[0], 'rb') as video_file:
            encoded = base64.b64encode(video_file.read())

        display.display(display.HTML(data="""
            <video alt="" controls>
            <source src="data:video/mp4;base64,{0}" type="video/mp4" />
            </video>
            """.format(encoded.decode('ascii')))
        )

Run training

In [None]:
## display
## start the virtual display used for visualizing Gym
virtual_display = Display()
virtual_display.start()

## parameters
num_episodes = 1000  # upper bound on training episodes
max_consecutive_completion = 10  # training stops once more than this many episodes in a row are completed
max_step = 200  # per-episode step cap
gamma = 0.9  # weight on the best next-state Q-value in the update target
r = 0.99  # eps is multiplied by this on each training-time action selection
lr = 0.001  # learning rate handed to the Adam optimizer

## run
## train, play one evaluation episode without exploration, then embed the recorded videos
cartpole_env = Environment(num_episodes, max_consecutive_completion, max_step, gamma, r, lr)
cartpole_env.train()
cartpole_env.evaluate()
show_video(cartpole_env.env)

self.device =  cuda:0
0 Episode: Finished after 46 time steps with reward -1
1 Episode: Finished after 23 time steps with reward -1
2 Episode: Finished after 37 time steps with reward -1
3 Episode: Finished after 35 time steps with reward -1
4 Episode: Finished after 24 time steps with reward -1
5 Episode: Finished after 11 time steps with reward -1
6 Episode: Finished after 12 time steps with reward -1
7 Episode: Finished after 9 time steps with reward -1
8 Episode: Finished after 11 time steps with reward -1
9 Episode: Finished after 22 time steps with reward -1
10 Episode: Finished after 11 time steps with reward -1
11 Episode: Finished after 15 time steps with reward -1
12 Episode: Finished after 36 time steps with reward -1
13 Episode: Finished after 146 time steps with reward -1
14 Episode: Finished after 56 time steps with reward -1
15 Episode: Finished after 135 time steps with reward -1
16 Episode: Finished after 58 time steps with reward -1
17 Episode: Finished after 34 time 

frame =  ('/content/videos/openaigym.video.7.8926.video000001.mp4', '/content/videos/openaigym.video.7.8926.video000001.meta.json')


frame =  ('/content/videos/openaigym.video.7.8926.video000008.mp4', '/content/videos/openaigym.video.7.8926.video000008.meta.json')


frame =  ('/content/videos/openaigym.video.7.8926.video000027.mp4', '/content/videos/openaigym.video.7.8926.video000027.meta.json')


frame =  ('/content/videos/openaigym.video.7.8926.video000064.mp4', '/content/videos/openaigym.video.7.8926.video000064.meta.json')


frame =  ('/content/videos/openaigym.video.7.8926.video000125.mp4', '/content/videos/openaigym.video.7.8926.video000125.meta.json')


frame =  ('/content/videos/openaigym.video.7.8926.video000216.mp4', '/content/videos/openaigym.video.7.8926.video000216.meta.json')


## Note
- The reward setting below did not work well.
```
if is_done:
    if step < max_step - 1:
        reward = -100
    else:
        reward = 1
else:
    reward = 1
```
- Adam worked better than RMSprop.

## References
- [minnano_rl/section_4/02_deep_reinforcement_learning.ipynb](https://github.com/yukinaga/minnano_rl/blob/main/section_4/02_deep_reinforcement_learning.ipynb)
- [第15回　CartPole課題で深層強化学習DQNを実装](https://book.mynavi.jp/manatee/detail/id=89831)

## Appendix

In [None]:
## Demo: argument unpacking (*) combined with zip() regroups a list of rows by column
l = [
    ["a1", "a2", "a3"],
    ["b1", "b2", "b3"],
    ["c1", "c2", "c3"],
    ["d1", "d2", "d3"]
]
print("l = ", l)
## *l passes each inner list to print() as a separate argument
print("*l = ", *l)
## zip(*l) is a zip object, so printing it shows only its repr
print("zip(*l) = ", zip(*l))
## unpacking the zip object yields one tuple per column of l
print("*zip(*l) = ", *zip(*l))

l =  [['a1', 'a2', 'a3'], ['b1', 'b2', 'b3'], ['c1', 'c2', 'c3'], ['d1', 'd2', 'd3']]
*l =  ['a1', 'a2', 'a3'] ['b1', 'b2', 'b3'] ['c1', 'c2', 'c3'] ['d1', 'd2', 'd3']
zip(*l) =  <zip object at 0x7f2436bd3870>
*zip(*l) =  ('a1', 'b1', 'c1', 'd1') ('a2', 'b2', 'c2', 'd2') ('a3', 'b3', 'c3', 'd3')


In [None]:
import torch

## Demo of torch.gather on a 3x3 tensor.
input = torch.tensor([[1, 2, 3],[4, 5, 6],[7, 8, 9]])
indices = torch.tensor([[0, 0, 0],[0, 1, 2],[2, 0, 1]])

print("input = \n", input)
print("indices = \n", indices)

## Pick values along rows (dim=0): out[i][j] = input[indices[i][j]][j]
print("torch.gather(input=input, dim=0, index=indices) = \n", torch.gather(input=input, dim=0, index=indices))
'''
    input[0][0], input[0][1], input[0][2]
    input[0][0], input[1][1], input[2][2]
    input[2][0], input[0][1], input[1][2]
'''

## Pick values along columns (dim=1): out[i][j] = input[i][indices[i][j]]
print("torch.gather(input=input, dim=1, index=indices) = \n", torch.gather(input=input, dim=1, index=indices))
## (the middle row below stays inside row 1 of input: indices[1] = [0, 1, 2])
'''
    input[0][0], input[0][0], input[0][0]
    input[1][0], input[1][1], input[1][2]
    input[2][2], input[2][0], input[2][1]
'''

input = 
 tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])
indices = 
 tensor([[0, 0, 0],
        [0, 1, 2],
        [2, 0, 1]])
torch.gather(input=input, dim=0, index=indices) = 
 tensor([[1, 2, 3],
        [1, 5, 9],
        [7, 2, 6]])
torch.gather(input=input, dim=1, index=indices) = 
 tensor([[1, 1, 1],
        [4, 5, 6],
        [9, 7, 8]])
