PPO training model on Breakout (the environment created in the code below is currently `SpaceInvaders-v0`)

In [1]:
!pip install gym -q
!pip install torch -q
!pip install numpy -q
!pip install matplotlib -q

Remember to mount your drive

In [2]:
# Mount Google Drive at /content/drive. Later cells write the CSV log, the model
# checkpoint and the score history under ./drive/MyDrive/ (a relative path --
# presumably resolved from /content; verify the working directory).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#import random
import cv2
import numpy as np
import gym
from gym import Wrapper

from numpy import random

class CustomBreakout(Wrapper):
    """Frame-skipping wrapper that emits one preprocessed grayscale frame.

    Each call to ``step`` repeats the action up to ``skip`` times, sums the
    rewards, and returns the last raw frame converted to a ``(1, size, size)``
    array scaled by 1/255.
    """

    def __init__(self, env, size=84, skip=4):
        super().__init__(env)
        self.size = size
        self.skip = skip

    def ProcessFrame(self, frame):
        """Convert an RGB frame to grayscale, resize it, scale it and add a leading axis."""
        side = self.size
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        scaled = cv2.resize(gray, (side, side)) / 255.0
        return np.expand_dims(scaled, axis=0)

    def step(self, action):
        """Repeat ``action`` for up to ``skip`` frames, stopping early when the episode ends."""
        reward_total = 0
        for _ in range(self.skip):
            raw_frame, reward, done, info = self.env.step(action)
            reward_total += reward
            if done:
                break
        # Only the last frame seen is preprocessed and returned.
        return self.ProcessFrame(raw_frame), reward_total, done, info

    def reset(self):
        """Reset the wrapped environment and return its preprocessed first frame."""
        return self.ProcessFrame(self.env.reset())
    
class CustomBreakout_stack(Wrapper):
    """Frame-skipping wrapper that returns a stack of the four latest frames.

    Observations are arrays built with ``np.stack`` from ``self.history``,
    which ``reset`` fills with four copies of the first preprocessed frame and
    ``step`` updates as a sliding window.
    """

    def __init__(self, env, size=84, skip=4):
        super().__init__(env)
        self.size = size
        self.skip = skip
        # Sliding window of preprocessed frames; populated by reset().
        self.history = []

    def ProcessFrame(self, frame):
        """Convert an RGB frame to a resized grayscale image scaled by 1/255."""
        side = self.size
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return cv2.resize(gray, (side, side)) / 255.0

    def step(self, action):
        """Repeat ``action`` for up to ``skip`` frames and return the updated frame stack."""
        reward_total = 0
        for _ in range(self.skip):
            raw_frame, reward, done, info = self.env.step(action)
            reward_total += reward
            if done == True:
                break

        # Push the newest frame, then drop the oldest one.
        self.history.append(self.ProcessFrame(raw_frame))
        self.history.pop(0)

        return np.stack(self.history), reward_total, done, info

    def reset(self):
        """Reset the wrapped environment and start the stack with four copies of the first frame."""
        first_frame = self.ProcessFrame(self.env.reset())
        self.history = [first_frame] * 4
        return np.stack(self.history)

def CreateBreakout(stack=True):
    """Create the 'SpaceInvaders-v0' environment wrapped for training.

    Args:
        stack: when truthy, wrap with ``CustomBreakout_stack`` (stacked
            frames); otherwise wrap with ``CustomBreakout`` (single frame).

    Returns:
        The wrapped environment.
    """
    base_env = gym.make('SpaceInvaders-v0')
    wrapper_cls = CustomBreakout_stack if stack else CustomBreakout
    return wrapper_cls(base_env)

class MultipleBreakout:
    """Run ``N`` wrapped environments side by side and batch their results."""

    def __init__(self, N, stack=True):
        self.envs = [CreateBreakout(stack) for _ in range(N)]

    def reset(self):
        """Reset every environment and return the stacked first observations."""
        return np.stack([env.reset() for env in self.envs])

    def step(self, actions):
        """Step each environment with its own action.

        An environment that reports ``done`` is reset immediately, and the
        observation returned for it is the one produced by that reset.

        Returns:
            Stacked observations, stacked rewards, stacked done flags, and the
            ``info`` object of the LAST environment only (not one per
            environment).
        """
        observations, rewards, dones = [], [], []
        for env, action in zip(self.envs, actions):
            ob, reward, done, info = env.step(action)
            observations.append(env.reset() if done else ob)
            rewards.append(reward)
            dones.append(done)
        return np.stack(observations), np.stack(rewards), np.stack(dones), info

    def render(self):
        """Render every environment in turn."""
        for single_env in self.envs:
            single_env.render()





In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.distributions import Categorical

import numpy as np

class QNet(nn.Module):
    """Convolutional network mapping a 4-channel 84x84 input to 4 output values."""

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Side length of the conv3 feature map for an 84x84 input.
        self.ConvOutSize = self.get_conv_out_size()
        flat_features = self.ConvOutSize * self.ConvOutSize * 64

        self.fc = nn.Linear(flat_features, 512)
        self.Q = nn.Linear(512, 4)

        self.initialize_weights()

    def forward(self, x):
        """Return the output of the ``Q`` head for a batch of inputs."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        flat = x.view(-1, self.ConvOutSize * self.ConvOutSize * 64)
        hidden = F.relu(self.fc(flat))
        return self.Q(hidden)

    def initialize_weights(self):
        """Xavier-uniform weights and zero biases for every conv and linear layer."""
        for layer in self.modules():
            if isinstance(layer, (nn.Conv2d, nn.Linear)):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.constant_(layer.bias, 0)

    def get_conv_out_size(self):
        """Push a dummy (1, 4, 84, 84) tensor through the convs and return its last dimension."""
        probe = torch.zeros(1, 4, 84, 84)
        with torch.no_grad():
            feature_map = self.conv3(self.conv2(self.conv1(probe)))
        return feature_map.shape[-1]

class QNet_LSTM(nn.Module):
    """Recurrent variant: 1-channel 84x84 input, conv features, LSTM cell, 4 output values."""

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor over a single frame.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Side length of the conv3 feature map for an 84x84 input.
        self.ConvOutSize = self.get_conv_out_size()
        flat_features = self.ConvOutSize * self.ConvOutSize * 64

        self.lstm = nn.LSTMCell(flat_features, 512)
        self.Q = nn.Linear(512, 4)

        self.initialize_weights()

    def forward(self, x, hidden):
        """Advance the LSTM one step and return ``(q, (h, c))``."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        flat = x.view(-1, self.ConvOutSize * self.ConvOutSize * 64)
        h, c = self.lstm(flat, hidden)
        return self.Q(h), (h, c)

    def initialize_weights(self):
        """Xavier-uniform conv/linear weights with zero biases; LSTM biases are zeroed too."""
        for layer in self.modules():
            if isinstance(layer, (nn.Conv2d, nn.Linear)):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.constant_(layer.bias, 0)
            elif isinstance(layer, nn.LSTMCell):
                for lstm_bias in (layer.bias_ih, layer.bias_hh):
                    nn.init.constant_(lstm_bias, 0)

    def get_conv_out_size(self):
        """Push a dummy (1, 1, 84, 84) tensor through the convs and return its last dimension."""
        probe = torch.zeros(1, 1, 84, 84)
        with torch.no_grad():
            feature_map = self.conv3(self.conv2(self.conv1(probe)))
        return feature_map.shape[-1]

class ActorCriticNet(nn.Module):
    """Actor-critic network: 4-channel 84x84 input, 4 action probabilities and one state value."""

    def __init__(self):
        super().__init__()

        # Shared convolutional feature extractor.
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Side length of the conv3 feature map for an 84x84 input.
        self.ConvOutSize = self.get_conv_out_size()
        flat_features = self.ConvOutSize * self.ConvOutSize * 64

        self.fc = nn.Linear(flat_features, 512)

        # Policy and value heads on top of the shared 512-unit layer.
        self.Pi = nn.Linear(512, 4)
        self.V = nn.Linear(512, 1)

        self.initialize_weights()

    def forward(self, x):
        """Return ``(prob, value)``: softmax action probabilities and the value estimate."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        flat = x.view(-1, self.ConvOutSize * self.ConvOutSize * 64)
        shared = F.relu(self.fc(flat))

        prob = F.softmax(self.Pi(shared), dim=-1)
        value = self.V(shared)
        return prob, value

    def initialize_weights(self):
        """Xavier-uniform weights and zero biases for every conv and linear layer."""
        for layer in self.modules():
            if isinstance(layer, (nn.Conv2d, nn.Linear)):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.constant_(layer.bias, 0)

    def get_conv_out_size(self):
        """Push a dummy (1, 4, 84, 84) tensor through the convs and return its last dimension."""
        probe = torch.zeros(1, 4, 84, 84)
        with torch.no_grad():
            feature_map = self.conv3(self.conv2(self.conv1(probe)))
        return feature_map.shape[-1]

class ActorCriticNet_LSTM(nn.Module):
    """Recurrent actor-critic: 1-channel 84x84 input, LSTM cell, policy and value heads."""

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor over a single frame.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Side length of the conv3 feature map for an 84x84 input.
        self.ConvOutSize = self.get_conv_out_size()
        flat_features = self.ConvOutSize * self.ConvOutSize * 64

        self.lstm = nn.LSTMCell(flat_features, 512)

        # Policy and value heads read the LSTM hidden state.
        self.Pi = nn.Linear(512, 4)
        self.V = nn.Linear(512, 1)

        self.initialize_weights()

    def forward(self, x, hidden):
        """Advance the LSTM one step and return ``(prob, value, (h, c))``."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        flat = x.view(-1, self.ConvOutSize * self.ConvOutSize * 64)

        h, c = self.lstm(flat, hidden)

        prob = F.softmax(self.Pi(h), dim=-1)
        value = self.V(h)
        return prob, value, (h, c)

    def initialize_weights(self):
        """Xavier-uniform conv/linear weights with zero biases; LSTM biases are zeroed too."""
        for layer in self.modules():
            if isinstance(layer, (nn.Conv2d, nn.Linear)):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.constant_(layer.bias, 0)
            elif isinstance(layer, nn.LSTMCell):
                for lstm_bias in (layer.bias_ih, layer.bias_hh):
                    nn.init.constant_(lstm_bias, 0)

    def get_conv_out_size(self):
        """Push a dummy (1, 1, 84, 84) tensor through the convs and return its last dimension."""
        probe = torch.zeros(1, 1, 84, 84)
        with torch.no_grad():
            feature_map = self.conv3(self.conv2(self.conv1(probe)))
        return feature_map.shape[-1]

In [5]:
# Path of the CSV training log. main() opens it in append mode and writes one
# row (step, summed reward, duration in seconds) whenever info['ale.lives'] is 0.
import csv
filename = './drive/MyDrive/ai_group/training_log_SpaceInvaders_rand.csv'

In [6]:
#required imports

import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch.distributions import Categorical

import numpy as np
import matplotlib.pyplot as plt
from statistics import mean

#required for timing games
import time

In [7]:
# settings
# Hyperparameters and output paths, read as module-level globals by train() and main().

# main() keeps collecting rollouts while step <= Train_max_step.
Train_max_step         = 4000000
# Learning rate passed to torch.optim.Adam in main().
learning_rate          = 1e-4
# Discount factor used in the TD target and in the GAE recursion.
gamma                  = 0.99
# GAE lambda, multiplied with gamma in the advantage recursion.
lambd                  = 0.95
# The PPO probability ratio is clamped to [1 - eps_clip, 1 + eps_clip].
eps_clip               = 0.1
# Number of gradient updates train() performs on each rollout batch.
K_epoch                = 10
# Number of environments created by MultipleBreakout in main().
N_worker               = 1
# Environment steps collected per rollout before each train() call.
T_horizon              = 16
# The model and the score history are saved every save_interval steps.
save_interval          = 250
# File that torch.save writes Net.state_dict() to.
model_path             = './drive/MyDrive/ai_group/Models/SpaceInvaders_rand.model'
# Target passed to np.save for the score history (main() reloads it with '.npy' appended).
history_path           = './drive/MyDrive/ai_group/Train_Historys/SpaceInvaders_rand'

In [8]:
#setup training function

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Show which device was selected.
print(device)

def train(Net, optimizer, states, actions, rewards, next_states, dones, old_probs):
    """Run ``K_epoch`` clipped-PPO updates on one rollout batch.

    Reads the module-level settings ``K_epoch``, ``gamma``, ``lambd``,
    ``eps_clip``, ``T_horizon``, ``N_worker`` and ``device``.

    Args:
        Net: actor-critic module; ``Net(x)`` must return ``(probs, values)``.
        optimizer: optimizer stepping ``Net``'s parameters.
        states: sequence of per-step observation batches, reshaped to
            (T*N, 4, 84, 84).
        actions: sequence of per-step action indices, reshaped to (T*N, 1).
        rewards: sequence of per-step rewards, reshaped to (T*N, 1).
        next_states: sequence of per-step successor observations, reshaped to
            (T*N, 4, 84, 84).
        dones: despite the name, these are continuation masks: main() stores
            ``1 - done``, so 0 marks a terminal transition. Reshaped to
            (T*N, 1).
        old_probs: probability recorded at rollout time for each stored
            action, reshaped to (T*N, 1).

    Returns:
        None. ``Net`` is updated in place through ``optimizer``.
    """
    def to_tensor(batch, dtype):
        # Stack through a single ndarray first: building a tensor directly from a
        # Python list of ndarrays is a slow path that PyTorch warns about.
        return torch.as_tensor(np.asarray(batch), dtype=dtype)

    states = to_tensor(states, torch.float32).view(-1, 4, 84, 84).to(device) # (T*N, 4, 84, 84)
    actions = to_tensor(actions, torch.long).view(-1, 1).to(device) # (T*N, 1)
    rewards = to_tensor(rewards, torch.float32).view(-1, 1).to(device) # (T*N, 1)
    next_states = to_tensor(next_states, torch.float32).view(-1, 4, 84, 84).to(device) # (T*N, 4, 84, 84)
    dones = to_tensor(dones, torch.float32).view(-1, 1).to(device) # (T*N, 1)
    old_probs = to_tensor(old_probs, torch.float32).view(-1, 1).to(device) # (T*N, 1)

    # The masks do not change between epochs, so convert them once.
    masks = dones.view(T_horizon, N_worker, 1).cpu().numpy() # (T, N, 1)

    for _ in range(K_epoch):
        probs, values = Net(states) # (T*N, num_action), (T*N, 1)
        _, next_values = Net(next_states) # (T*N, 1)

        # Bootstrapped target; the mask removes the bootstrap term on terminal steps.
        td_targets = rewards + gamma * next_values * dones # (T*N, 1)
        deltas = td_targets - values # (T*N, 1)

        # calculate GAE, walking the rollout backwards in time
        deltas = deltas.view(T_horizon, N_worker, 1).cpu().detach().numpy() # (T, N, 1)
        advantages = []
        advantage = 0
        for delta, mask in zip(deltas[::-1], masks[::-1]):
            advantage = gamma * lambd * advantage * mask + delta
            advantages.append(advantage)
        advantages.reverse()
        advantages = to_tensor(advantages, torch.float32).view(-1, 1).to(device) # (T*N, 1)

        probs_a = probs.gather(1, actions) # (T*N, 1)

        m = Categorical(probs)
        entropy = m.entropy()

        # Ratio of the current to the rollout-time probability of the stored action.
        ratio = torch.exp(torch.log(probs_a) - torch.log(old_probs))
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantages

        actor_loss = -torch.mean(torch.min(surr1, surr2))
        critic_loss = F.smooth_l1_loss(values, td_targets.detach())
        entropy_loss = torch.mean(entropy)

        # The entropy term is subtracted, so higher policy entropy lowers the loss.
        loss = actor_loss + critic_loss - 0.01 * entropy_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

cuda


Remember to uncomment the marked lines in the following block if you want to load a previous model state!

In [9]:
#define main training loop

def main():
    """Collect rollouts from the environments and train the actor-critic with PPO.

    Repeats until ``step`` exceeds ``Train_max_step``: gather ``T_horizon``
    transitions from ``N_worker`` environments, then call ``train`` on them.
    Every ``save_interval`` steps the model weights and the running mean score
    are written to ``model_path`` / ``history_path``. Each time
    ``info['ale.lives']`` is 0, one row (step, summed reward, duration) is
    appended to the CSV file ``filename`` and printed.

    NOTE(review): actions are drawn uniformly at random rather than sampled
    from the policy (the policy-sampled action was commented out in the
    original code). This looks like an intentional random-action baseline --
    confirm before relying on the trained weights.
    """
    env = MultipleBreakout(N_worker)
    Net = ActorCriticNet().to(device)

    # Uncomment the next line to load a previous model.
    #Net.load_state_dict(torch.load(model_path))                  #Uncomment this line to load a previous model

    optimizer = torch.optim.Adam(Net.parameters(), learning_rate)

    scores = [0.0 for _ in range(N_worker)]
    score_history = []
    train_history = []
    reward_sum = []

    # Uncomment the next line as well if you load a previous model.
    #train_history = np.load(history_path+'.npy').tolist()        #Uncomment this line to load a previous model

    step = 0

    state = env.reset() # (N, 4, 84, 84)

    print("Train Start")
    start_time = time.perf_counter()

    while step <= Train_max_step:
        states, actions, rewards, next_states, dones, old_probs = [], [], [], [], [], []
        for _ in range(T_horizon):
            # Rollout inference needs no gradients; skip building the autograd graph.
            with torch.no_grad():
                prob, _ = Net(torch.as_tensor(state, dtype=torch.float32, device=device)) # (N, num_action)

            # Random-action baseline: one uniformly random action per worker.
            # To act with the policy instead: Categorical(prob).sample().cpu().numpy()
            action = np.random.randint(prob.shape[-1], size=prob.shape[0]) # (N,)

            # Record the policy's probability of the action that is actually
            # executed and stored, so the PPO ratio in train() compares the
            # same action before and after the update.
            action_index = torch.as_tensor(action, dtype=torch.long, device=device).unsqueeze(1) # (N, 1)
            old_prob = prob.gather(1, action_index).cpu().numpy() # (N, 1)

            next_state, reward, done, info = env.step(action)
            # env.step returns the info dict of the last environment only.
            lives = info['ale.lives']
            reward_sum.append(reward)

            if lives == 0:
                end_time = time.perf_counter()
                dur = end_time - start_time
                start_time = end_time
                # Summed reward of the first worker since the previous log row.
                game_reward = int(sum(reward_sum)[0])
                # newline='' is what the csv module expects for files it writes to.
                with open(filename, mode='a', newline='') as csv_file:
                    csv_writer = csv.writer(csv_file, delimiter=',')
                    csv_writer.writerow([step, game_reward, dur])
                dur_s = "{:.3f} s".format(dur)
                print("Step : {0:7} | Reward : {1:4} | Duration :".format(step, game_reward),
                      dur_s.rjust(9,' '))
                reward_sum.clear()

            # save transition
            states.append(state) # (T, N, 4, 84, 84)
            actions.append(action) # (T, N)
            rewards.append(reward/10.0) # (T, N)
            next_states.append(next_state) # (T, N, 4, 84, 84)
            dones.append(1-done) # (T, N); stored as continuation mask (0 = terminal)
            old_probs.append(old_prob) # (T, N, 1)

            # record score and check done
            for i, (r, d) in enumerate(zip(reward, done)):
                scores[i] += r

                if d:
                    score_history.append(scores[i])
                    scores[i] = 0.0
                    # Keep only the 250 most recent episode scores.
                    if len(score_history) > 250:
                        del score_history[0]

            state = next_state

            step += 1

            if step % save_interval == 0:
                torch.save(Net.state_dict(), model_path)
                # mean() raises on an empty list; record NaN until an episode has finished
                # so the history stays aligned with the save interval.
                train_history.append(mean(score_history) if score_history else float('nan'))
                np.save(history_path, np.array(train_history))
                # print("step : {}, Average score of last 250 episode : {:.1f}".format(step, mean(score_history)))

        train(Net, optimizer, states, actions, rewards, next_states, dones, old_probs)

    torch.save(Net.state_dict(), model_path)
    np.save(history_path, np.array(train_history))
    # print("Train end, avg_score of last 100 episode : {}".format(mean(score_history)))

In [None]:
# Entry point: start the training loop when this code runs as __main__.
if __name__ == "__main__":
    main()

Train Start
Step :     160 | Reward :   60 | Duration :   3.383 s
Step :     286 | Reward :   35 | Duration :   5.091 s
Step :     425 | Reward :   15 | Duration :   2.769 s
Step :     664 | Reward :  120 | Duration :   4.697 s
Step :     802 | Reward :   70 | Duration :   2.805 s
Step :    1006 | Reward :  125 | Duration :   3.833 s
Step :    1160 | Reward :   25 | Duration :   3.094 s
Step :    1278 | Reward :   10 | Duration :   2.483 s
Step :    1454 | Reward :   95 | Duration :   3.357 s
Step :    1550 | Reward :   20 | Duration :   1.912 s
Step :    1676 | Reward :   15 | Duration :   2.458 s
Step :    1829 | Reward :   35 | Duration :   3.113 s
Step :    2061 | Reward :  155 | Duration :   4.377 s
Step :    2161 | Reward :   10 | Duration :   2.144 s
Step :    2317 | Reward :   25 | Duration :   2.888 s
Step :    2539 | Reward :   90 | Duration :   4.387 s
Step :    2644 | Reward :   30 | Duration :   2.165 s
Step :    2983 | Reward :  565 | Duration :   6.519 s
Step :    3130 |