In [1]:
import gym
import random
import time
import matplotlib.pyplot as plt
import math
import numpy as np
from collections import namedtuple, deque
from itertools import count
from copy import deepcopy
from PIL import Image

%matplotlib inline

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T
from torchsummary import summary

In [3]:
# Set use_cuda to False to force CPU execution even when a GPU is present.
use_cuda = True
print("CUDA Available: ", torch.cuda.is_available())

# The GPU is selected only when it is both requested and actually available.
if use_cuda and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

CUDA Available:  True


In [4]:
class ReplayMemory():
    '''
    Bounded store of (state, action, reward, done, next_state) transitions
    from which random training batches are drawn.

    The underlying deque is created with ``maxlen = capacity``, so once it is
    full every new transition pushes out the oldest one.
    '''
    def __init__(self, capacity):
        '''
        :param capacity: maximum number of transitions kept in the buffer
        '''
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, done, next_state):
        '''
        Stores a single transition at the end of the buffer.

        :param state: observation the action was taken from
        :param action: action that was taken
        :param reward: reward received for the action
        :param done: "done" flag, True when the episode finished
        :param next_state: observation that followed the action
        '''
        self.buffer.append((state, action, reward, done, next_state))

    def sample(self, batch_size):
        '''
        Draws transitions uniformly at random, without replacement.

        If the buffer holds fewer than ``batch_size`` transitions, all of
        them are returned (in random order) instead.

        :param batch_size: requested number of transitions
        :return: tuple (states, actions, rewards, dones, next_states) of numpy
                 arrays; the first axis of every array indexes the sampled
                 transitions, the remaining axes are whatever shape the
                 stored values have
        '''
        available = self.count()
        draw = batch_size if available >= batch_size else available
        picked = random.sample(self.buffer, draw)

        def column(field):
            # Each entry is converted to an array on its own before stacking,
            # so array-like wrappers are materialized one by one.
            return np.array([np.array(transition[field]) for transition in picked])

        return column(0), column(1), column(2), column(3), column(4)

    def count(self):
        '''
        :return: number of transitions currently stored
        '''
        return len(self.buffer)

In [5]:
class DQN(nn.Module):
    '''
    Deep Q-Network: three convolutional layers followed by two fully
    connected layers, producing one Q-value per action.
    '''
    def __init__(self, num_actions):
        '''
        :param num_actions: number of discrete actions, i.e. the width of the
                            output layer
        '''
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4, padding=0)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0)

        # For 84x84 inputs the spatial size shrinks 84 -> 20 -> 9 -> 7, so the
        # flattened conv3 output has 7*7*64 features.
        self.fc1 = nn.Linear(7*7*64, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, inputs):
        '''
        Forward propagation

        :param inputs: batch of stacked frames, either channel-last
                       (batch_size, height, width, frames) or channel-first
                       (batch_size, frames, height, width), where frames
                       matches the input channels of the first convolution
        :return: Q-values, shape (batch_size, num_actions)
        '''
        frames = self.conv1.in_channels

        # Conv2d expects channel-first data. Channel-last batches must be
        # permuted: a plain view() keeps the memory order and therefore mixes
        # pixels of different frames and rows into each "channel" instead of
        # moving the frame axis.
        # NOTE: weights trained with view()-based reshaping were fitted to that
        # scrambled layout and will not transfer to correctly ordered inputs.
        if inputs.size(1) != frames and inputs.size(-1) == frames:
            inputs = inputs.permute(0, 3, 1, 2)

        x = F.relu(self.conv1(inputs))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))

        # Flatten everything except the batch axis for the dense layers
        # (reshape, because the permuted input may make x non-contiguous)
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
        
    

In [23]:
import os
from atari_wrappers import wrap_deepmind
import datetime

class PongAgent():
    '''
    DQN agent for Pong. Owns the environment, the estimation and target
    networks, the replay memory and the optimizer, and provides training,
    checkpointing and playback.
    '''
    def __init__(self):
        '''
        Constructor for the agent
        '''
        self.env = wrap_deepmind(gym.make('PongDeterministic-v4'))
        self.num_actions = self.env.action_space.n

        self.dqn = DQN(self.num_actions).to(device)
        self.target_dqn = DQN(self.num_actions).to(device)

        # Summarize on the configured device. Calling .cuda() on the network
        # here would fail on CPU-only machines and could leave the estimation
        # and target networks on different devices.
        summary(self.dqn, (84, 84, 4), device=device.type)

        self.buffer = ReplayMemory(1000000)

        self.discount_factor = 0.99

        self.mse_loss = nn.MSELoss()
        self.optim = optim.RMSprop(self.dqn.parameters(), lr = 0.0001)

        self.out_dir = './model'

        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

    def to_var(self, x):
        '''
        Moves x to the configured device

        :param x: torch Tensor
        :return: the tensor on the configured device
        '''
        return x.to(device)

    def _q_values(self, network, states):
        '''
        Runs a batch of states through the given network

        :param network: network to evaluate (estimation or target)
        :param states: states, numpy array (or array-like), batch axis first
        :return: Q values, Tensor, the shape is (batch_size, num_actions)
        '''
        batch = self.to_var(torch.from_numpy(np.asarray(states)).float())
        return network(batch)

    def predict_q_values(self, states):
        '''
        Compute Q values by passing states through the estimation network

        :param states: states, numpy array, batch axis first
        :return: Q values, Tensor, the shape is (batch_size, num_actions)
        '''
        return self._q_values(self.dqn, states)

    def predict_target_q_values(self, states):
        '''
        Compute Q values by passing states through the target network

        :param states: states, numpy array, batch axis first
        :return: Q values, Tensor, the shape is (batch_size, num_actions)
        '''
        return self._q_values(self.target_dqn, states)

    def select_actions(self, state, epsilon):
        '''
        Select action according to epsilon greedy policy: with probability
        epsilon a uniformly random action is returned, otherwise the action
        with the highest Q value predicted by the estimation network.

        :param state: a single (un-batched) observation
        :param epsilon: probability of picking a random action

        :return: action index
        '''
        if np.random.random() < epsilon:
            return np.random.randint(self.num_actions)

        # The network expects a batch, so add a leading batch axis
        batch = np.expand_dims(state, 0)
        with torch.no_grad():  # action selection never needs gradients
            q_values = self.predict_q_values(batch)
        return int(q_values.argmax(dim=1).item())

    def update(self, states, targets, actions):
        '''
        Compute loss and do a backward propagation

        :param states: states, numpy array, batch axis first
        :param targets: Q targets, numpy array, the shape is (batch_size,)
        :param actions: taken actions, numpy array, the shape is (batch_size,)
        '''
        # Both become column vectors so they line up with the gathered values
        targets = self.to_var(torch.unsqueeze(torch.from_numpy(targets).float(), -1))
        actions = self.to_var(torch.unsqueeze(torch.from_numpy(actions).long(), -1))

        predicted_values = self.predict_q_values(states)
        # Keep only the Q value of the action that was actually taken
        affected_values = torch.gather(predicted_values, 1, actions)
        loss = self.mse_loss(affected_values, targets)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def get_epsilon(self, total_steps, max_epsilon_steps, epsilon_start, epsilon_final):
        '''
        Calculate epsilon value. Epsilon starts at epsilon_start, decreases by
        1/max_epsilon_steps with every step and never drops below
        epsilon_final.

        :param total_steps: total number of steps since the training began
        :param max_epsilon_steps: number of steps over which epsilon would
                                  decrease by 1.0; if not positive, the
                                  final value is returned right away
        :param epsilon_start: start epsilon value, e.g. 1
        :param epsilon_final: final epsilon value, effectively a limit
        :return: calculated epsilon value
        '''
        if max_epsilon_steps <= 0:
            # No decay phase requested; also avoids a division by zero
            return epsilon_final
        return max(epsilon_final, epsilon_start - total_steps/max_epsilon_steps)

    def sync_target_network(self):
        '''
        Copies weights from estimation to target network
        '''
        self.target_dqn.load_state_dict(self.dqn.state_dict())

    def calculate_q_targets(self, next_states, rewards, dones):
        '''
        Calculates Q-targets: reward + discount * max Q value of the next
        state as predicted by the target network (0 for finished episodes)

        :param next_states: next states, numpy array, batch axis first
        :param rewards: rewards, numpy array, shape is (batch_size,)
        :param dones: dones, numpy array, shape is (batch_size,)
        :return: Q targets, numpy array, shape is (batch_size,)
        '''
        dones_mask = (dones==1)

        # Targets are constants for the optimizer, so skip gradient tracking
        with torch.no_grad():
            predicted_q_target_values = self.predict_target_q_values(next_states)
        next_max_q_value = np.max(predicted_q_target_values.cpu().numpy(), axis=1)
        next_max_q_value[dones_mask] = 0  # No future value once the game is over
        q_targets = rewards + self.discount_factor*next_max_q_value

        return q_targets

    def save_final_model(self):
        '''
        Saves final model to the disk
        '''
        filename = '{}/final_model.pth'.format(self.out_dir)
        torch.save(self.dqn.state_dict(), filename)

    def save_model_during_training(self, episode):
        '''
        Saves temporary models to the disk during training

        :param episode: episode number
        '''
        filename = '{}/current_model_{}'.format(self.out_dir, episode)
        torch.save(self.dqn.state_dict(), filename)

    def load_model(self, filename):
        '''
        Loads model from the disk into the estimation network and copies it
        to the target network

        :param filename: model filename
        '''
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only
        # machine (and vice versa)
        self.dqn.load_state_dict(torch.load(filename, map_location=device))
        self.sync_target_network()

    def play(self, episodes):
        '''
        Plays the game greedily (epsilon = 0) and renders it

        :param episodes: number of episodes to play
        '''
        for _ in range(episodes):
            done = False
            state = self.env.reset()
            while not done:
                action = self.select_actions(state, 0)
                state, reward, done, _ = self.env.step(action)
                self.env.render()
                time.sleep(0.01)

    def close_env(self):
        '''
        Closes the environment. Should be called to clean-up
        '''
        self.env.close()

    def _populate_replay_buffer(self, num_transitions):
        '''
        Fills the replay buffer with transitions produced by random actions

        :param num_transitions: number of transitions to collect
        '''
        state = self.env.reset()
        for _ in range(num_transitions):
            action = self.select_actions(state, 1)    #Choose a random action
            next_state, reward, done, _info = self.env.step(action)
            self.buffer.add(state, action, reward, done, next_state)

            # After a finished episode the next transition must start from the
            # fresh reset observation, not from the terminal one
            state = self.env.reset() if done else next_state

    def train(self, replay_buffer_fill_len, batch_size, episodes, stop_reward,
              max_epsilon_steps, epsilon_start, epsilon_final, sync_target_net_freq):
        '''
        Trains the network

        :param replay_buffer_fill_len: how many elements should replay buffer contain
                                       before training start
        :param batch_size: batch size
        :param episodes: how many episodes (max. value) to iterate
        :param stop_reward: running reward value to be reached. upon reaching that
                            value the training is stopped
        :param max_epsilon_steps: maximum number of epsilon steps
        :param epsilon_start: start epsilon value, e.g. 1
        :param epsilon_final: final epsilon value, effectively a limit
        :param sync_target_net_freq: how often (in steps) to sync estimation and
                                     target networks
        '''

        start_time = time.time()
        print('Start training at: '+ time.asctime(time.localtime(start_time)))

        total_steps = 0
        running_episode_reward = 0

        #populate replay memory
        print("Populating replay buffer.... \n")
        self._populate_replay_buffer(replay_buffer_fill_len)
        print("Replay buffer populated with {} transitions, start training...\n".format(self.buffer.count()))

        for i in range(1, episodes+1):
            #reset the environment
            done = False
            state = self.env.reset()

            #reset episode reward and length
            episode_reward = 0
            episode_length = 0

            #play until the episode ends
            while not done:
                #synchronize target network with estimation network in required frequency
                #(also true at step 0, so both networks start out identical)
                if total_steps%sync_target_net_freq == 0:
                    print("Synchronizing target network...\n")
                    self.sync_target_network()

                #calculate epsilon and select an epsilon-greedy action
                epsilon = self.get_epsilon(total_steps, max_epsilon_steps, epsilon_start, epsilon_final)
                action = self.select_actions(state, epsilon)

                #execute action in the environment
                next_state, reward, done, _ = self.env.step(action)
                self.buffer.add(state, action,  reward, done, next_state)

                #sample random minibatch of transitions
                s_batch, a_batch, r_batch, d_batch, next_s_batch = self.buffer.sample(batch_size)

                #estimate Q targets using the target network
                q_targets = self.calculate_q_targets(next_s_batch, r_batch, d_batch)

                #update weights in the estimation network
                self.update(s_batch, q_targets, a_batch)

                #set the state for the next action selection and update the reward and counters
                state = next_state
                total_steps += 1
                episode_length += 1
                episode_reward += reward

            #exponential moving average of the episode rewards
            running_episode_reward = running_episode_reward*0.9 + 0.1*episode_reward

            if (i % 10) == 0 or (running_episode_reward > stop_reward):
                print('global step: {}'.format(total_steps))
                print('episode: {}'.format(i))
                print('running reward: {}'.format(round(running_episode_reward, 2)))
                print('current epsilon: {}'.format(round(epsilon, 2)))
                print('episode_length: {}'.format(episode_length))
                print('episode reward: {}'.format(episode_reward))
                print('\n')

            if (i % 50) == 0 or (running_episode_reward > stop_reward):
                curr_time = time.time()
                print('current time: ' + time.asctime(time.localtime(curr_time)))
                print('running for: ' + str(datetime.timedelta(seconds=curr_time - start_time)))
                print('saving model after {} episodes...'.format(i))
                print('\n')
                self.save_model_during_training(i)

            if running_episode_reward > stop_reward:
                print('stop reward reached!')
                print('saving final model...')
                print('\n')
                self.save_final_model()
                break

        #report the actual finish time, not the start time
        print('Finish training at: '+ time.asctime(time.localtime(time.time())))

In [24]:
# Build the agent: this creates the Pong environment, the estimation and
# target networks and the replay buffer, and prints the model summary below.
agent = PongAgent()

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 20, 20]           8,224
            Conv2d-2             [-1, 64, 9, 9]          32,832
            Conv2d-3             [-1, 64, 7, 7]          36,928
            Linear-4                  [-1, 512]       1,606,144
            Linear-5                    [-1, 6]           3,078
Total params: 1,687,206
Trainable params: 1,687,206
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.11
Forward/backward pass size (MB): 0.17
Params size (MB): 6.44
Estimated Total Size (MB): 6.71
----------------------------------------------------------------


In [0]:
agent.train(
    # replay buffer warm-up and minibatch size
    replay_buffer_fill_len=100,
    batch_size=32,
    # stopping criteria: episode cap and running-reward threshold
    episodes=10**5,
    stop_reward=19,
    # epsilon-greedy exploration schedule
    epsilon_start=1.0,
    epsilon_final=0.02,
    max_epsilon_steps=10**5,
    # steps between target-network synchronizations
    sync_target_net_freq=10000,
)

Start training at: Sat May  9 00:37:57 2020
Populating replay buffer.... 

Replay buffer populated with 100 transitions, start training...

Synchronizing target network...

global step: 8679
episode: 10
running reward: -13.07
current epsilon: 0.91
episode_length: 990
episode reward: -20.0


Synchronizing target network...

global step: 18332
episode: 20
running reward: -17.48
current epsilon: 0.82
episode_length: 1204
episode reward: -17.0


Synchronizing target network...

global step: 28098
episode: 30
running reward: -19.02
current epsilon: 0.72
episode_length: 1262
episode reward: -20.0


Synchronizing target network...

global step: 37208
episode: 40
running reward: -19.76
current epsilon: 0.63
episode_length: 915
episode reward: -20.0


Synchronizing target network...

global step: 45517
episode: 50
running reward: -20.41
current epsilon: 0.54
episode_length: 762
episode reward: -21.0


current time: Sat May  9 00:43:46 2020
running for: 0:05:49.783471
saving model after 50 episo

In [25]:
# Load the checkpoint written after 400 training episodes and watch the agent
# play three episodes.
agent.load_model('./model/current_model_400')
try:
    agent.play(3)
finally:
    # Always release the environment (and its render window), even when
    # playback fails or is interrupted from the notebook.
    agent.close_env()

