In [23]:
from environment import Environment
from agent import Agent

import random
import numpy as np
import math
from collections import deque
import os
import sys

import torch
import torch.nn.functional as F
import torch.optim as optim

from agent import Agent
#from dqn_model import DQN

# Seed the three RNGs this notebook imports (PyTorch, NumPy and Python's
# `random`) with the same fixed value to make runs more repeatable.
torch.manual_seed(595)
np.random.seed(595)
random.seed(595)

In [24]:
# There are 3 convolutional layers followed by
# 2 fully-connected layers. The first convolutional layer has
# 32 8×8 filters with stride 4, the second 64 4×4 filters with
# stride 2, and the third and final convolutional layer consists
# of 64 3×3 filters with stride 1. As shown in Figure 1, the
# dueling network splits into two streams of fully-connected
# layers. The value and advantage streams both have a fully-connected
# layer with 512 units. The final hidden layers of
# the value and advantage streams are both fully-connected
# with the value stream having one output and the advantage
# as many outputs as there are valid actions. We combine the
# value and advantage streams using the module described by
# Equation (9). Rectifier non-linearities (Fukushima, 1980)
# are inserted between all adjacent layers.

import torch.nn as nn
import torch.nn.functional as F


class DuelingDQN(nn.Module):
    """Dueling Q-network: a convolutional trunk followed by value/advantage heads.

    The trunk is three convolutions and one 512-unit fully-connected layer.
    Two linear heads then produce a scalar state value V(s) and one advantage
    A(s, a) per action, which `forward` combines into Q-values.

    Note: both heads share the single 512-unit hidden layer `fc1`; there is
    no separate hidden layer per stream.

    Hints:
    -----
        Original paper for DQN
    https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf
    """

    def __init__(self, in_channels=4, num_actions=4):
        """
        Parameters:
        -----------
        in_channels: number of channels of the input, i.e. the number of most
                recent frames stacked together (4 by default).
        num_actions: number of action-values to output, one per game action.

        The flattened size fed to `fc1` (64*7*7) is what the three
        convolutions produce for 84x84 inputs; other spatial sizes will not
        match `fc1`.
        """
        super(DuelingDQN, self).__init__()
        # Honour `in_channels` instead of hard-coding 4, so the argument
        # actually controls the first convolution.
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(in_features=64*7*7, out_features=512)
        # Attribute names are kept as-is so existing state_dict keys still load.
        self.V = nn.Linear(in_features=512, out_features=1)
        self.A = nn.Linear(in_features=512, out_features=num_actions)

    def forward(self, x):
        """
        Compute Q-values for a batch of stacked frames.

        Parameters:
        -----------
        x: float tensor of shape (batch, in_channels, 84, 84).

        Returns:
        --------
        Tensor of shape (batch, num_actions) with
        Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        value = self.V(x)
        advantage = self.A(x)
        # Subtracting the mean advantage makes the V/A decomposition
        # identifiable: the mean of Q over actions equals V.
        return value + (advantage - advantage.mean(dim=1, keepdim=True))


In [51]:
class Agent_DQN(Agent):
    """DQN agent for Atari Breakout built on the dueling Q-network.

    The agent keeps an online ("policy") network and a periodically
    synchronised target network, a fixed-size replay buffer sampled
    uniformly, and a linearly decayed epsilon-greedy exploration schedule.
    """

    def __init__(self, env, args):
        """
        Set up hyper-parameters, the replay buffer, both networks and the optimizer.

        Parameters:
        -----------
        env: environment wrapper exposing `action_space`. The other methods
             reach it through `self.env`, which is presumably stored by
             `Agent.__init__` -- confirm against the base class.
        args: unused here; kept for interface compatibility.
        """
        super(Agent_DQN,self).__init__(env)

        # Gym parameters
        self.num_actions = env.action_space.n

        # Replay buffer: the deque silently drops the oldest transition once full.
        self.buffer_max_len = 20000
        self.buffer = deque(maxlen=self.buffer_max_len)

        # Training parameters
        self.num_episodes = 20000
        self.batch_size = 32
        self.learning_rate = 1.5e-4
        self.steps_done = 0
        self.target_update = 5000          # env steps between target-network syncs
        self.step_start_learning = 5000    # minimum buffer size before learning starts
        self.gamma = 0.999
        self.epsilon_start = 1
        self.epsilon_end = 0.025
        self.epsilon_decay_steps = 100000
        self.epsilon = self.epsilon_start
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Amount subtracted from epsilon on every learning step (linear decay).
        self.delta_epsilon = (self.epsilon_start - self.epsilon_end)/self.epsilon_decay_steps

        # Model: size the output layer from the environment's action space
        # rather than relying on the network's default action count.
        self.policy_net = DuelingDQN(num_actions=self.num_actions).to(self.device)
        self.target_net = DuelingDQN(num_actions=self.num_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

        # Values to be printed / saved
        self.episode_reward_list = []
        self.moving_reward_avg = []

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game.
        Nothing needs to be initialised for this agent.
        """
        pass

    def make_action(self, observation, test=True):
        """
        Return the action chosen by the agent.

        Input:
            observation: np.array
                stack of the 4 last preprocessed frames, shape: (84, 84, 4).
                It must use the same pixel scaling the networks were trained
                with (`train` divides frames by 255).
            test: bool
                True  -> greedy action from the target network.
                False -> epsilon-greedy action using the policy network.
        Return:
            action: int
        """
        # Exploration branch: decided before building any tensor so a random
        # action costs no forward pass. No random number is drawn in test mode.
        if not test and np.random.uniform() <= self.epsilon:
            return self.env.action_space.sample()

        # NOTE(review): test mode evaluates the (possibly stale) target
        # network, as the original code did; confirm this is intended.
        net = self.target_net if test else self.policy_net
        with torch.no_grad():
            obs = torch.tensor(observation, dtype=torch.float, device=self.device)
            obs = obs.permute(2, 0, 1).unsqueeze(0)  # HWC -> 1xCxHxW
            return net(obs).max(1)[1].item()

    def push(self, state, reward, action, next_state, done):
        """
        Append one transition to the replay buffer; the deque's `maxlen`
        evicts the oldest entry when the buffer is full.
        """
        self.buffer.append((state, reward, action, next_state, done))

    def replay_buffer(self, batch_size):
        """
        Uniformly sample `batch_size` transitions from the buffer.

        Returns five parallel lists:
        states, rewards, actions, next_states, dones.
        """
        batch = random.sample(self.buffer, batch_size)
        states = [transition[0] for transition in batch]
        rewards = [transition[1] for transition in batch]
        actions = [transition[2] for transition in batch]
        next_states = [transition[3] for transition in batch]
        dones = [transition[4] for transition in batch]
        return states, rewards, actions, next_states, dones

    def update(self):
        """
        Run one optimisation step on a sampled mini-batch.

        Does nothing until the buffer holds `step_start_learning` transitions.
        Each call also advances the linear epsilon decay.
        """
        if len(self.buffer) < self.step_start_learning:
            return

        # Linear decay, floored so epsilon never drops below epsilon_end.
        if self.epsilon > self.epsilon_end:
            self.epsilon = max(self.epsilon_end, self.epsilon - self.delta_epsilon)

        states, rewards, actions, next_states, dones = self.replay_buffer(self.batch_size)
        loss = self.compute_loss(states, rewards, actions, next_states, dones)
        self.optimizer.zero_grad()
        loss.backward()
        # Clip every gradient element to [-1, 1] in place. The previous
        # `param.grad.data.clamp(-1, 1)` returned a new tensor that was
        # discarded, so no clipping actually happened.
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()

    def compute_loss(self, states, rewards, actions, next_states, dones):
        """
        Smooth-L1 (Huber) loss between Q(s, a) and the one-step TD target
        r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed
        for terminal transitions.

        All arguments are parallel sequences of equal length; states and
        next_states hold arrays of shape (84, 84, 4).
        """
        # Stack into a single ndarray first: building a tensor from a Python
        # list of ndarrays is very slow.
        states = torch.tensor(np.array(states), dtype=torch.float, device=self.device).permute(0, 3, 1, 2)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float, device=self.device).permute(0, 3, 1, 2)
        rewards = torch.tensor(rewards, dtype=torch.float, device=self.device)
        actions = torch.tensor(actions, dtype=torch.long, device=self.device)
        terminal = torch.tensor(np.array(dones, dtype=bool), device=self.device)

        q_taken = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # The target is treated as a constant: no gradient flows into target_net.
        # Its length follows the batch actually passed in, not self.batch_size.
        with torch.no_grad():
            next_q = self.target_net(next_states).max(1)[0]
            next_q = next_q.masked_fill(terminal, 0.0)
        targets = rewards + self.gamma * next_q

        return F.smooth_l1_loss(q_taken, targets)

    def test(self):
        """
        Play 30 greedy episodes on a freshly created, seeded test environment
        and print the mean episode reward.
        """
        test_env = Environment('BreakoutNoFrameskip-v4', None, atari_wrapper=True, test=True)
        rewards = []
        seed = 11037
        total_episodes=30
        test_env.seed(seed)
        for _ in range(total_episodes):
            state = test_env.reset()
            done = False
            episode_reward = 0.0

            # playing one game
            while not done:
                # Use this agent (not the notebook-global `agent`) and apply
                # the same 1/255 scaling that train() applies to observations.
                action = self.make_action(state / 255, test=True)
                state, reward, done, info = test_env.step(action)
                episode_reward += reward

            rewards.append(episode_reward)
        print('Run %d episodes'%(total_episodes))
        print('Mean:', np.mean(rewards))

    def train(self):
        """
        Run the training loop for `num_episodes` episodes.

        Side effects: checkpoints the policy network's state_dict to
        'test_model.pt' every 100 episodes, runs `test()` every 500 episodes,
        and finally writes the 30-episode moving average of rewards to
        'rewards.csv'.
        """
        # A previous run leaves moving_reward_avg as an ndarray (see the end
        # of this method); convert back so `.append` works when re-entered.
        self.moving_reward_avg = list(self.moving_reward_avg)

        for episode in range(self.num_episodes):

            observation = self.env.reset() / 255
            episode_steps = 0
            episode_reward = 0
            done = False

            while not done:
                action = self.make_action(observation, test=False)
                # Step the agent's own environment, not the notebook-global `env`.
                new_observation, reward, done, _ = self.env.step(action)
                new_observation = new_observation / 255

                episode_reward += reward
                episode_steps += 1
                self.steps_done += 1

                self.push(observation, reward, action, new_observation, done)
                self.update()
                observation = new_observation

                # Periodically copy the online weights into the target network.
                if self.steps_done % self.target_update == 0:
                    self.target_net.load_state_dict(self.policy_net.state_dict())

            self.episode_reward_list.append(episode_reward)
            self.moving_reward_avg.append(np.mean(self.episode_reward_list[-30:]))

            if episode % 100 == 0:
                print('episode: {} average reward: {} episode length: {}'.format(episode,
                                                                        self.moving_reward_avg[-1],
                                                                        episode_steps))
                torch.save(self.policy_net.state_dict(), 'test_model.pt')

            if episode % 500 == 0:
                self.test()

        self.moving_reward_avg = np.array(self.moving_reward_avg)
        np.savetxt("rewards.csv", self.moving_reward_avg, delimiter=",")
        print("Done")

Done


In [50]:
# Build the training environment and the agent, then run the full training loop.
env_name = 'BreakoutNoFrameskip-v4'
env = Environment(env_name, None, atari_wrapper=True)
agent = Agent_DQN(env, None)
agent.train()
# Save the final weights as a state_dict, matching the periodic checkpoints
# train() writes to this same path. Pickling the whole module (the previous
# behaviour) overwrote those checkpoints with a different, more fragile format.
torch.save(agent.policy_net.state_dict(), 'test_model.pt')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.uint8'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.uint8'>. Please provide explicit dtype.[0m
episode: 0 average reward: 2.0 episode length: 121
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.uint8'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.uint8'>. Please provide explicit dtype.[0m
Run 30 episodes
Mean: 0.3
episode: 25 average reward: 0.3076923076923077 episode length: 23
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.uint8'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.uint8'>. Please provide explicit dtype.[0m
Run 30 episodes
Mean: 0.3
episode: 50 average reward: 0.4666666666666667 episode length: 23
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.uint8'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.B

KeyboardInterrupt: 

In [31]:
# Scratch cell: only evaluates a float literal; nothing else in the notebook uses it.
1.1e-6

1.1e-06