In [1]:
from collections import namedtuple
import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import TensorDataset, DataLoader

In [2]:
class rollouts_ac(object):
    """Rollout buffer pairing stored transitions with bootstrapped discounted returns.

    Transitions are first collected per episode in ``self.trajectory``. Calling
    ``push_to_memory_unfinished`` computes one return per transition and appends
    both the transitions and the returns to the long-lived ``rollout_memory`` /
    ``rollout_values`` lists, from which ``make_a_batch`` pops fixed-size batches.

    Args:
        batch_size: number of transitions removed from the front of the buffer
            by each ``make_a_batch`` call.
        discount: discount factor applied per step when computing returns.
    """

    def __init__(self, batch_size, discount):
        self.batch_size = batch_size
        self.rollout_memory = []
        self.rollout_values = []
        self.batch = []
        self.discount = discount
        self.transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'terminal'))
        # Start with an empty trajectory so push_to_trajectory() works even if
        # init_episode() has not been called yet.
        self.trajectory = []

    def init_episode(self):
        """Discard the current trajectory and start collecting a new one."""
        self.trajectory = []

    def push_to_trajectory(self, state, action, next_state, reward, terminal):
        """Append one transition to the trajectory of the current episode."""
        self.trajectory.append(self.transition(state, action, next_state, reward, terminal))

    def monte_carlo_unfinished(self, value_network):
        """Return the bootstrapped discounted return of every stored transition.

        For the transition at position ``t`` of a trajectory of length ``T`` the
        result is ``sum_k discount**k * reward[t+k] + discount**(T-t) * V(s_T)``,
        where ``V(s_T)`` is ``value_network.forward`` evaluated (without
        gradients) on the ``next_state`` of the last transition.

        The returns are accumulated backwards in a single pass. The bootstrap
        value is always added; the stored ``terminal`` field is not consulted.

        Args:
            value_network: object exposing ``forward(state)``.

        Returns:
            A list with one return per transition, in trajectory order; an empty
            list when the trajectory is empty.
        """
        if not self.trajectory:
            return []
        with torch.no_grad():
            running_return = value_network.forward(self.trajectory[-1].next_state)
        values = []
        for step in reversed(self.trajectory):
            running_return = step.reward + self.discount * running_return
            values.append(running_return)
        values.reverse()
        return values

    def push_to_memory_unfinished(self, value_network):
        """Move the current trajectory and its returns into the rollout buffers.

        The trajectory itself is not cleared; call ``init_episode`` before
        collecting the next one.
        """
        values = self.monte_carlo_unfinished(value_network)
        self.rollout_memory = self.rollout_memory + self.trajectory
        self.rollout_values = self.rollout_values + values

    def make_a_batch(self):
        """Pop the oldest ``batch_size`` transitions and return them as tensors.

        Every field is concatenated with ``torch.cat`` along dimension 0, so each
        stored item must already be a tensor.

        Returns:
            Tuple ``(state, action, reward, values, new_state, terminal)``.
        """
        size = self.batch_size
        batch = self.transition(*zip(*self.rollout_memory[:size]))
        self.rollout_memory = self.rollout_memory[size:]
        values = torch.cat(self.rollout_values[:size])
        self.rollout_values = self.rollout_values[size:]
        terminal = torch.cat(batch.terminal)
        state = torch.cat(batch.state)
        action = torch.cat(batch.action)
        reward = torch.cat(batch.reward)
        new_state = torch.cat(batch.next_state)
        return state, action, reward, values, new_state, terminal

In [3]:
class deterministic_policy(nn.Module):
    """Fully connected policy head that outputs action probabilities.

    The network is a stack of ``Linear -> ReLU`` pairs, one per entry of
    ``HIDDEN_LIST``, followed by a final ``Linear`` layer of width
    ``ACTION_DIM``. Despite the class name, ``forward`` returns a softmax
    distribution over the last dimension rather than a single action.
    """

    def __init__(self, STATE_DIM, HIDDEN_LIST, ACTION_DIM):
        super().__init__()
        widths = [STATE_DIM, *HIDDEN_LIST]
        stack = []
        # Layers are created front to back so parameter initialisation order
        # matches the order in which they are applied.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        stack.append(nn.Linear(widths[-1], ACTION_DIM))
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Run ``x`` through every layer and normalise the result with softmax."""
        logits = x
        for module in self.layers:
            logits = module(logits)
        return F.softmax(logits, dim=-1)
        
        
class neural_net(nn.Module):
    """Plain multilayer perceptron with ReLU activations between hidden layers.

    One ``Linear -> ReLU`` pair is created per entry of ``HIDDEN_LIST``; the
    last layer is a ``Linear`` of width ``OUTPUT_DIM`` with no activation.
    """

    def __init__(self, STATE_DIM, HIDDEN_LIST, OUTPUT_DIM):
        super().__init__()
        widths = [STATE_DIM, *HIDDEN_LIST]
        stack = []
        # Build the hidden stack in application order, then the output layer.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        stack.append(nn.Linear(widths[-1], OUTPUT_DIM))
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Apply every layer in order and return the raw output of the last one."""
        out = x
        for module in self.layers:
            out = module(out)
        return out
    

class conditioned_transition_net(nn.Module):
    """MLP that maps a (state, action) pair to a vector of size ``STATE_DIM``.

    The action is appended to the state before the first layer:

    * ``ACTION_DIM == 2``: the action tensor is concatenated as is, as one
      extra column.
    * otherwise: the action indices are one-hot encoded into ``ACTION_DIM``
      extra columns.

    The input width of the first layer always includes the action columns,
    also when ``HIDDEN_LIST`` is empty.
    """

    def __init__(self, STATE_DIM, HIDDEN_LIST, ACTION_DIM):
        super().__init__()
        self.action_dim = ACTION_DIM
        self.layers = nn.ModuleList()
        # Width contributed by the action: a single raw column for the binary
        # case, a one-hot vector otherwise (mirrors forward()).
        action_width = 1 if ACTION_DIM == 2 else ACTION_DIM
        prev_layer = STATE_DIM + action_width
        for layer in HIDDEN_LIST:
            self.layers.append(nn.Linear(prev_layer, layer))
            self.layers.append(nn.ReLU())
            prev_layer = layer
        self.layers.append(nn.Linear(prev_layer, STATE_DIM))

    def forward(self, x, a):
        """Concatenate the action encoding to ``x`` and run the MLP.

        Args:
            x: 2-D state batch; dimension 0 is the batch dimension.
            a: action batch with one column. In the binary case it is
                concatenated directly; otherwise it is converted to int64
                indices for the one-hot scatter.

        Returns:
            Tensor produced by the final ``Linear`` layer (width ``STATE_DIM``).
        """
        if self.action_dim == 2:
            x = torch.cat((x, a), 1)
        else:
            # as_tensor converts without the copy-construct warning that
            # torch.tensor(tensor) raises, and keeps the index on x's device.
            index = torch.as_tensor(a, dtype=torch.int64, device=x.device)
            one_hot = torch.zeros([x.size(0), self.action_dim], dtype=torch.float32, device=x.device)
            one_hot = one_hot.scatter(1, index, 1)
            x = torch.cat((x, one_hot), 1)
        for layer in self.layers:
            x = layer(x)
        return x
    

class conditioned_q_net(nn.Module):
    """MLP that maps a (state, action) pair to a single scalar output.

    The action is appended to the state before the first layer:

    * ``ACTION_DIM == 2``: the action tensor is concatenated as is, as one
      extra column.
    * otherwise: the action indices are one-hot encoded into ``ACTION_DIM``
      extra columns.

    The input width of the first layer always includes the action columns,
    also when ``HIDDEN_LIST`` is empty.
    """

    def __init__(self, STATE_DIM, HIDDEN_LIST, ACTION_DIM):
        super().__init__()
        self.action_dim = ACTION_DIM
        self.layers = nn.ModuleList()
        # Width contributed by the action: a single raw column for the binary
        # case, a one-hot vector otherwise (mirrors forward()).
        action_width = 1 if ACTION_DIM == 2 else ACTION_DIM
        prev_layer = STATE_DIM + action_width
        for layer in HIDDEN_LIST:
            self.layers.append(nn.Linear(prev_layer, layer))
            self.layers.append(nn.ReLU())
            prev_layer = layer
        self.layers.append(nn.Linear(prev_layer, 1))

    def forward(self, x, a):
        """Concatenate the action encoding to ``x`` and run the MLP.

        Args:
            x: 2-D state batch; dimension 0 is the batch dimension.
            a: action batch with one column. In the binary case it is
                concatenated directly; otherwise it is converted to int64
                indices for the one-hot scatter.

        Returns:
            Tensor with one output column per batch row.
        """
        if self.action_dim == 2:
            x = torch.cat((x, a), 1)
        else:
            # as_tensor converts without the copy-construct warning that
            # torch.tensor(tensor) raises, and keeps the index on x's device.
            index = torch.as_tensor(a, dtype=torch.int64, device=x.device)
            one_hot = torch.zeros([x.size(0), self.action_dim], dtype=torch.float32, device=x.device)
            one_hot = one_hot.scatter(1, index, 1)
            x = torch.cat((x, one_hot), 1)

        for layer in self.layers:
            x = layer(x)
        return x
        
class VAE(nn.Module):
    """Single-hidden-layer variational autoencoder.

    Encoder: ``state -> hidden`` (ReLU), then two linear heads producing the
    latent mean and log-variance. Decoder: ``latent -> hidden`` (ReLU), then
    ``hidden -> state`` followed by a sigmoid.

    Args:
        state: width of the input and of the reconstruction.
        hidden: width of the hidden layer in encoder and decoder.
        latent: width of the latent code; also stored as ``self.latent``.
    """

    def __init__(self, state, hidden, latent):
        super().__init__()
        self.e1 = nn.Linear(state, hidden)
        self.e21 = nn.Linear(hidden, latent)
        self.e22 = nn.Linear(hidden, latent)
        self.d1 = nn.Linear(latent, hidden)
        self.d2 = nn.Linear(hidden, state)
        self.latent = latent

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        out = F.relu(self.e1(x))
        mu = self.e21(out)
        logvar = self.e22(out)
        return mu, logvar

    def reparametrize(self, mu, logvar):
        """Sample ``z = mu + eps * exp(0.5 * logvar)`` with ``eps ~ N(0, 1)``.

        The noise is drawn with ``randn_like`` so it lives on the same device
        and has the same dtype as ``mu``.
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(mu)
        return mu + eps * std

    def decode(self, z):
        """Map a latent code to a reconstruction with values in (0, 1)."""
        out = F.relu(self.d1(z))
        return torch.sigmoid(self.d2(out))

    def forward(self, x, encoding_only=False, training=True):
        """Encode ``x`` and optionally reconstruct it.

        Args:
            x: input batch.
            encoding_only: when true, return only the latent mean. No noise is
                sampled and the decoder is not evaluated on this path.
            training: when true the decoder input is a sampled latent code;
                otherwise the latent mean is decoded directly.

        Returns:
            ``mu`` when ``encoding_only`` is true, else
            ``(decoded, mu, logvar)``.
        """
        mu, logvar = self.encode(x)
        if encoding_only:
            # The caller only wants the code: skip sampling and decoding.
            return mu
        z = self.reparametrize(mu, logvar) if training else mu
        return self.decode(z), mu, logvar
        
def vae_loss(decoded, x, mu, logvar, beta=1):
    """Return the beta-weighted VAE objective for one batch.

    The loss is the sum of

    * the binary cross-entropy between ``decoded`` and ``x``, summed over all
      elements, and
    * ``beta`` times the KL term
      ``-0.5 * sum(1 + logvar - mu**2 - exp(logvar))``.

    Args:
        decoded: reconstruction with values in [0, 1].
        x: target with the same shape as ``decoded``.
        mu: latent means.
        logvar: latent log-variances.
        beta: weight of the KL term.

    Returns:
        Scalar tensor holding the total loss.
    """
    # reduction='sum' replaces the deprecated size_average=False argument.
    reconstruction = F.binary_cross_entropy(decoded, x, reduction='sum')
    kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return reconstruction + beta * kl_divergence

In [6]:
class ac(object):
    """Actor-critic agent that acts on stacked latent encodings of game frames.

    Each raw observation is preprocessed into a binary image, encoded with
    ``encoder`` and concatenated with the three previous encodings to form the
    state fed to ``policy_net`` and ``value_net``. Transitions are stored in a
    ``rollouts_ac`` buffer and both networks are trained with RMSprop.

    Args:
        OBSERVATION_SIZE: stored on the instance; not otherwise used here.
        REPRESENTATION_SIZE: size of one latent encoding; used to create the
            zero-filled history at the start of an episode.
        ACTION_SIZE: number of policy outputs to sample from.
        BATCH_SIZE: batch size handed to the rollout buffer.
        DISCOUNT: discount factor handed to the rollout buffer.
        clip: maximum gradient norm for both networks.
        learning_rate: RMSprop learning rate for both optimisers.
        policy_net: module returning action probabilities for a state batch.
        value_net: module returning one value per state.
        encoder: module called as ``forward(x, True, False)`` and exposing a
            ``latent`` attribute.
        entropy_coef: stored on the instance; the entropy bonus is currently
            not part of the loss.
    """

    # Policy index k is sent to the environment as action k + ACTION_OFFSET.
    ACTION_OFFSET = 2
    # Number of steps after a scored point (reward of +1 or -1) during which
    # transitions are not recorded.
    SKIPPED_STEPS_AFTER_POINT = 18

    def __init__(self, OBSERVATION_SIZE, REPRESENTATION_SIZE, ACTION_SIZE, BATCH_SIZE, DISCOUNT, clip, learning_rate, policy_net, value_net, encoder, entropy_coef):
        self.cum_rewards = 0
        self.memory = rollouts_ac(BATCH_SIZE, DISCOUNT)
        self.policy_net = policy_net
        self.value_net = value_net
        self.val_loss = torch.nn.L1Loss()
        self.OBSERVATION_SIZE = OBSERVATION_SIZE
        self.REPRESENTATION_SIZE = REPRESENTATION_SIZE
        self.ACTION_SIZE = ACTION_SIZE
        self.clip = clip
        self.steps = 0
        self.o_p = optim.RMSprop(self.policy_net.parameters(), lr=learning_rate)
        self.o_v = optim.RMSprop(self.value_net.parameters(), lr=learning_rate)
        self.encoder = encoder
        self.entropy_coef = entropy_coef

    def entropy(self, p_matrix):
        """Return the mean row-wise entropy of a matrix of probabilities.

        Probabilities are clamped to the smallest positive normal float before
        the logarithm so that exact zeros contribute 0 instead of NaN.
        """
        tiny = torch.finfo(p_matrix.dtype).tiny
        log_probs = torch.log(p_matrix.clamp_min(tiny))
        entropy = torch.sum(-p_matrix * log_probs, 1)
        return torch.mean(entropy)

    def train_networks(self):
        """Pop one batch from memory and update the policy and value networks.

        The policy is trained on ``mean(advantage * -log pi(a|s))`` where the
        advantage is the stored return minus the detached critic value. The
        critic is trained with an L1 loss against the stored return. Gradients
        of both networks are clipped to ``self.clip``.
        """
        state_batch, action_batch, _, value_batch, _, _ = self.memory.make_a_batch()
        # FORWARD
        probs = self.policy_net.forward(state_batch)
        action_index = (action_batch - self.ACTION_OFFSET).to(torch.int64)
        # Gather before taking the log: the log of an unselected probability
        # that underflowed to 0 would otherwise put NaN into the gradient.
        log_probs = torch.log(probs.gather(1, action_index))
        critic_values = self.value_net.forward(state_batch)
        # TRAIN POLICY
        self.o_p.zero_grad()
        advantage = value_batch - critic_values.detach()
        loss_p = torch.mean(advantage * -log_probs, dim=0)
        loss_p.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.clip)
        self.o_p.step()
        # TRAIN VALUE
        self.o_v.zero_grad()
        loss_v = self.val_loss(critic_values, value_batch)
        loss_v.backward()
        torch.nn.utils.clip_grad_norm_(self.value_net.parameters(), self.clip)
        self.o_v.step()

    def _encode_observation(self, observation):
        """Preprocess one raw frame and return the encoder output for it."""
        pixels = torch.tensor(self.preprocess(observation), dtype=torch.float32).reshape(1, -1)
        with torch.no_grad():
            return self.encoder.forward(pixels, True, False).detach()

    def experiment_training(self, env, episodes, scheduler1, scheduler2):
        """Interact with ``env`` for ``episodes`` episodes, training online.

        The environment is used through the classic gym interface:
        ``reset()`` returns the observation and ``step(action)`` returns the
        4-tuple ``(observation, reward, terminal, info)``.

        A training step, followed by one step of each scheduler, runs whenever
        the rollout buffer holds more than one batch. The buffer is filled at
        the end of every episode.

        Args:
            env: environment to act in.
            episodes: number of episodes to play.
            scheduler1: scheduler stepped after each training step.
            scheduler2: scheduler stepped after each training step.

        Returns:
            List with the undiscounted reward of every episode.
        """
        results = []
        latent_shape = (1, self.encoder.latent)
        # Deliberately initialised once, not per episode: the skip counter
        # carries over from the last point of the previous episode.
        memory_idx = self.SKIPPED_STEPS_AFTER_POINT
        for i in range(episodes):
            observation = env.reset()
            self.memory.init_episode()
            episode_reward = 0
            steps_ = 0
            curr_observation = self._encode_observation(observation)
            # No history exists at the start of an episode: pad with zeros.
            prev_observation1 = torch.zeros(self.REPRESENTATION_SIZE).reshape(latent_shape)
            prev_observation2 = torch.zeros(self.REPRESENTATION_SIZE).reshape(latent_shape)
            prev_observation3 = torch.zeros(self.REPRESENTATION_SIZE).reshape(latent_shape)
            state = torch.cat([curr_observation, prev_observation1, prev_observation2, prev_observation3], 1)
            while True:
                with torch.no_grad():
                    action_probs = self.policy_net.forward(state).detach().cpu().numpy()
                policy_index = np.random.choice(self.ACTION_SIZE, p=action_probs.flatten())
                action = torch.tensor(policy_index, dtype=torch.float32).reshape(1, 1) + self.ACTION_OFFSET
                observation, reward, terminal, _ = env.step(int(action.item()))
                episode_reward += reward
                steps_ += 1
                new_observation = self._encode_observation(observation)
                new_state = torch.cat([new_observation, curr_observation, prev_observation1, prev_observation2], 1)
                # Stored as a continuation mask: 0 on the terminal step, 1 otherwise.
                done = torch.zeros(1, dtype=torch.float32) if terminal else torch.ones(1, dtype=torch.float32)
                reward = torch.tensor([reward], dtype=torch.float32).reshape(1, 1)
                if reward.item() in (1.0, -1.0):
                    memory_idx = 0
                if memory_idx > self.SKIPPED_STEPS_AFTER_POINT or memory_idx == 0:
                    self.memory.push_to_trajectory(state, action, new_state, reward, done)
                if len(self.memory.rollout_memory) > self.memory.batch_size:
                    self.train_networks()
                    scheduler1.step()
                    scheduler2.step()
                state = new_state
                prev_observation3 = prev_observation2
                prev_observation2 = prev_observation1
                prev_observation1 = curr_observation
                curr_observation = new_observation
                memory_idx += 1
                if terminal:
                    results.append(episode_reward)
                    self.cum_rewards += episode_reward
                    self.memory.push_to_memory_unfinished(self.value_net)
                    print("\rEp: {} Online reward: {:.2f}; Steps: {}".format(i + 1, episode_reward, steps_), end="")
                    break
        return results

    def preprocess(self, state):
        """Turn a raw frame into a flat binary float vector.

        Rows 35-194 are kept, every second row and column of channel 0 is
        taken, the values 144 and 109 are set to 0 and everything else to 1.
        The result is passed through ``enlarge_ball`` and flattened.

        The input array is not modified; the crop is copied before it is
        binarised.
        """
        frame = np.array(state[35:195:2, ::2, 0])
        frame[(frame == 144) | (frame == 109)] = 0
        frame[frame != 0] = 1
        frame = self.enlarge_ball(frame)
        # np.float was an alias of the builtin float (float64) and has been
        # removed from NumPy; use the explicit dtype.
        return frame.astype(np.float64).ravel()

    def enlarge_ball(self, image):
        """Return a copy of an 80x80 binary image with the ball thickened.

        A pixel is treated as the top of the ball when it is set, its left and
        right neighbours are clear and the pixel below it is set. Around each
        such pixel at ``(i, j)`` the columns ``j-1 .. j+1`` are set for the
        rows ``i-2 .. i+3``, clipped to the image. The scan runs top to bottom
        and left to right and sees its own earlier writes.
        """
        image = np.copy(image)
        for i in range(1, 79):
            for j in range(1, 79):
                is_ball_top = (
                    image[i, j] == 1
                    and image[i, j + 1] == 0
                    and image[i, j - 1] == 0
                    and image[i + 1, j] == 1
                )
                if is_ball_top:
                    image[max(i - 2, 0):min(i + 4, 80), j - 1:j + 2] = 1
        return image

In [None]:
# --- Configuration: everything a reader might tune ---------------------------
SEED = 6
LATENT = 16
VAE_HIDDEN = 512
HIDDEN_AC = [512]
EPOCHS_VAE = 50
EPISODES_TO_GATHER = 10
BATCH_SIZE = 512
OBSERVATION_SIZE = 6400
ACTION_SIZE = 2
LR = 0.001
EPISODES_AC = 1000
ENTROPY_COEF = 0.01
DISCOUNT = 0.99
GRAD_CLIP = 1
# The agent state is the current latent encoding plus the three previous ones.
FRAME_STACK = 4
VAE_WEIGHTS_PATH = 'VAE_MLP_512_16'
RESULTS_PATH = 'AC_pong1.csv'

# --- Reproducibility ---------------------------------------------------------
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# --- Pre-trained encoder -----------------------------------------------------
encoder = VAE(OBSERVATION_SIZE, VAE_HIDDEN, LATENT)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
encoder.load_state_dict(torch.load(VAE_WEIGHTS_PATH))

# --- Environment -------------------------------------------------------------
env = gym.make("PongDeterministic-v4")
env.frameskip = 4

# --- Agent and training ------------------------------------------------------
policy_net = deterministic_policy(FRAME_STACK * LATENT, HIDDEN_AC, ACTION_SIZE)
value_net = neural_net(FRAME_STACK * LATENT, HIDDEN_AC, 1)
ac_agent = ac(OBSERVATION_SIZE, LATENT, ACTION_SIZE, BATCH_SIZE, DISCOUNT, GRAD_CLIP, LR, policy_net, value_net, encoder, ENTROPY_COEF)
scheduler1 = optim.lr_scheduler.StepLR(ac_agent.o_p, step_size=10, gamma=0.99)
scheduler2 = optim.lr_scheduler.StepLR(ac_agent.o_v, step_size=10, gamma=0.99)
results = ac_agent.experiment_training(env, EPISODES_AC, scheduler1, scheduler2)
results = np.array(results)
np.savetxt(RESULTS_PATH, results, delimiter=',')

Ep: 2 Online reward: -21.00; Steps: 764



Ep: 4 Online reward: -21.00; Steps: 7642