In [1]:
import minerl
import gym
import argparse
import torch
import torch.optim as optim
import torch.nn as nn
from torch.distributions import Categorical
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
import numpy as np
import yaml
import os
from copy import deepcopy
import argparse



In [2]:
from wrapper.framestack import FrameBuffer
from wrapper.preprocess import PreprocessAtari

def make_env():
    env = gym.make("MineRLTreechop-v0")
    return env


In [3]:
def make_11action(self, env, action_index):
    # Action들을 정의
    action = env.action_space.noop()
    # Cameras
    if (action_index == 0):
        action['camera'] = [0, 0]
    elif (action_index == 1):
        action['camera'] = [0, -5]
    elif (action_index == 2):
        action['camera'] = [0, 5]
    elif (action_index == 3):
        action['camera'] = [-5, 0]
    elif (action_index == 4):
        action['camera'] = [5, 0]

    # Forwards
    elif (action_index == 5):
        action['forward'] = 0
    elif (action_index == 6):
        action['forward'] = 1

    # Jump
    elif (action_index == 7):
        action['jump'] = 0
    elif (action_index == 8):
        action['jump'] = 1

    # Attack 
    elif (action_index == 9):
        action['attack'] = 0
    elif (action_index == 10):
        action['attack'] = 1

def save_model(self, model):
        torch.save({'model_state_dict': model.state_dict()}, './PPO.pth')
        print("model saved")

In [4]:
def converter(observation):
    obs = observation['pov']
    obs = obs / 255.0
    obs = torch.from_numpy(obs)
    obs = obs.permute(2, 0, 1)
    return obs.float().cuda()


In [5]:
#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 3
T_horizon     = 20

class PPO(nn.Module):
    def __init__(self, num_actions):
        super(PPO, self).__init__()
        self.num_actions = num_actions
        self.data = []
        
        self.conv_layers = nn.Sequential(
        nn.Conv2d(4, 32, kernel_size=8, stride=4),
        nn.BatchNorm2d(32),
        nn.LeakyReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.BatchNorm2d(64),
        nn.LeakyReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.BatchNorm2d(64),
        nn.LeakyReLU(),
        )

        def conv2d_size_out(size, kernel_size=3, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1

        conv_size = conv2d_size_out(64, 8, 4)
        conv_size = conv2d_size_out(conv_size, 4, 2)
        conv_size = conv2d_size_out(conv_size, 3, 1)
        linear_input_size = conv_size * conv_size * 64 # 4 x 4 x 64 = 1024

        self.fc_pi = nn.Linear(linear_input_size, self.num_actions)
        self.fc_v = nn.Linear(linear_input_size, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim=0):
        if len(x.shape) < 4:
          x = x.unsqueeze(0)
        x = self.conv_layers(x)
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob
    
    def v(self, x):
        if len(x.shape) < 4:
          x = x.unsqueeze(0)
        x = self.conv_layers(x)
        v = self.fc_v(x)
        return v
      
    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
        for transition in self.data:
            s, a, r, s_prime, prob_a, done = transition
            
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_mask = 0 if done else 1
            done_lst.append([done_mask])
            
        s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
                                          torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
                                          torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
        self.data = []
        return s, a, r, s_prime, done_mask, prob_a
        
    def train_net(self):
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()

        for i in range(K_epoch):
            td_target = r + gamma * self.v(s_prime) * done_mask
            delta = td_target - self.v(s)
            delta = delta.detach().numpy()

            advantage_lst = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)

            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1,a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))  # a/b == exp(log(a)-log(b))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach())

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionScore(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(AttentionScore, self).__init__()
        self.linear1 = nn.Linear(input_dim, 256)
        self.W = nn.Linear(hidden_dim, 256, bias=False)
        self.Z = 1.0
        self.linear2 = nn.Linear(256, output_dim)

    def forward(self, v, h):
        x = F.tanh(self.linear1(v) + self.W(h))
        out = torch.exp((self.linear2(x)))/ self.Z
        return out

class SoftAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SoftAttention, self).__init__()
        self.attention_score = AttentionScore(input_dim, hidden_dim, output_dim)
    
    def forward(self, v, h):
        print(v.shape)
        print(h.shape)
        scores = self.attention_score(v, h)
        print(scores.shape)
        z = torch.matmul(v.T, scores)
        return z.T

v = torch.randn(4, 6)
h = torch.randn(4, 8)
softattention = SoftAttention(6, 8, 1)
context = softattention(v, h)
print(v)
print(context)
print(context.shape)




torch.Size([4, 6])
torch.Size([4, 8])
torch.Size([4, 1])
tensor([[ 0.0705, -2.2701, -0.7842,  1.2794, -0.1607,  0.2796],
        [-0.8574,  0.6490, -1.6332,  0.5814, -1.5433,  0.2243],
        [ 1.6867,  0.5200,  2.3955,  0.2559, -0.3255,  0.1514],
        [ 0.3080, -2.0061, -0.9902, -1.2209, -0.3632,  1.3364]])
tensor([[ 1.2522, -3.5641, -1.2512,  0.1079, -2.2517,  2.3826]],
       grad_fn=<PermuteBackward0>)
torch.Size([1, 6])




In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
total_episodes = 10
print_interval = 20
env = gym.make("MineRLTreechop-v0")
model = PPO(num_actions=11)


for n_epi in range(total_episodes):
    score = 0.0
    s = env.reset()
    done = False
    while not done:
        for t in range(T_horizon):
            prob = model.pi(converter(s))
            m = Categorical(prob)
            a = m.sample().item()
            action = make_11action(env, a)
            s_prime, r, done, info = env.step(action)

            model.put_data((s, a, r, s_prime, prob[a].item(), done))
            s = s_prime
            score += r
            if done:
                print("# of episode :{}, score : {:.1f}".format(n_epi, score))
                break

        model.train_net()

    if n_epi%print_interval==0 and n_epi!=0:
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
        score = 0.0

env.close()

save_model(model)
   

UnicodeDecodeError, switching to default encoding
Exception in thread Thread-3:
Traceback (most recent call last):
  File "c:\Users\ye200\anaconda3\envs\minerl\lib\site-packages\minerl\env\malmo.py", line 562, in log_to_file
    linestr = line.decode(mine_log_encoding)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 51: invalid start byte

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\ye200\anaconda3\envs\minerl\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "c:\Users\ye200\anaconda3\envs\minerl\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\ye200\anaconda3\envs\minerl\lib\site-packages\minerl\env\malmo.py", line 566, in log_to_file
    linestr = line.decode(mine_log_encoding)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 51: invalid start byte



RuntimeError: Given groups=1, weight of size [32, 4, 8, 8], expected input[1, 3, 64, 64] to have 4 channels, but got 3 channels instead