# Unity ML-Agents BC(Behavioral Cloning) 구현 코드

## 라이브러리 불러오기

In [1]:
import datetime
import os
import platform
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import BufferKey, ObservationKeyPrefix

## BC 학습 옵션 값 설정

In [2]:
# Network input / output sizes used when the Actor is built.
# NOTE(review): 12*4 presumably means 12 observation values x 4 stacked
# frames -- confirm against the Unity agent's sensor settings.
state_size = 12*4
action_size = 1

# Run switches: restore a saved checkpoint / execute the training cell.
load_model = False
train_mode = True

# Optimisation hyperparameters.
batch_size = 128
discount_factor = 0.9  # discount applied when computing demonstration returns
learning_rate = 3e-4

# Number of training epochs and number of environment steps for the test run.
train_epoch = 500
test_step = 10000

# How often (in epochs) to log the loss and to write a checkpoint.
print_interval = 10
save_interval = 100

# Seed every RNG the notebook relies on so weight initialisation and
# mini-batch shuffling are reproducible after "Restart & Run All".
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

## Unity 환경 옵션 값 설정

In [3]:
# Pick the Unity build to launch based on the host operating system.
game = "Kart"
os_name = platform.system()
if os_name == "Windows":
    env_name = f"../envs/{game}_{os_name}/{game}"
elif os_name == "Darwin":
    # Alternative location for a packaged macOS build:
    #env_name = f"../envs/{game}_{os_name}"
    env_name = f"../../SimpleKartPrj/{game}/{game}"
else:
    # No build path is configured for this OS. Leaving env_name undefined
    # would make the test cell fail with a NameError; None instead tells
    # UnityEnvironment to connect to a running Unity Editor.
    env_name = None
    print(f"No {game} build configured for {os_name}; will connect to the Unity Editor instead.")

## 딥러닝 모델 저장 및 불러오기 옵션

In [4]:
# Stamp each run with its start time so checkpoints and TensorBoard logs
# from different runs land in separate directories.
date_time = format(datetime.datetime.now(), "%Y%m%d%H%M%S")

# Directory written by this run.
save_path = f"./saved_models/{game}/BC/{date_time}"

# Directory of the earlier run whose checkpoint is restored when load_model is set.
load_path = f"./saved_models/{game}/BC/20221117230105"

## Demonstration 경로

In [5]:
# Location of the recorded demonstration file. The default is specific to the
# original author's machine; set the KART_DEMO_PATH environment variable to
# point at a different .demo file without editing the notebook.
demo_path = os.environ.get(
    "KART_DEMO_PATH",
    "/Users/a08967/SimpleKartPrj/Kart/Demonstrations/Kart.demo",
)

## 연산 장치(GPU|CPU)

In [6]:
# Compute on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## 딥러닝 모델 Class 구현

In [7]:
class Actor(torch.nn.Module):
    """Policy network mapping an observation vector to a continuous action.

    Two 128-unit ReLU hidden layers are followed by a tanh output layer, so
    every action component lies within [-1, 1]. The input and output widths
    are taken from the module-level ``state_size`` and ``action_size``
    settings at construction time.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_size, 128)
        self.fc2 = torch.nn.Linear(128, 128)
        self.mu = torch.nn.Linear(128, action_size)

    def forward(self, state):
        """Compute the action for ``state``.

        Args:
            state: Float tensor whose last dimension equals ``state_size``.

        Returns:
            Tensor with the same leading dimensions as ``state`` and a last
            dimension of ``action_size``, squashed by tanh.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))

        # torch.tanh is numerically identical to F.tanh, which older PyTorch
        # releases flag as deprecated.
        return torch.tanh(self.mu(x))


## BC 에이전트 Class 구현

In [8]:
class BCAgent:
    """Behavioral-cloning agent: an Actor fitted by regression on demonstration actions.

    Construction reads the module-level settings ``device``, ``learning_rate``,
    ``save_path``, ``load_model`` and ``load_path``.
    """

    def __init__(self):
        self.actor = Actor().to(device)
        self.optimizer = torch.optim.Adam(self.actor.parameters(), lr=learning_rate)
        self.writer = SummaryWriter(save_path)

        # Optionally resume both the network weights and the optimizer state.
        if load_model:
            print(f"... Load Model from {load_path}/ckpt")
            checkpoint = torch.load(load_path+"/ckpt", map_location=device)
            self.actor.load_state_dict(checkpoint["actor"])
            self.optimizer.load_state_dict(checkpoint["optimizer"])

    def get_action(self, state, training=True):
        """Let the Actor network choose an action for ``state``.

        Args:
            state: Observation(s) in any form ``torch.as_tensor`` accepts
                (for example a NumPy array).
            training: Put the network in train mode (True) or eval mode (False)
                before the forward pass.

        Returns:
            The network output as a NumPy array.
        """
        # Select the network mode.
        self.actor.train(training)

        # Pure inference: no autograd graph is needed for the forward pass.
        with torch.no_grad():
            obs = torch.as_tensor(state, dtype=torch.float32, device=device)
            action = self.actor(obs).cpu().numpy()
        return action

    def train_model(self, state, action):
        """Run one epoch of behavioral-cloning updates over the dataset.

        The samples are shuffled and split into mini-batches of at most
        ``batch_size``; each batch takes one optimizer step on the mean
        squared error between predicted and demonstrated actions.

        Args:
            state: Tensor of demonstration observations, one row per sample.
            action: Tensor of demonstrated actions aligned with ``state``.

        Returns:
            Mean of the per-batch losses for this epoch.
        """
        # get_action(training=False) leaves the network in eval mode, so
        # switch back to train mode explicitly before updating.
        self.actor.train()

        losses = []
        num_samples = len(state)
        shuffled = torch.randperm(num_samples)
        for start in range(0, num_samples, batch_size):
            batch_idx = shuffled[start:start + batch_size]
            _state = state[batch_idx]
            _action = action[batch_idx]

            action_pred = self.actor(_state)
            # mse_loss takes (input, target) and already reduces to a scalar mean.
            loss = F.mse_loss(action_pred, _action)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            losses.append(loss.item())

        return np.mean(losses)

    def save_model(self):
        """Write the actor and optimizer state to ``save_path``/ckpt."""
        print(f"... Save Model to {save_path}/ckpt...")
        # Do not depend on SummaryWriter having created the directory already.
        os.makedirs(save_path, exist_ok=True)
        torch.save({
            "actor" : self.actor.state_dict(),
            "optimizer" : self.optimizer.state_dict(),
        }, save_path+"/ckpt")

    def write_summary(self, loss, epoch):
        """Log the training loss for ``epoch`` to TensorBoard."""
        self.writer.add_scalar("model/loss", loss, epoch)

## Main 학습 프로세스

In [9]:
# Instantiate the behavioral-cloning agent (BCAgent).
agent = BCAgent()

if train_mode:
    # Load the recorded demonstration into a buffer.
    behavior_spec, demo_buffer = demo_to_buffer(demo_path, 1)
    print(demo_buffer._fields.keys())

    def demo_to_tensor(key):
        """Return demonstration field ``key`` as a float32 tensor on ``device``."""
        # Stack into a single ndarray first: building a tensor directly from
        # a list of arrays is far slower.
        return torch.as_tensor(np.asarray(demo_buffer[key], dtype=np.float32), device=device)

    state = demo_to_tensor((ObservationKeyPrefix.OBSERVATION, 0))
    action = demo_to_tensor(BufferKey.CONTINUOUS_ACTION)
    reward = demo_to_tensor(BufferKey.ENVIRONMENT_REWARDS)
    done = demo_to_tensor(BufferKey.DONE)

    # Discounted return of every step, accumulated backwards in time. The
    # (1 - done[t]) factor stops the return from leaking across a step
    # flagged as done.
    ret = reward.clone()
    for t in reversed(range(len(ret) -1)):
        ret[t] += (1. - done[t]) * (discount_factor * ret[t+1])

    # Train only on (state, action) pairs whose return is greater than 0.
    keep = ret > 0
    state, action = state[keep], action[keep]
    # With nothing left, every epoch would silently report a NaN loss.
    assert len(state) > 0, "no demonstration step has a positive return; nothing to train on"

    losses = []
    for epoch in range(1, train_epoch+1):
        loss = agent.train_model(state, action)
        losses.append(loss)

        if epoch % print_interval == 0:
            mean_loss = np.mean(losses)
            agent.write_summary(mean_loss, epoch)
            losses = []

            print(f"{epoch} Epoch / Loss: {mean_loss:.8f}")

        if epoch % save_interval == 0:
            agent.save_model()

    # Push buffered TensorBoard events to disk now that training is over.
    agent.writer.flush()


... Load Model from ./saved_models/Kart/BC/20221117230105/ckpt


## 학습된 모델 Unity 환경에서 테스트

In [10]:
# Start playing in the built environment.
print("PLAY START")

# Launch the Unity environment.
engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name, side_channels=[engine_configuration_channel])

# Everything after the launch runs under try/finally so the Unity process is
# closed even when the loop is interrupted (e.g. KeyboardInterrupt) or raises.
try:
    env.reset()

    # Look up the behavior to control and run the simulation at normal speed.
    behavior_name = list(env.behavior_specs.keys())[0]
    spec = env.behavior_specs[behavior_name]
    engine_configuration_channel.set_configuration_parameters(time_scale=1.0)
    dec, term = env.get_steps(behavior_name)

    # Start the test run.
    episode, score = 0, 0

    for step in range(test_step):
        # Act on the latest decision-step observation with the network in eval mode.
        state = dec.obs[0]
        action = agent.get_action(state, False)
        action_tuple = ActionTuple()
        action_tuple.add_continuous(action)
        env.set_actions(behavior_name, action_tuple)
        env.step()

        # A non-empty terminal-step batch marks the end of the episode.
        dec, term = env.get_steps(behavior_name)
        done = len(term.agent_id) > 0
        reward = term.reward if done else dec.reward
        next_state = term.obs[0] if done else dec.obs[0]
        score += reward[0]

        if done:
            episode += 1

            print(f"{episode} Episode / Step: {step} / Score {score:.2f} ")
            score = 0
finally:
    env.close()

PLAY START
[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-al



1 Episode / Step: 74 / Score -4.18 
2 Episode / Step: 139 / Score -2.91 
3 Episode / Step: 205 / Score -4.93 
4 Episode / Step: 271 / Score -4.94 
5 Episode / Step: 337 / Score -4.94 
6 Episode / Step: 403 / Score -4.94 
7 Episode / Step: 469 / Score -4.94 
8 Episode / Step: 535 / Score -4.94 
9 Episode / Step: 601 / Score -4.94 
10 Episode / Step: 667 / Score -4.94 
11 Episode / Step: 733 / Score -4.94 
12 Episode / Step: 799 / Score -4.94 
13 Episode / Step: 865 / Score -4.94 
14 Episode / Step: 931 / Score -4.94 
15 Episode / Step: 997 / Score -4.94 
16 Episode / Step: 1063 / Score -4.94 
17 Episode / Step: 1129 / Score -4.94 
18 Episode / Step: 1195 / Score -4.94 
19 Episode / Step: 1261 / Score -4.94 
20 Episode / Step: 1327 / Score -4.94 
21 Episode / Step: 1393 / Score -4.94 
22 Episode / Step: 1459 / Score -4.94 
23 Episode / Step: 1525 / Score -4.94 
24 Episode / Step: 1591 / Score -4.94 
25 Episode / Step: 1657 / Score -4.94 
26 Episode / Step: 1723 / Score -4.94 
27 Episode 

KeyboardInterrupt: 