In [1]:
import numpy as np
import torch
import torch.multiprocessing as mp
import time
import copy 
from AC_modules.Networks import *
from AC_modules.AdvantageActorCritic import SharedAC, IndependentAC

from Utils import test_env

debug = False
queue = False

In [2]:
def build_AC(model_dict): # works
    if model_dict['shared']:
        return SharedAC(model_dict['model'], *model_dict['args'], **model_dict['kwargs'])
    else:
        return IndependentAC(model_dict['model'], *model_dict['args'], **model_dict['kwargs'])
        
def play_episode(agent, env, max_steps):

    # Start the episode
    state = env.reset()
    if debug: print("state.shape: ", state.shape)
    rewards = []
    log_probs = []
    distributions = []
    states = [state]
    done = []
    bootstrap = []
        
    steps = 0
    while True:
     
        action, log_prob, distrib = agent.get_action(state, return_log = True)
        new_state, reward, terminal, info = env.step(action)
        if debug: print("state.shape: ", new_state.shape)
        rewards.append(reward)
        log_probs.append(log_prob)
        distributions.append(distrib)
        states.append(new_state)
        done.append(terminal)
        
        # Still unclear how to retrieve max steps from the game itself
        if terminal is True and steps == max_steps:
            bootstrap.append(True)
        else:
            bootstrap.append(False) 
        
        if terminal is True:
            #print("steps: ", steps)
            #print("Bootstrap needed: ", bootstrap[-1])
            break
            
        state = new_state
        steps += 1
        
    rewards = np.array(rewards)
    states = np.array(states)
    if debug: print("states.shape: ", states.shape)
    done = np.array(done)
    bootstrap = np.array(bootstrap)

    return rewards, log_probs, distributions, np.array(states), done, bootstrap

def random_start(X=10, Y=10):
    s1, s2 = np.random.choice(X*Y, 2, replace=False)
    initial = [s1//X, s1%X]
    goal = [s2//X, s2%X]
    return initial, goal



In [3]:
def training_thread(global_model, model_dict, game_params, learning_rate, n_episodes, max_steps, random_init, rank):
    print("Entered process %d"%rank)
    #local_model = model_constructor.generate_model()
    #print("Constructed local model ")
    #print("model_dict: ", model_dict)
    #local_model = build_AC(model_dict)
    local_model = copy.deepcopy(global_model)
    print("Constructed local model ")
    local_model.load_state_dict(global_model.state_dict())
    print("Loaded state dictionary")
    optimizer = torch.optim.Adam(global_model.parameters(), lr=learning_rate)
    print("created optim")
    
    print("Process %d started"%rank)
    
    performance = []
    steps_to_solve = []
    for e in range(n_episodes):
        
        if random_init:
            # Change game params
            initial, goal = random_start(game_params["x"], game_params["y"])

            # All game parameters
            game_params["initial"] = initial
            game_params["goal"] = goal

        env = test_env.Sandbox(**game_params)
        rewards, log_probs, distributions, states, done, bootstrap = play_episode(local_model, env, max_steps)
        global_model.env_steps += len(rewards) # hope it works asynchronously - TO CHECK
        
        performance.append(np.sum(rewards))
        steps_to_solve.append(len(rewards))
        
        #if (e+1)%10 == 0:
        #    print("Episode %d of process %d - reward: %.2f - steps to solve: %.2f"%(e+1, rank, np.mean(performance[-10:]), np.mean(steps_to_solve[-10:])))
        print("Episode %d of process %d - reward: %.2f - steps to solve: %.2f"%(e+1, rank, performance[-1], steps_to_solve[-10:]))
        
        critic_loss, actor_loss, entropy = local_model.compute_ac_loss(rewards, log_probs, distributions, states, done, bootstrap)
        loss = critic_loss + actor_loss
        
        # Update global model and then copy back updated params to local model
        optimizer.zero_grad()
        loss.mean().backward()
        for global_param, local_param in zip(global_model.parameters(), local_model.parameters()):
            global_param._grad = local_param.grad
        optimizer.step()
        global_model.optim_steps += 1 # hope it works asynchronously - TO CHECK
        global_model.update_target()
        local_model.load_state_dict(global_model.state_dict())
        
    print("Training process {} reached maximum episode.".format(rank))
    
    


In [4]:
def test_thread(global_model, game_params, tot_episodes, max_steps, random_init, Q=None, episodes_per_test=10, test_every=4):
    print("Test process started")
    test_counter = 0
    max_tests = tot_episodes // test_every
    
    performance = []
    steps_to_solve = []
    critic_losses = [] 
    actor_losses = []
    entropies = []
    while True:
        if global_model.optim_steps > test_counter*test_every:
            test_counter +=1
            episode_reward = []
            episode_steps = []
            for e in range(episodes_per_test):
                if random_init:
                    # Change game params
                    initial, goal = random_start(game_params["x"], game_params["y"])

                    # All game parameters
                    game_params["initial"] = initial
                    game_params["goal"] = goal

                env = test_env.Sandbox(**game_params)
                rewards, log_probs, distributions, states, done, bootstrap = play_episode(agent, env, max_steps)
                episode_reward.append(np.sum(rewards))
                episode_steps.append(len(rewards))
                
                if e == 0:
                    critic_loss, actor_loss, entropy = agent.compute_ac_loss(rewards, log_probs, distributions, states, done, bootstrap)
            performance.append(np.mean(episode_reward))
            steps_to_solve.append(np.mean(episode_steps))
            print("Test %d - reward %.2f - steps to solve %.2f"%(test_counter, performance[-1], steps_to_solve[-1]))
            critic_losses.append(critic_loss)
            actor_losses.append(actor_loss)
            entropies.append(entropy)
        else:
            #time.sleep(1) # wait 1 sec
            pass
        if test_counter == max_tests:
            break
    #if queue:
    #    Q.put(performance, steps_to_solve, critic_losses, actor_losses, entropies) # TO CHECK

In [5]:
def train_sandbox(agent_constructor, learning_rate, game_params, n_training_threads=3, n_episodes=1000,
                  max_steps=120, return_agent=False, random_init=True):
    
    global_model = agent_constructor.generate_model()
    global_model.share_memory()
    
    model_dict = dict(model=agent_constructor.model,
                      shared=agent_constructor.shared,
                      args=agent_constructor.args,
                      kwargs=agent_constructor.kwargs)
    processes = []
    for rank in range(1, n_training_threads + 1):  # + 1 for test process
        if rank == 0:
            p = mp.Process(target=test_thread, args=(global_model, game_params, n_episodes*n_training_threads, 
                                                     max_steps, random_init,))
        else:
            #local_model = agent_constructor.generate_model()
            p = mp.Process(target=training_thread, args=(global_model, model_dict, game_params, 
                                                         learning_rate, n_episodes, max_steps, random_init, rank),)
        p.start()
        processes.append(p)
    print("All processes started")
    for p in processes:
        p.join()
    print("All processes finished")

    if return_agent:
        return global_model

In [6]:
from AC_modules.Constructor import *

In [7]:
relational_HPs = torch.load("Results/Sandbox/Supervised/best_HP_S_chosen-residual_UMUT")
relational_HPs

{'n_kernels': 36,
 'n_features': 256,
 'n_heads': 1,
 'n_attn_modules': 2,
 'feature_hidden_dim': 16,
 'feature_n_residuals': 1}

In [8]:
name = "GatedBoxWorldNet"
action_space = 4
#n_features = 16

HPs = dict(gamma=0.99, tau=0.3, n_steps=5, H=1e-3, **relational_HPs)
learning_rate = 1e-4

if torch.cuda.is_available():
    HPs['device'] = 'cuda'
else:
    HPs['device'] = 'cpu'
    
print("Using device "+HPs['device'])
agent_constructor = ActorCriticConstructor(name, True, action_space, **HPs)

Using device cpu
Model:  <class 'AC_modules.Networks.GatedBoxWorldNet'>
self.model:  <class 'AC_modules.Networks.GatedBoxWorldNet'>
self.shared:  True
self.args:  (4,)
self.kwargs:  {'gamma': 0.99, 'tau': 0.3, 'n_steps': 5, 'H': 0.001, 'n_kernels': 36, 'n_features': 256, 'n_heads': 1, 'n_attn_modules': 2, 'feature_hidden_dim': 16, 'feature_n_residuals': 1, 'device': 'cpu'}


In [9]:
# Variable parameters
X = 5
Y = 5
initial = [0,0]
goal = [4,4]
MAX_STEPS = 20

game_params = dict(x=X, y=Y, initial=initial, goal=goal, max_steps=MAX_STEPS, 
                   greyscale_state=True, return_ohe=True)

In [10]:
model = train_sandbox(agent_constructor, learning_rate, game_params, n_training_threads=2, n_episodes=100,
                  max_steps=MAX_STEPS, return_agent=True, random_init=True)

model:  <class 'AC_modules.Networks.GatedBoxWorldNet'>
action_space:  4
n_features:  256
HPs:  {'n_kernels': 36, 'n_heads': 1, 'n_attn_modules': 2, 'feature_hidden_dim': 16, 'feature_n_residuals': 1}
device:  cpu
params and buffers check
[Parameter containing:
tensor([[[[ 0.0628, -0.2637],
          [ 0.1979, -0.2660]],

         [[-0.0814,  0.1895],
          [ 0.2067, -0.1509]],

         [[-0.2547, -0.1768],
          [ 0.1381,  0.0704]]],


        [[[ 0.0357, -0.1480],
          [ 0.1125, -0.0944]],

         [[-0.2850, -0.2019],
          [ 0.1650, -0.2126]],

         [[-0.0084,  0.0121],
          [ 0.1339,  0.1854]]],


        [[[ 0.2623, -0.1416],
          [ 0.2881, -0.2124]],

         [[ 0.0634, -0.0315],
          [ 0.2399,  0.0823]],

         [[ 0.1186,  0.0663],
          [-0.1900,  0.0279]]],


        [[[ 0.1644,  0.0489],
          [ 0.0829, -0.1409]],

         [[ 0.1766,  0.0492],
          [-0.1040,  0.1378]],

         [[-0.1267, -0.2113],
          [-0.1801, -

Entered process 1
Entered process 2
All processes started
Constructed local model 
Constructed local model 
Loaded state dictionary
created optim
Process 2 started
Loaded state dictionary
created optim
state.shape:  (3, 7, 7)
Process 1 started
state.shape:  (3, 7, 7)
state.shape:  (3, 7, 7)
states.shape:  (2, 3, 7, 7)


Process Process-1:


state.shape:  (3, 7, 7)


Traceback (most recent call last):
  File "/home/nicola/anaconda3/envs/torch/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/nicola/anaconda3/envs/torch/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)


states.shape:  (2, 3, 7, 7)


Process Process-2:
Traceback (most recent call last):
  File "<ipython-input-3-186c9a8d6d31>", line 37, in training_thread
    print("Episode %d of process %d - reward: %.2f - steps to solve: %.2f"%(e+1, rank, performance[-1], steps_to_solve[-10:]))
TypeError: must be real number, not list
  File "/home/nicola/anaconda3/envs/torch/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/nicola/anaconda3/envs/torch/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-3-186c9a8d6d31>", line 37, in training_thread
    print("Episode %d of process %d - reward: %.2f - steps to solve: %.2f"%(e+1, rank, performance[-1], steps_to_solve[-10:]))
TypeError: must be real number, not list


All processes finished
