# Architecture testing with supervised learning

In this notebook I want to check that the various architectures proposed for the actor and the critic work properly.

The setup is the following:
1. Generate a series of states s
2. Use optimal policy to associate them with an optimal action* a 
3. Train only the actor architecture with cross-entropy loss

*One could also use the optimal action probabilities, if they are known. However, as long as the actions are sampled from the optimal policy in an unbiased way, the sampled actions alone are enough to learn stochastic policies.

Since we already have an optimal policy implemented for the sandbox environment, and all the code for playing episodes is ready, we can build a training set from optimal trajectories and then sample individual (s,a) tuples from it.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from RelationalModule import ActorCritic, ControlActorCritic, CoordActorCritic, OheActorCritic
from RelationalModule import GatedActorCritic
from Utils import train_agent_sandbox as train
from Utils import test_env, utils, plot
from importlib import reload

In [None]:
# Re-import test_env so that edits made to the module are picked up in this session
reload(test_env)

# Generating data for supervised learning

In [None]:
def play_optimal(env):
    """
    Play one episode following the optimal policy provided by the environment.

    Every recorded state is the one the agent was in when the corresponding
    action was chosen, so that (states[i], actions[i]) is a valid (s, a) pair
    for supervised learning. The state reached by the last action is therefore
    not recorded, since no action is taken from it.

    Parameters
    ----------
    env: environment instance
        Must expose `reset(random_init=...)` returning the initial state,
        `get_optimal_action()` and `step(action)` returning
        `(new_state, reward, terminal, info)`

    Returns
    -------
    actions: list
        Optimal actions in the order in which they were taken
    states: list
        State in which each action was taken (same length as `actions`)
    """
    state = env.reset(random_init=False)

    actions = []
    states = []

    while True:
        # presumably the optimal action refers to the environment's current
        # state, i.e. the last one returned by reset/step -- TODO confirm
        action = env.get_optimal_action()

        # Pair the action with the state it was chosen in, not the one it leads to
        states.append(state)
        actions.append(action)

        state, _reward, terminal, _info = env.step(action)

        if terminal:
            break

    return actions, states

In [None]:
# Sandbox configuration: grid size, start and goal cells, step limit per episode
X, Y = 5, 5
initial = [0, 0]
goal = [2, 2]
MAX_STEPS = 100

# Keyword arguments used to build the Sandbox environment
game_params = {
    "x": X,
    "y": Y,
    "initial": initial,
    "goal": goal,
    "max_steps": MAX_STEPS,
    "greyscale_state": True,
    "return_ohe": True,
}

In [None]:
# Build a sandbox from game_params and play a single optimal episode as a quick check
env = test_env.Sandbox(**game_params)
actions, states = play_optimal(env)

In [None]:
def random_start(X=10, Y=10):
    """
    Draw two distinct random cells of a grid with X*Y cells.

    Two different flat indices are sampled without replacement in [0, X*Y)
    and each one is decoded as [index // X, index % X].

    NOTE(review): both coordinates are decoded using X, so for a non-square
    grid the first coordinate ranges over Y and the second over X -- confirm
    this matches the convention expected by the environment.

    Parameters
    ----------
    X: int
        First grid dimension
    Y: int
        Second grid dimension

    Returns
    -------
    initial: list of 2 ints
        Coordinates decoded from the first sampled index
    goal: list of 2 ints
        Coordinates decoded from the second sampled index
    """
    first_cell, second_cell = np.random.choice(X * Y, 2, replace=False)
    initial = list(divmod(first_cell, X))
    goal = list(divmod(second_cell, X))
    return initial, goal

In [None]:
def create_action_state_set(game_params, size = 10000):
    """
    Build a supervised dataset of (state, action) samples from optimal episodes.

    Episodes with random initial and goal positions are played with
    `play_optimal` until at least `size` samples have been collected; the
    result is then truncated to exactly `size` samples.

    Parameters
    ----------
    game_params: dict
        Keyword arguments for `test_env.Sandbox`; must contain the keys "x"
        and "y". Its "initial" and "goal" entries are overridden for every
        episode, but the dictionary itself is left untouched
    size: int
        Number of samples to return

    Returns
    -------
    state_set: numpy array
        First `size` collected states
    action_set: numpy array
        Actions paired with `state_set`
    """
    action_memory = []
    state_memory = []

    while len(action_memory) < size:
        # New random start/goal for every episode
        initial, goal = random_start(game_params["x"], game_params["y"])

        # Build the per-episode parameters on a copy: the caller's dictionary
        # must not be silently overwritten with the last sampled positions
        episode_params = {**game_params, "initial": initial, "goal": goal}
        env = test_env.Sandbox(**episode_params)

        actions, states = play_optimal(env)
        action_memory.extend(actions)
        state_memory.extend(states)

    return np.array(state_memory[:size]), np.array(action_memory[:size])

In [None]:
%%time 
# Collect the (state, action) dataset with the default size and time the whole cell
state_set, action_set = create_action_state_set(game_params)

# Using torch utils to create data loaders

In [None]:
from torch.utils.data import DataLoader, Dataset, TensorDataset, SubsetRandomSampler

In [None]:
class NumpyDataset(Dataset):
    """
    Dataset wrapping a collection of samples and the matching labels.

    Parameters
    ----------
    data: indexable object supporting len()
        Samples; its length defines the size of the dataset
    label: indexable object
        Labels, accessed with the same indices used for `data`
    """

    def __init__(self, data, label):
        self.data, self.label = data, label

    def __len__(self):
        # The size of the dataset is the number of samples
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, index):
        # Return the (sample, label) pair stored at the given index
        sample = self.data[index]
        target = self.label[index]
        return sample, target

In [None]:
def prepare_dataset(x, label, train_perc, val_perc, train_batch_size, val_batch_size, test_batch_size):
    """
    Split the data in training, validation and test sets and wrap them in DataLoaders.

    The first `train_perc` fraction of the samples is used for training and
    validation, the remaining samples for testing. Of the first part, the
    last `val_perc` fraction is reserved for validation. The split follows
    the order of the samples (no shuffling before splitting).

    Training and validation loaders draw from the same underlying dataset
    through two disjoint random samplers and drop the last incomplete batch.
    The test loader iterates sequentially and keeps the last incomplete batch.
    All the loaders use an identity collate function, so a batch is a list of
    (sample, label) tuples.

    Parameters
    ----------
    x: sliceable sequence
        Samples
    label: sliceable sequence
        Labels, in the same order as `x`
    train_perc: float
        Fraction of the samples used for training + validation
    val_perc: float
        Fraction of the training + validation samples used for validation
    train_batch_size, val_batch_size, test_batch_size: int
        Batch sizes of the three loaders

    Returns
    -------
    train_loader, val_loader, test_loader: torch DataLoader
    """

    def _identity(batch):
        # Leave the batch as the raw list of (sample, label) tuples
        return batch

    # Boundary between the (train + validation) part and the test part
    split = int(len(x) * train_perc)
    train_set = NumpyDataset(x[:split], label[:split])
    test_set = NumpyDataset(x[split:], label[split:])

    # Boundary between training and validation indices inside train_set
    n_train = int(split * (1 - val_perc))
    train_sampler = SubsetRandomSampler(np.arange(n_train))
    val_sampler = SubsetRandomSampler(np.arange(n_train, split))

    train_loader = DataLoader(
        train_set, train_batch_size, sampler=train_sampler, drop_last=True, collate_fn=_identity
    )
    val_loader = DataLoader(
        train_set, val_batch_size, sampler=val_sampler, drop_last=True, collate_fn=_identity
    )
    test_loader = DataLoader(
        test_set, test_batch_size, drop_last=False, collate_fn=_identity
    )

    return train_loader, val_loader, test_loader

In [None]:
# 80% of the samples go to training + validation (20% of those to validation), the rest to test
batch_size = {'train_batch_size':16, 'val_batch_size':64, 'test_batch_size':128}
train_loader, val_loader, test_loader = prepare_dataset(state_set, action_set, 0.8, 0.2, **batch_size)

# Training actor net

In [None]:
from RelationalModule import AC_networks as nets
import torch.nn as nn
import torch.optim as optim
import time

In [None]:
# Gated actor built with action_space=4 (presumably the number of discrete actions -- confirm)
actor_net = nets.GatedBoxWorldActor(action_space=4)

In [None]:
# Sanity check: push a single training batch through the actor and compute the loss.
# NLLLoss expects log-probabilities as input -- assumes the actor outputs them, TODO confirm
loss_fn = nn.NLLLoss()

for data in train_loader:
    # The loader uses an identity collate_fn: a batch is a list of (state, action) tuples.
    # Stack the states in a single ndarray first, since building a tensor directly
    # from a list of ndarrays is very slow.
    x = torch.tensor(np.array([sample[0] for sample in data])).float() #.to(device)
    y = torch.LongTensor([sample[1] for sample in data]) #.to(device)
    print(x.shape)
    print(y.shape)
    y_pred = actor_net(x)
    print(y_pred.shape)
    loss = loss_fn(y_pred, y)
    print(loss)
    # One batch is enough to check that the shapes are compatible
    break

In [None]:
### Testing function
def test_epoch(net, dataloader, loss_fn, optimizer):

    # select device
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
        
    # Validation
    net.eval() # Evaluation mode (e.g. disable dropout)
    with torch.no_grad(): # No need to track the gradients
        batch_len = np.zeros(len(dataloader))
        batch_loss = np.zeros(len(dataloader))
        for i, data in enumerate(dataloader,0):
            # Extract data and move tensors to the selected device
            x = [x[0] for x in data]
            x = torch.tensor(x).float().to(device)
            
            y =  [x[1] for x in data]
            y = torch.LongTensor(y).to(device)

            y_pred = net(x)

            loss = loss_fn(y_pred, y)
            
            # save MSE loss and length of a batch
            batch_len[i] = len(data)
            batch_loss[i] = loss.item()
    
    # total loss
    val_loss = (batch_loss*batch_len).sum()/batch_len.sum()
    return val_loss

In [None]:
def train_NN(net, lr, n_epochs, train_loader, val_loader, train_log=True, verbose=True, 
                  debug=False, return_model = False):
    """
    Trains a Pytorch model with the Adamax optimizer and the negative
    log-likelihood loss (nn.NLLLoss).
    
    Parameters
    ----------
    net: Pytorch nn.Module class 
        Must have forward method; it is moved to GPU if cuda is available.
        nn.NLLLoss expects log-probabilities as input -- assumes the network
        outputs them, TODO confirm
    lr: float
        Learning rate of the optimizer
    n_epochs: int
        Number of passes over the training set
    train_loader: torch DataLoader
        Loads the training set; each batch must be a sequence of
        (sample, label) tuples
    val_loader: torch DataLoader
        Loads the validation set (same batch format)
    train_log: bool
        If True the training and validation losses of each epoch are
        recorded and returned
    verbose: bool
        If True prints updates of the training about 10 times for each epoch
        and the validation loss at the end of each epoch
    debug: bool
        If True enables the debug prints
    return_model: bool
        If True returns the trained instance of the model 
    
    Returns
    -------
    net (if return_model): Pytorch nn.Module class
        Trained instance of the model 
    train_loss_log (if train_log): list
        Training loss for each epoch
    val_loss_log (if train_log): list
        Validation loss for each epoch

    If both train_log and return_model are False nothing is returned.
    """
  
    optimizer = optim.Adamax(net.parameters(), lr)
    loss_fn = nn.NLLLoss()
    
    # define contextual print functions activated by print flags
    verbose_print = print if verbose else lambda *a, **k: None
    verbose_print("Verbose: ", verbose)
    dprint = print if debug else lambda *a, **k: None
    dprint("Debug: ", debug)

    # If cuda is available set the device to GPU
    use_cuda = torch.cuda.is_available()
    verbose_print("Using cuda: ", use_cuda)
    device = torch.device("cuda" if use_cuda else "cpu")
    # Move all the network parameters to the selected device (if they are already on that device nothing happens)
    net.to(device)
    
    n_batches = len(train_loader)
    # frequency of printing; at least 1, otherwise with fewer than 10 batches
    # the modulo below would be a division by zero
    print_every = max(1, n_batches // 10)
    #Time for printing
    training_start_time = time.time()
    # lists with the history of the training
    train_loss_log = []
    val_loss_log = []

    #Loop for n_epochs
    for epoch in range(n_epochs):

        running_loss = 0.0
        start_time = time.time()
        net.train() # activate dropout
        for i, data in enumerate(train_loader):
            optimizer.zero_grad()
            
            # Stack the samples in a single ndarray before building the tensor
            # (much faster than converting a list of ndarrays directly)
            x = torch.tensor(np.array([sample[0] for sample in data])).float().to(device)
            y = torch.LongTensor([sample[1] for sample in data]).to(device)

            loss = loss_fn(net(x), y)
            # Backward pass
            loss.backward()
            optimizer.step()

            #Print statistics
            running_loss += loss.item() 
            #Print about 10 times per epoch and always after the last batch
            if ((i+1) % print_every == 0) or (i == n_batches - 1):
                verbose_print('\r'+"Epoch {}, {:d}% \t Train loss: {:.4f} took: {:.2f}s ".format(
                        epoch+1, int(100 * (i+1) / n_batches), running_loss / (i+1),
                        time.time() - start_time), end=' ')
                
        #At the end of the epoch, do a pass on the validation set
        val_loss = test_epoch(net, dataloader=val_loader, loss_fn=loss_fn, optimizer=optimizer) 
        verbose_print("Val. loss: {:.4f}".format(val_loss ))

        if train_log:
            # Average of the batch losses; NaN if the loader yields no batch
            train_loss_log.append(running_loss / n_batches if n_batches else float("nan"))
            val_loss_log.append(val_loss)

    verbose_print("Training finished, took {:.2f}s".format(time.time() - training_start_time))
    if train_log:
        if return_model:
            return net, train_loss_log, val_loss_log
        return train_loss_log, val_loss_log  #used during cross validation
    if return_model:
        # The model must be returned even when the loss history is not requested
        return net

In [None]:
# Learning rate, number of epochs and network for the OheActor run
lr = 1e-4
n_epochs = 10
ohe_actor_net = nets.OheActor(action_space=4, map_size=5) # TODO: check that map_size matches the grid size (X = Y = 5)

In [None]:
# With the default flags train_NN returns the tuple (train_loss_log, val_loss_log)
ohe_results = train_NN(ohe_actor_net, lr, n_epochs, train_loader, val_loader)

In [None]:
# Same learning rate and number of epochs, this time for the GatedBoxWorldActor network
lr = 1e-4
n_epochs = 10
relational_actor_net = nets.GatedBoxWorldActor(action_space=4)

In [None]:
# With the default flags train_NN returns the tuple (train_loss_log, val_loss_log)
relational_results = train_NN(relational_actor_net, lr, n_epochs, train_loader, val_loader)