In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

from PIL import Image
import pandas as pd
import numpy as np
import os
import gym, random
from multiprocessing import Process, Queue
import itertools, cma

In [None]:
# Select the CPU as the torch device; `device` is read as a module-level
# global by the cells below (e.g. when moving tensors with `.to(device)`).
device = "cpu"
print(f"Device: {device}")

# INIT VAE and MDNRNN models

## VAE

In [None]:
class Encoder(nn.Module):
    """Convolutional encoder that outputs the parameters of the latent Gaussian.

    Four unpadded stride-2 convolutions with kernel size 4
    (3 -> 32 -> 64 -> 128 -> 256 channels) feed two linear heads. The heads
    read a flattened vector of 2*2*256 features, so the input must shrink to
    a 2x2 feature map (which is what a 64x64 image does with these layers).

    Args:
        latent_dim: number of latent dimensions produced by each head.
    """

    def __init__(self, latent_dim):
        super().__init__()
        # Every convolution shares the same geometry; only the channels change.
        conv_geometry = dict(kernel_size=4, stride=2)
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, **conv_geometry)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, **conv_geometry)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, **conv_geometry)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, **conv_geometry)

        flat_features = 2 * 2 * 256
        self.fc_mu = nn.Linear(in_features=flat_features, out_features=latent_dim)
        self.fc_logvar = nn.Linear(in_features=flat_features, out_features=latent_dim)

        self.activation = nn.ReLU()

    def forward(self, x):
        """Encode a batch of images.

        Returns:
            Tuple ``(mu, logvar)``, each of shape ``(batch, latent_dim)``.
        """
        features = x
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = self.activation(conv(features))

        # Collapse channels and spatial dimensions into one vector per sample.
        flat = features.view(features.shape[0], -1)
        return self.fc_mu(flat), self.fc_logvar(flat)

In [None]:
class Decoder(nn.Module):
    """Transposed-convolution decoder mapping a latent vector to an image.

    A linear layer expands the latent vector to 1024 features, which are
    treated as a 1024-channel 1x1 map and upsampled by four stride-2
    transposed convolutions (1024 -> 128 -> 64 -> 32 -> 3 channels). With the
    kernel sizes used here the spatial size grows 1 -> 5 -> 13 -> 30 -> 64.

    Args:
        latent_dim: size of the latent vector accepted by ``forward``.
    """

    def __init__(self, latent_dim):
        super().__init__()
        self.fc = nn.Linear(in_features=latent_dim, out_features=1024)

        # Layers are numbered to mirror the encoder, hence the descending order.
        self.conv4 = nn.ConvTranspose2d(1024, 128, kernel_size=5, stride=2)
        self.conv3 = nn.ConvTranspose2d(128, 64, kernel_size=5, stride=2)
        self.conv2 = nn.ConvTranspose2d(64, 32, kernel_size=6, stride=2)
        self.conv1 = nn.ConvTranspose2d(32, 3, kernel_size=6, stride=2)

        self.ReLU_activation = nn.ReLU()

    def forward(self, x):
        """Decode latent vectors ``x`` of shape ``(batch, latent_dim)``.

        Returns:
            Tensor of shape ``(batch, 3, H, W)`` with values squashed into
            (0, 1) by a sigmoid.
        """
        batch_size = x.shape[0]
        feature_map = self.fc(x).view(batch_size, 1024, 1, 1)
        for deconv in (self.conv4, self.conv3, self.conv2):
            feature_map = self.ReLU_activation(deconv(feature_map))
        return torch.sigmoid(self.conv1(feature_map))
        

In [None]:
class VariationalAutoencoder(nn.Module):
    """VAE built from the notebook's ``Encoder`` and ``Decoder``.

    Args:
        latent_dim: size of the latent space shared by encoder and decoder.
    """

    def __init__(self, latent_dim):
        super().__init__()
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(latent_dim)

    def forward(self, x):
        """Encode ``x``, pick a latent vector and decode it.

        Returns:
            Tuple ``(x_recon, latent_mu, latent_logvar)``.
        """
        mu, logvar = self.encoder(x)
        z = self.latent_sample(mu, logvar)
        return self.decoder(z), mu, logvar

    def latent_sample(self, mu, logvar):
        """Return the latent vector used for decoding.

        In evaluation mode the mean is returned unchanged; in training mode a
        reparameterised sample from ``Normal(mu, exp(logvar / 2))`` is drawn.
        """
        if not self.training:
            return mu

        # logvar is log(sigma^2), so halving it before exp gives sigma.
        std = torch.exp(logvar * 0.5)
        return torch.distributions.Normal(loc=mu, scale=std).rsample()

## MDNRNN

In [None]:
class MDNRNN(nn.Module):
    """LSTM followed by a mixture-density head over the latent vector.

    The LSTM consumes, per time step, a vector of ``z_dim + 1`` features (in
    this notebook: the latent vector concatenated with the action). Three
    linear heads map each LSTM output to the mixture weights, standard
    deviations and means of ``n_gaussians`` Gaussians per latent dimension.

    The recurrent state is kept on the instance between ``forward`` calls so a
    rollout can be fed one step at a time; call ``reset_state`` before starting
    a new, independent sequence.

    Args:
        hidden_units: size of the LSTM hidden state.
        z_dim: size of the latent vector.
        num_layers: number of stacked LSTM layers.
        n_gaussians: number of mixture components per latent dimension.
    """

    def __init__(self, hidden_units, z_dim, num_layers, n_gaussians):
        super().__init__()
        self.hidden_units = hidden_units
        self.z_dim = z_dim
        self.num_layers = num_layers
        self.n_gaussians = n_gaussians
        self.hidden = None
        self.cell = None

        # RNN (num_layers was previously stored but never handed to the LSTM)
        self.lstm = nn.LSTM(self.z_dim + 1,
                            self.hidden_units,
                            num_layers=self.num_layers,
                            batch_first=True)

        # MDN
        # weights for the results of the gaussians
        self.z_pi = nn.Linear(self.hidden_units, self.n_gaussians * self.z_dim)
        # parameters of the gaussians
        self.z_sigma = nn.Linear(self.hidden_units, self.n_gaussians * self.z_dim)
        self.z_mu = nn.Linear(self.hidden_units, self.n_gaussians * self.z_dim)

    def forward(self, x):
        """Run the LSTM on ``x`` and return the mixture parameters.

        Args:
            x: a ``PackedSequence`` (batch-first) or a plain padded tensor of
                shape ``(batch, seq_len, z_dim + 1)``.

        Returns:
            Tuple ``(pi, sigma, mu)``, each of shape
            ``(batch, seq_len, n_gaussians, z_dim)``. ``pi`` sums to 1 over the
            Gaussian axis and ``sigma`` is strictly positive.

        Side effects:
            Updates ``self.hidden`` and ``self.cell`` with the final LSTM state.
        """
        # Continue from the stored state only when both halves are present.
        if self.hidden is None or self.cell is None:
            state = None
        else:
            state = (self.hidden, self.cell)

        z, (self.hidden, self.cell) = self.lstm(x, state)

        # Packed input yields packed output; unpack it to a padded tensor.
        if isinstance(z, nn.utils.rnn.PackedSequence):
            z, _ = nn.utils.rnn.pad_packed_sequence(z, batch_first=True)
        # The padded output has the same length as the input sequence.
        seq_len = z.shape[1]

        mixture_shape = (-1, seq_len, self.n_gaussians, self.z_dim)
        # softmax over the Gaussian axis so the weights sum to 1
        pi = F.softmax(self.z_pi(z).view(*mixture_shape), dim=2)
        # exponential keeps every sigma positive
        sigma = torch.exp(self.z_sigma(z)).view(*mixture_shape)
        mu = self.z_mu(z).view(*mixture_shape)
        return pi, sigma, mu

    def reset_state(self):
        """Forget the stored LSTM state so the next call starts from zeros."""
        self.hidden = None
        self.cell = None

# Controller

In [None]:
class Controller(nn.Module):
    """Single linear layer mapping the agent's features to action scores.

    Args:
        input_dim: number of input features (size_z + size_h in this notebook).
        num_actions: number of output units; defaults to 5, the size that was
            previously hard-coded.
    """

    def __init__(self, input_dim, num_actions=5):
        super().__init__()
        # input dimensions is size_z + size_h
        self.input_dim = input_dim
        self.num_actions = num_actions
        self.fc = nn.Linear(input_dim, num_actions)

    def forward(self, x):
        """Return one sigmoid-squashed score per action for each row of ``x``."""
        return torch.sigmoid(self.fc(x))
    

# CMAES

In [None]:
class CMAES():
    """CMA-ES search over the weights of the linear ``Controller``.

    A population of controllers is evaluated by worker processes: each
    ``[controller, parameters]`` pair is pushed on ``q_inp`` and a
    ``(pair, fitness)`` tuple is expected back on ``q_res``. The solver
    minimises, so fitness values are negated before being handed to it.

    Each candidate is a flat vector of ``(input_dim + 1) * 5`` numbers that is
    reshaped to ``(5, input_dim + 1)``: the last column is the bias, the rest
    is the weight matrix of ``Controller.fc``.

    Args:
        input_dim: number of controller input features.
        popsize: number of candidates per generation.
        q_inp: queue on which candidates are sent to the workers.
        q_res: queue on which the workers return their results.
        sigma: initial step size of the CMA-ES solver.
    """

    def __init__(self, input_dim, popsize, q_inp, q_res, sigma=0.10):
        self.popsize = popsize
        self.input_dim = input_dim
        self.num_parameters = (self.input_dim + 1) * 5
        self.temp_best_parameters = None
        self.temp_best_fitness = 0
        # population is a list of lists of type: [controllers, list of parameters]
        self.population = [[Controller(input_dim), None] for i in range(self.popsize)]
        # queues for the concurrent execution
        self.q_inp = q_inp
        self.q_res = q_res
        # init cma-es solver (honours the `sigma` argument, which used to be ignored)
        self.solver = cma.CMAEvolutionStrategy(self.num_parameters * [0], sigma, {"popsize" : self.popsize})
        # init population
        self.sample_population()

    def start_optimization(self, target_fitness=50, max_epochs=None):
        """Run generations until the best fitness reaches ``target_fitness``.

        Args:
            target_fitness: stop once the solver's best fitness is at least
                this value (default 50, the previous hard-coded threshold).
            max_epochs: optional cap on the number of generations; ``None``
                (default) keeps the original unbounded behaviour.

        Returns:
            The best parameter vector seen so far (also stored in
            ``self.temp_best_parameters``).
        """
        epochs = 0
        controller_parameters, fitness = self.run_step()
        # Record the first generation too, so a result exists even when the
        # loop below never runs or never improves on it.
        self._record_best(controller_parameters, fitness)
        print(f"EPOCH: {epochs} FITNESS: {fitness}")
        epochs += 1

        while fitness < target_fitness and (max_epochs is None or epochs < max_epochs):
            # run the optimization step
            controller_parameters, fitness = self.run_step()

            # save fitness score and parameters if they are better
            self._record_best(controller_parameters, fitness)

            print(f"EPOCH: {epochs} FITNESS: {fitness} BEST FITNESS {self.temp_best_fitness}")
            epochs += 1
        return self.temp_best_parameters

    def _record_best(self, controller_parameters, fitness):
        """Keep ``controller_parameters`` if nothing is stored yet or it is better."""
        if self.temp_best_parameters is None or fitness > self.temp_best_fitness:
            self.temp_best_fitness = fitness
            self.temp_best_parameters = controller_parameters

    def run_step(self):
        """Evaluate the current population and advance the solver one generation.

        Returns:
            Tuple ``(best_parameters, best_fitness)`` as reported by
            ``self.solver.result``, with the fitness sign flipped back.
        """
        # hand every candidate to the workers through this instance's queue
        for controller in self.population:
            self.q_inp.put(controller)

        # retrieve the controllers' fitness score; results may arrive in any
        # order, so parameters and scores are collected side by side
        fitness_scores = []
        finished_controllers_param = []
        for num_finished in range(1, self.popsize + 1):
            # obtain result
            controller, fitness = self.q_res.get()
            # save fitness scores as negative sign, solver performs minimization
            fitness_scores.append(-fitness)
            # append controller parameters to associate them to fitness score
            finished_controllers_param.append(controller[1])
            print(f"FINISHED: {num_finished}/{self.popsize}")

        # update solver
        self.solver.tell(finished_controllers_param, fitness_scores)

        # sample new population
        self.sample_population()

        # return solver parameters and best fitness score
        return self.solver.result[0], -self.solver.result[1]

    def sample_population(self):
        """Ask the solver for new candidates and load them into the controllers."""
        # obtain the new population's parameters
        new_samples_parameters = self.solver.ask()

        for member, sample in zip(self.population, new_samples_parameters):
            # associate new parameters to controller
            member[1] = sample

            # one row per output unit: weights first, bias in the last column
            new_parameters = torch.from_numpy(sample).reshape(-1, self.input_dim + 1).to(device)

            # load new parameters into controller
            member[0].load_state_dict({
                "fc.weight" : new_parameters[:, :-1],
                "fc.bias" : new_parameters[:, -1]
            })

# Optimization

## Definition of the function to be run in parallel

In [None]:
def proc_function(q_inp, q_res, vae, mdnrnn, rollouts=30):
    """Worker loop: evaluate controllers taken from ``q_inp`` until told to stop.

    Each item read from ``q_inp`` is a ``[controller, parameters]`` pair. The
    controller is run for ``rollouts`` episodes and the pair is sent back on
    ``q_res`` together with the average cumulative reward. A ``None`` item is
    the signal to terminate.

    Args:
        q_inp: queue of ``[controller, parameters]`` pairs to evaluate.
        q_res: queue receiving ``(pair, average_fitness)`` tuples.
        vae: VAE model passed through to ``run_rollout``.
        mdnrnn: MDN-RNN model passed through to ``run_rollout``.
        rollouts: number of episodes averaged per controller (default 30).
    """
    # controller is a pair (controller, list of parameters)
    controller = q_inp.get()

    # run while a signal to terminate is not received from the queue
    while controller is not None:
        fitness_scores = []
        for i in range(rollouts):
            fitness_scores.append(run_rollout(controller[0], vae, mdnrnn))

            print(f"ROLLOUT: {i}/{rollouts}")

        # return the pair (controller, parameters) and the average fitness score of the rollouts
        q_res.put((controller, sum(fitness_scores) / rollouts))

        controller = q_inp.get()

In [None]:
# Build freshly initialised models on `device`.
vae = VariationalAutoencoder(latent_dim=512).to(device)

# NOTE(review): `lr` is not used anywhere in the visible notebook.
lr = 1e-3
hidden_units = 1024
z_dim = 512
n_gaussians = 16

mdnrnn = MDNRNN(hidden_units=hidden_units, z_dim=z_dim, num_layers=1, n_gaussians=n_gaussians).to(device)

# Controller input size = latent size (512) + LSTM hidden size (1024).
controller = Controller(512+1024).to(device)
# NOTE(review): an unfinished, commented-out `run_rollout(` call was left at this point.

In [None]:
def run_rollout(controller, vae, mdnrnn):
    """Play one Leaper episode with the given models and return its total reward.

    At every step the observation is encoded by ``vae.encoder``, the latent
    mean is concatenated with the current LSTM hidden state and fed to
    ``controller``; the arg-max output selects the action. The latent vector
    and the chosen action are then pushed through ``mdnrnn`` to update the
    hidden state. Relies on the module-level ``device``.

    Args:
        controller: module mapping ``(1, z_dim + hidden_units)`` features to
            five action scores.
        vae: model exposing an ``encoder`` that returns ``(mu, logvar)``.
        mdnrnn: recurrent model exposing ``reset_state``, ``hidden_units`` and
            ``hidden``.

    Returns:
        The sum of the rewards collected during the episode.
    """
    # start the recurrent model from a clean state
    mdnrnn.reset_state()

    # map controller outputs to procgen actions
    action_to_procgen = {0: 2, #left,
                         1: 3, #down,
                         2: 5, #up,
                         3: 7, #right,
                         4: 9, #idle
                        }

    # init gym environment
    env = gym.make("procgen:procgen-leaper-v0", start_level=0, num_levels=0, render_mode="rgb_array")

    cumulative_reward = 0
    try:
        obs = env.reset()
        terminated = False

        # initial LSTM hidden state, created on the same device as the latent
        # vector so the concatenation below also works off the CPU
        hidden_state_rnn = torch.zeros(1, 1, mdnrnn.hidden_units, device=device)

        with torch.no_grad():
            while not terminated:
                # compute latent vector of observation (HWC uint8 -> 1xCxHxW float in [0, 1])
                frame = torch.from_numpy(obs).permute(2, 0, 1).float() / 255
                z, _ = vae.encoder(frame.unsqueeze(0).to(device))

                # concat z and hidden state
                input_controller = torch.cat((z, hidden_state_rnn.squeeze(0)), dim=1).to(device)

                # execute controller and retrieve action with max value
                res = controller(input_controller)
                action = action_to_procgen[torch.argmax(res).item()]

                # perform action and retrieve environment informations
                obs, reward, terminated, info = env.step(action)
                cumulative_reward += reward

                # pack input for mdnrnn: latent vector plus the action just taken
                input_rnn = torch.cat((z, torch.Tensor([[action]]).to(device)), dim=1)
                packed_input = nn.utils.rnn.pack_padded_sequence(input_rnn.unsqueeze(0), [1], batch_first=True, enforce_sorted=False)

                # execute mdnrnn and take the hidden state
                mdnrnn(packed_input)
                hidden_state_rnn = mdnrnn.hidden
    finally:
        # release the environment even if a step raises
        env.close()

    return cumulative_reward

In [None]:
def init_processes(q_inp, q_res, vae, mdnrnn):
    """Start the worker processes and return them.

    One ``Process`` running ``proc_function`` is created and started per
    worker; the number of workers is read from the module-level
    ``num_processes``.

    Returns:
        List of the started ``Process`` objects, in creation order.
    """
    worker_args = (q_inp, q_res, vae, mdnrnn)
    workers = []
    for _ in range(num_processes):
        worker = Process(target=proc_function, args=worker_args)
        workers.append(worker)
        worker.start()
    return workers

## Execute Optimization

In [None]:
# load the models
vae = torch.load("./BACKUP_MODELS/vae").to(device)
mdnrnn = torch.load("./BACKUP_MODELS/mdnrnn").to(device)

# define number of concurrent processes
num_processes = 2

# init queues
q_inp = Queue()
q_res = Queue()

# init concurrent processes
processes = init_processes(q_inp, q_res, vae, mdnrnn)

# init optimizer and run it
es = CMAES(512+1024, 30, q_inp, q_res)
best_parameters = es.start_optimization()

# kill the concurrent processes
for x in processes:
    x.kill()

## Save the best controller

In [None]:
# Rebuild a Controller from the best parameter vector kept by the optimizer.
best_controller = Controller(512+1024)

# The target fitness (>= 50) was not reached, so the best controller found so
# far is saved instead.
# NOTE(review): es.temp_best_parameters starts as None; torch.from_numpy will
# fail here if the optimizer never stored a parameter vector.
new_parameters = torch.from_numpy(es.temp_best_parameters).reshape(-1, 512+1024 + 1).to(device)

# create new state dict: every row holds the weights of one output unit
# followed by its bias in the last column
new_state_dict = {
    "fc.weight" : new_parameters[:, :-1],
    "fc.bias" : new_parameters[:, -1]
}

# load new parameters into controller
best_controller.load_state_dict(new_state_dict)

# save the model (the whole module object is pickled, not only its state dict)
torch.save(best_controller, f"./BACKUP_MODELS/controller")

# Run test 

In [None]:
# Re-select the device: use CUDA when it is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

In [None]:
# load the models
vae = torch.load("./BACKUP_MODELS/vae").to(device)
mdnrnn = torch.load("./BACKUP_MODELS/mdnrnn").to(device)
controller = torch.load(f"./BACKUP_MODELS/controller").to(device)

In [None]:
def run_test_rollout(controller, vae, mdnrnn):
    """Play one Leaper episode and report whether it was won.

    The control loop is the same as in training: encode the observation with
    ``vae.encoder``, concatenate the latent mean with the LSTM hidden state,
    let ``controller`` pick the arg-max action, then update ``mdnrnn`` with
    the latent vector and the action. Relies on the module-level ``device``.

    Args:
        controller: module mapping ``(1, z_dim + hidden_units)`` features to
            five action scores.
        vae: model exposing an ``encoder`` that returns ``(mu, logvar)``.
        mdnrnn: recurrent model exposing ``reset_state``, ``hidden_units`` and
            ``hidden``.

    Returns:
        1 if the reward of the final step is positive (counted as a win),
        0 otherwise.
    """
    mdnrnn.reset_state()

    # map controller outputs to procgen actions
    action_to_procgen = {0: 2, #left,
                         1: 3, #down,
                         2: 5, #up,
                         3: 7, #right,
                         4: 9, #idle
                        }

    env = gym.make("procgen:procgen-leaper-v0", start_level=0, num_levels=0, render_mode="rgb_array")

    # reward of the most recent step; only the final one decides win/lose
    reward = 0
    try:
        obs = env.reset()
        terminated = False

        # initial LSTM hidden state, created on `device` so it can be
        # concatenated with the latent vector when running on the GPU
        hidden_state_rnn = torch.zeros(1, 1, mdnrnn.hidden_units, device=device)

        with torch.no_grad():
            while not terminated:
                # compute latent vector of observation (HWC uint8 -> 1xCxHxW float in [0, 1])
                frame = torch.from_numpy(obs).permute(2, 0, 1).float() / 255
                z, _ = vae.encoder(frame.unsqueeze(0).to(device))

                # concat z and hidden state
                input_controller = torch.cat((z, hidden_state_rnn.squeeze(0)), dim=1).to(device)

                # execute controller and retrieve action with max value
                res = controller(input_controller)
                action = action_to_procgen[torch.argmax(res).item()]

                # perform action and retrieve environment informations
                # (the reward is no longer accumulated: the running total was
                # never initialised and raised UnboundLocalError on the first step)
                obs, reward, terminated, info = env.step(action)

                # pack input for mdnrnn: latent vector plus the action just taken
                input_rnn = torch.cat((z, torch.Tensor([[action]]).to(device)), dim=1)
                packed_input = nn.utils.rnn.pack_padded_sequence(input_rnn.unsqueeze(0), [1], batch_first=True, enforce_sorted=False)

                # execute mdnrnn and take the hidden state
                mdnrnn(packed_input)
                hidden_state_rnn = mdnrnn.hidden
    finally:
        # release the environment even if a step raises
        env.close()

    return 1 if reward > 0 else 0

In [None]:
# execute the rollouts and retun the number of wins
num_rollouts = 100
num_win = 0
for i in range(num_rollouts):
    num_win += run_test_rollout(controller, vae, mdnrnn)
    print(f"ROLLOUT: {i+1}/{num_rollouts} NUM WINS: {num_win}")
    
print(f"NUMBER OF WINS: {num_win}/{num_rollouts}")