# TME11 - RLD - GAIL

## Luiz J. B Pinheiro \& Matheus M. CENTA

# Imports

In [1]:
!nvidia-smi

Sat Feb 27 13:14:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 166...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   45C    P8     1W /  N/A |     10MiB /  5944MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!conda install -y swig
!pip install Box2D
!pip install box2d-py

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [3]:
import os
import pickle
import time
import matplotlib
import gym
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Chargement des données (expert)

In [3]:
!wget http://dac.lip6.fr/wp-content/uploads/2021/01/expert.zip -O expert.zip
!unzip -o expert.zip
!rm expert.zip

--2021-02-27 13:05:04--  http://dac.lip6.fr/wp-content/uploads/2021/01/expert.zip
Resolving dac.lip6.fr (dac.lip6.fr)... 132.227.201.10
Connecting to dac.lip6.fr (dac.lip6.fr)|132.227.201.10|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://dac.lip6.fr/wp-content/uploads/2021/01/expert.zip [following]
--2021-02-27 13:05:04--  https://dac.lip6.fr/wp-content/uploads/2021/01/expert.zip
Connecting to dac.lip6.fr (dac.lip6.fr)|132.227.201.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8377 (8,2K) [application/zip]
Saving to: ‘expert.zip’


2021-02-27 13:05:04 (70,4 MB/s) - ‘expert.zip’ saved [8377/8377]

Archive:  expert.zip
  inflating: expert.pkl              


# Hyper-parameters

In [6]:
# Schedule constants.
freq_test = 1000     # read as a global by train() for its periodic check
freq_opt = 1000      # same value as train()'s `freq_opt` default
n_iter_max = 200000  # presumably passed to train() as `n_iter_max` -- no call is visible yet

env = gym.make('LunarLander-v2')

# Take the dimensions from the space definitions instead of from a reset():
# this does not advance the environment and does not depend on what reset()
# returns (newer gym releases return an (observation, info) tuple).
nbFeatures = env.observation_space.shape[0]
nbActions = env.action_space.n

# Useful functions

# Fonction d'entraînement (GAIL)

In [None]:
def train(gail, env, n_iter_max, freq_opt = 1000):
    """Skeleton of the GAIL training loop (work in progress).

    Args:
        gail: object exposing ``get_couples_expert(n)``.
        env: environment exposing ``reset()``.
        n_iter_max: number of outer iterations.
        freq_opt: number of expert couples requested at each iteration.

    Raises:
        NotImplementedError: the agent roll-out / optimisation step has not
            been written yet. Raising explicitly keeps the cell importable
            instead of failing with a SyntaxError on an empty loop body.
    """
    for it in range(n_iter_max):
        # `freq_test` is a notebook-level global. The check now fires ON
        # multiples of freq_test (it previously fired on every other
        # iteration); the branch is still a placeholder.
        if it % freq_test == 0:
            # TODO: periodic evaluation of the current policy.
            pass

        ob = env.reset()
        list_couples_expert = gail.get_couples_expert(freq_opt)
        # Empty (states, actions) buffers for the agent's own transitions.
        list_couples_agent = torch.FloatTensor().to(device), torch.FloatTensor().to(device)
        while True:
            # Sample trajectories freq_opt steps
            raise NotImplementedError(
                "train(): agent trajectory sampling is not implemented yet")

# Agent PPO

In [None]:
class ClippedPPO(BaseAgent):
    """PPO agent using the clipped surrogate objective (policy network only).

    The value network is currently disabled (its lines are commented out), so
    ``act`` returns ``None`` as the value estimate unless a ``self.V`` network
    has been attached to the instance.

    Every hyper-parameter is looked up in ``opt`` first (``opt.get(key,
    default)``); the keyword arguments only provide the fallback values.

    NOTE(review): ``self.featureExtractor`` is assumed to be created by
    ``BaseAgent.__init__`` -- BaseAgent and NN are not defined in this notebook.
    """

    def __init__(self,
        env, opt, p_layers=[32], v_layers=[32], gamma=0.99, lr=7e-4,
        grad_clip_val=1.0, batch_size=32, clip_ratio=0.01, ent_coef=0.01,
        p_train_steps=32, v_train_steps=32):

        super(ClippedPPO, self).__init__(env, opt)

        # training options
        self.gamma = opt.get('gamma', gamma)
        self.batch_size = opt.get('batch_size', batch_size)
        self.clip_val = opt.get('clipVal', grad_clip_val)  # max gradient norm
        self.ent_coef = opt.get('entropyCoef', ent_coef)
        self.p_train_steps = opt.get('policyTrainSteps', p_train_steps)
#         self.v_train_steps = opt.get('valueTrainSteps', v_train_steps)
        self.clip_ratio = opt.get('clipRatio', clip_ratio)

        # optimizer options
        self.lr = opt.get('learningRate', lr)

        self.test = False # flag for testing mode

        # policy network
        obs_size, out_size = self.featureExtractor.outSize, env.action_space.n
        p_layers =  opt.get('policyLayers', p_layers)
        self.P = NN(obs_size, out_size, layers=p_layers)

        # value network (disabled)
#         v_layers = opt.get('valueLayers', v_layers)
#         self.V = NN(obs_size, 1, layers=v_layers)

        # optimizer and value loss
#         self.v_loss_fn = torch.nn.SmoothL1Loss()
        self.p_optim = torch.optim.Adam(self.P.parameters(), self.lr)
#         self.v_optim = torch.optim.Adam(self.V.parameters(), self.lr)

    def learn(self, batch):
        """Run ``p_train_steps`` clipped-PPO updates of the policy on ``batch``.

        Args:
            batch: tuple ``(obs, act, adv, tgt, old_logp)``. ``tgt`` is unpacked
                but unused while the value network is disabled.

        Returns:
            The loss of the last update as a Python float, or ``0.0`` when
            ``p_train_steps`` is 0 (no update performed).
        """
        obs, act, adv, tgt, old_logp = batch

        loss = None
        # take K training steps for the policy network
        for _ in range(self.p_train_steps):
            dist, logp = self._compute_policy_dist(obs, act)

            ratio = torch.exp(logp - old_logp)
            clip_adv = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv

            p_loss = -(torch.min(ratio * adv, clip_adv)).mean()
            entropy = torch.mean(dist.entropy())

            self.p_optim.zero_grad()
            # The entropy term is a bonus: it is subtracted from the loss being
            # minimised so that higher policy entropy is encouraged.
            loss = p_loss - self.ent_coef * entropy
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.P.parameters(), self.clip_val)
            self.p_optim.step()

        if loss is None:
            return 0.0
        return loss.detach().item()

    def _compute_policy_dist(self, obs, act=None):
        """Return ``(dist, logp)`` for ``obs``.

        ``dist`` is the categorical distribution built from the policy logits;
        ``logp`` is the log-probability of ``act`` under it, or ``None`` when
        no action is given.
        """
        logits = self.P(obs)
        # Fully qualified so the class only needs `import torch`.
        dist = torch.distributions.Categorical(logits=logits)
        if act is not None:
            logp = dist.log_prob(act)
            return dist, logp
        return dist, None

    def act(self, observation):
        """Sample an action for ``observation`` without tracking gradients.

        Returns:
            ``(action, value, logp)``: the sampled action as a Python int, the
            output of ``self.V`` (``None`` when no value network is attached),
            and the log-probability of the sampled action.
        """
        with torch.no_grad():
            obs = torch.tensor(
                self.featureExtractor.getFeatures(observation),
                dtype=torch.float32)
            # The value network is optional (it is commented out in __init__).
            value_net = getattr(self, 'V', None)
            value = value_net(obs) if value_net is not None else None
            dist, _ = self._compute_policy_dist(obs)
            action = dist.sample()
            logp = dist.log_prob(action)
        return action.item(), value, logp


# Architectures

In [27]:
class Discriminator(nn.Module):
    """Two-hidden-layer MLP mapping ``input_size`` features to one value in (0, 1).

    Args:
        input_size: size of the last dimension of the input tensor.
    """

    def __init__(self, input_size):
        super(Discriminator, self).__init__()
        self.input_size = input_size
        self.main = nn.Sequential(
            nn.Linear(input_size, 100),
            nn.Tanh(),
            nn.Linear(100, 100),
            nn.Tanh(),
            nn.Linear(100, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the sigmoid output of the network for ``x`` (last dim = 1)."""
        return self.main(x)

    
class Generator(nn.Module):
    """MLP mapping ``input_size`` features to ``output_size`` values.

    The last layer is a Tanh, so every output lies in [-1, 1].
    NOTE(review): if these outputs are meant to be action logits or
    probabilities, the final Tanh bounds them -- confirm this is intended.

    Args:
        input_size: size of the last dimension of the input tensor.
        output_size: size of the last dimension of the output tensor.
    """

    def __init__(self, input_size, output_size):
        super(Generator, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.main = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, output_size),
            nn.Tanh()
        )

    def forward(self, x):
        """Return the network output for ``x`` (last dim = ``output_size``)."""
        return self.main(x)

class GAIL(nn.Module):
    """Container for the GAIL discriminator, generator and expert data.

    Args:
        nbFeatures: number of state features (leading columns of the expert data).
        nbActions: number of discrete actions (remaining columns of the expert data).
        expert_file: path of the pickled expert transitions.
    """

    def __init__(self, nbFeatures, nbActions, expert_file):
        super(GAIL, self).__init__()
        self.nbFeatures = nbFeatures
        self.nbActions = nbActions
        self.discriminator = Discriminator(nbFeatures+nbActions)
        self.generator = Generator(nbFeatures, nbActions)
        # Empty "prototype" tensors: `t.to(proto)` converts t to proto's dtype
        # and device in a single call.
        self.floatTensor = torch.FloatTensor().to(device)
        self.longTensor = torch.LongTensor().to(device)
        self.loadExpertTransitions(expert_file)
        self.to(device)

    def loadExpertTransitions(self,file):
        """Load the expert transitions and split them into states and actions.

        The first ``nbFeatures`` columns are stored in ``self.expert_states``
        and the remaining ones in ``self.expert_actions``.
        """
        # SECURITY: pickle.load can execute arbitrary code; only open expert
        # files that come from a trusted source.
        with open(file,'rb') as handle:
            expert_data = pickle.load(handle)
        expert_data = expert_data.to(self.floatTensor)
        self.expert_states = expert_data[:,:self.nbFeatures].contiguous()
        self.expert_actions = expert_data[:,self.nbFeatures:].contiguous()

    def toOneHot(self, actions) :
        """Convert a tensor of action indices into an (n, nbActions) one-hot float tensor."""
        actions = actions.view(-1).to(self.longTensor)
        n = actions.size()[0]
        oneHot = torch.zeros(n, self.nbActions).to(self.floatTensor)
        oneHot[torch.arange(n, device=actions.device), actions] = 1
        return oneHot

    def toIndexAction(self, oneHot):
        """Convert an (n, nbActions) one-hot tensor back into a 1-D tensor of action indices."""
        ac = torch.arange(self.nbActions, dtype=torch.long,
                          device=self.longTensor.device).view(1,-1)
        ac = ac.expand(oneHot.size()[0],-1).contiguous().view(-1)
        actions = ac[oneHot.view(-1)>0].view(-1)
        return actions

    def discriminate(self, x):
        """Run the discriminator on ``x`` (tensor or array-like)."""
        x = torch.as_tensor(x).to(self.floatTensor)
        return self.discriminator(x)

    def get_couples_expert(self, n_couples):
        """Return ``n_couples`` expert (states, actions) rows drawn at random.

        Indices are drawn with ``np.random.choice`` (with replacement).
        """
        # choose n_couples idx
        indices = np.random.choice(self.expert_states.size(0), n_couples)

        return self.expert_states[indices], self.expert_actions[indices]

    def act(self, ob):
        """Run the generator on observation ``ob`` (tensor or array-like)."""
        x = torch.as_tensor(ob).to(self.floatTensor)
        return self.generator(x)

# Entraînement de l'agent

In [28]:
# Display the compute device selected in the imports cell.
device

device(type='cuda')

In [29]:
# Build the GAIL model; its constructor also loads the expert transitions
# from 'expert.pkl' and moves the module to `device`.
model_gail = GAIL(nbFeatures=nbFeatures, nbActions=nbActions, expert_file='expert.pkl')

In [30]:
# Sanity check: draw 10 random expert (states, actions) couples and display them.
model_gail.get_couples_expert(10)

(tensor([[ 2.8157e-01,  3.6903e-01,  1.4407e-01, -6.7590e-01,  2.5916e-01,
           6.5700e-03,  0.0000e+00,  0.0000e+00],
         [ 1.9798e-01, -3.1209e-02, -1.3679e-02,  6.2756e-02, -1.6328e-01,
           1.3169e-02,  1.0000e+00,  1.0000e+00],
         [ 2.5586e-01, -3.3138e-02,  9.4720e-02, -1.0189e-02, -2.3021e-01,
          -3.0590e-02,  1.0000e+00,  1.0000e+00],
         [ 2.0318e-01, -2.0744e-02,  3.6185e-02, -3.7013e-03, -1.4336e-01,
          -1.1626e-02,  1.0000e+00,  1.0000e+00],
         [ 2.2209e-01, -2.6553e-02, -2.9242e-01, -8.9775e-02, -3.5659e-01,
          -4.3643e-01,  1.0000e+00,  0.0000e+00],
         [ 2.2963e-01, -2.6714e-02,  7.8331e-02, -8.2146e-03, -1.8931e-01,
          -2.5197e-02,  1.0000e+00,  1.0000e+00],
         [ 2.7826e-01, -3.8508e-02,  1.2366e-01, -1.3571e-02, -2.6655e-01,
          -4.0081e-02,  1.0000e+00,  1.0000e+00],
         [ 3.0737e-01, -4.7081e-02,  1.3407e-01, -2.8179e-02, -3.0453e-01,
          -4.1337e-08,  1.0000e+00,  1.0000e+00],
