Re-implementing the GAIL algorithm on the expert data acquired to run CartPole-v1

In [1]:
import gym
import numpy as np
import os
import gym
import pickle
import argparse
import numpy as np
from collections import deque

import torch
import torch.optim as optim
from tensorboardX import SummaryWriter 
from utils.utils import *
from utils.zfilter import *
import torch.nn as nn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Turn on autograd anomaly detection for the whole session.
# NOTE(review): this is generally a debugging aid — consider disabling it
# for full training runs once the backward pass is known to be healthy.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x1096c9550>

In [3]:
class Actor(nn.Module):
    """Gaussian policy head: two tanh hidden layers followed by a linear mean.

    The standard deviation is not learned; ``forward`` always returns a
    tensor of ones shaped like the mean (``exp(0)``).

    Args:
        num_inputs: size of the state vector fed to the first layer.
        num_outputs: size of the action mean produced by the last layer.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_outputs)

        # Shrink the output layer so the initial action means start near zero.
        with torch.no_grad():
            self.fc3.weight.mul_(0.1)
            self.fc3.bias.mul_(0.0)

    def forward(self, x):
        """Return ``(mu, std)`` for a batch of states ``x``."""
        hidden = torch.tanh(self.fc2(torch.tanh(self.fc1(x))))
        mu = self.fc3(hidden)
        # Fixed log-std of zero, i.e. unit standard deviation.
        std = torch.zeros_like(mu).exp()
        return mu, std


class Critic(nn.Module):
    """State-value network: two tanh hidden layers and a scalar linear output.

    Args:
        num_inputs: size of the state vector fed to the first layer.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, num_inputs, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

        # Shrink the output layer so initial value estimates start near zero.
        with torch.no_grad():
            self.fc3.weight.mul_(0.1)
            self.fc3.bias.mul_(0.0)

    def forward(self, x):
        """Return one value estimate per row of ``x`` (last dimension 1)."""
        hidden = torch.tanh(self.fc1(x))
        hidden = torch.tanh(self.fc2(hidden))
        return self.fc3(hidden)


class Discriminator(nn.Module):
    """Binary classifier over concatenated (state, action) rows.

    Two tanh hidden layers feed a single sigmoid unit, so the output is a
    probability in (0, 1) per input row. Unlike ``Actor`` and ``Critic``, the
    output layer keeps PyTorch's default initialisation.

    Args:
        num_inputs: size of the concatenated state-action vector.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, num_inputs, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the sigmoid probability for each row of ``x``."""
        hidden = torch.tanh(self.fc2(torch.tanh(self.fc1(x))))
        logits = self.fc3(hidden)
        return torch.sigmoid(logits)

In [4]:
# Load the recorded expert rows; the training cell later feeds these rows
# directly into the discriminator as expert (state, action) pairs.
# NOTE(review): the path is relative to the working directory — confirm the
# notebook is always launched from its own folder.
demonstrations = np.load('../gail/CartPoleExpert.npy')

In [5]:
# Sanity check: (number of expert rows, columns per row).
demonstrations.shape

(239724, 5)

In [9]:
# NOTE(review): duplicate of the previous cell. Its recorded output differs
# from the one above, so `demonstrations` was presumably rebound by a cell
# that is no longer in the notebook — re-run from a fresh kernel or delete.
demonstrations.shape

(2000, 3)

In [14]:
# NOTE(review): `action` is assigned by the replay loop in the next cell, so
# this cell only works after that loop has run (out-of-order execution).
action

array([0.])

In [13]:
# Replay the recorded expert actions in a fresh CartPole environment as a
# visual sanity check.
# NOTE(review): only the action column is replayed; the environment is not
# restored to the recorded states, so the rollout will not retrace the
# expert's trajectories exactly.
env = gym.make("CartPole-v1")
env.reset()
try:
    for i in range(demonstrations.shape[0]):
        # The last column of each demonstration row is the expert's action.
        action = demonstrations[i, -1:]
        _, _, done, _ = env.step(int(action[0]))
        env.render()
        if done:
            # Start a new episode but keep the render window open; it is
            # closed exactly once in the `finally` clause below.
            env.reset()
finally:
    # Runs even when the replay is interrupted from the keyboard, so the
    # render window is never left dangling.
    env.close()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/noahkasmanoff/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-e78dc4593bed>", line 6, in <module>
    env.render()
  File "/Users/noahkasmanoff/anaconda3/lib/python3.7/site-packages/gym/core.py", line 240, in render
    return self.env.render(mode, **kwargs)
  File "/Users/noahkasmanoff/anaconda3/lib/python3.7/site-packages/gym/envs/classic_control/cartpole.py", line 213, in render
    return self.viewer.render(return_rgb_array=mode == 'rgb_array')
  File "/Users/noahkasmanoff/anaconda3/lib/python3.7/site-packages/gym/envs/classic_control/rendering.py", line 127, in render
    self.window.flip()
  File "/Users/noahkasmanoff/anaconda3/lib/python3.7/site-packages/pyglet/window/cocoa/__init__.py", line 289, in flip
    self.context.flip()
  File "/Users/noahkasmanoff/anaconda3/lib/python3.7/site-packages/pyglet/gl

KeyboardInterrupt: 

In [15]:
#code for training 
import torch
import numpy as np
from utils.utils import get_entropy, log_prob_density

def train_discrim(discrim, memory, discrim_optim, demonstrations, discrim_update_num, clip_param):
    """Fit the discriminator to tell learner (state, action) pairs from expert ones.

    Label convention used by the loss below: learner pairs are pushed towards
    an output of 1 and expert pairs towards 0.

    Args:
        discrim: network mapping a ``(batch, state_dim + action_dim)`` tensor
            to a ``(batch, 1)`` probability.
        memory: non-empty sequence of rollout rows whose first two entries are
            the state and the action taken by the current policy.
        discrim_optim: optimizer over ``discrim``'s parameters.
        demonstrations: expert rows already laid out as concatenated
            state-action vectors (array-like or tensor).
        discrim_update_num: number of full-batch gradient steps to take.
        clip_param: unused; kept so existing call sites keep working.

    Returns:
        ``(expert_acc, learner_acc)``: 0-d tensors holding the fraction of
        expert rows scored below 0.5 and of learner rows scored above 0.5.

    Raises:
        ValueError: if ``memory`` is empty.
    """
    if len(memory) == 0:
        raise ValueError("memory must contain at least one transition")

    # Unzip the rollout column-wise instead of going through np.array(memory):
    # rows mix arrays and scalars, which NumPy cannot turn into a regular array.
    columns = list(zip(*memory))
    states = torch.as_tensor(np.vstack(columns[0]), dtype=torch.float32)
    actions = torch.as_tensor(np.vstack(columns[1]), dtype=torch.float32)
    learner_pairs = torch.cat([states, actions], dim=1)

    # Convert the expert data once, outside the update loop, so it is also a
    # tensor when discrim_update_num == 0.
    expert_pairs = torch.as_tensor(demonstrations, dtype=torch.float32)

    learner_labels = torch.ones((learner_pairs.shape[0], 1))
    expert_labels = torch.zeros((expert_pairs.shape[0], 1))
    criterion = torch.nn.BCELoss()

    for _ in range(discrim_update_num):
        learner = discrim(learner_pairs)
        expert = discrim(expert_pairs)

        # Two separately averaged BCE terms, so the much larger expert set
        # does not swamp the learner batch.
        discrim_loss = criterion(learner, learner_labels) + criterion(expert, expert_labels)

        discrim_optim.zero_grad()
        discrim_loss.backward()
        discrim_optim.step()

    # Accuracy is reporting only; no autograd graph is needed.
    with torch.no_grad():
        expert_acc = (discrim(expert_pairs) < 0.5).float().mean()
        learner_acc = (discrim(learner_pairs) > 0.5).float().mean()

    # The two sets differ greatly in size, hence two separate accuracies.
    return expert_acc, learner_acc
 

def train_actor_critic(actor, critic, memory, actor_optim, critic_optim, actor_critic_update_num, batch_size, clip_param):
    """Run clipped-surrogate (PPO-style) updates on the actor and critic.

    The rewards stored in ``memory`` are the discriminator-derived (IRL)
    rewards, not the environment's own reward.

    Args:
        actor: policy network returning ``(mu, std)`` for a batch of states.
        critic: value network returning one value per state.
        memory: sequence of ``[state, action, reward, mask]`` rows, where
            ``mask`` is 0 on the last step of an episode and 1 otherwise.
        actor_optim: optimizer over the actor's parameters.
        critic_optim: optimizer over the critic's parameters.
        actor_critic_update_num: number of passes over the collected batch.
        batch_size: minibatch size; a trailing partial minibatch is skipped.
        clip_param: clipping range for both the probability ratio and the
            value update.

    Note:
        ``gamma`` and ``lamda`` are read from module-level globals.
    """
    # Unzip the rollout column-wise; rows mix arrays and scalars, which NumPy
    # cannot turn into a regular array.
    columns = list(zip(*memory))
    states = torch.as_tensor(np.vstack(columns[0]), dtype=torch.float32)
    actions = torch.as_tensor(np.vstack(columns[1]), dtype=torch.float32)
    rewards = list(columns[2])
    masks = list(columns[3])

    # Snapshot the pre-update value estimates and action log-probabilities.
    # They are fixed targets, so no autograd graph is needed.
    with torch.no_grad():
        old_values = critic(states)
        mu, std = actor(states)
        old_policy = log_prob_density(actions, mu, std)

    returns, advants = get_gae(rewards, masks, old_values, gamma, lamda)
    returns = returns.unsqueeze(1)
    advants = advants.unsqueeze(1)

    criterion = torch.nn.MSELoss()
    n = len(states)
    arr = np.arange(n)

    for _ in range(actor_critic_update_num):
        np.random.shuffle(arr)

        for i in range(n // batch_size):
            batch_index = arr[batch_size * i : batch_size * (i + 1)]

            inputs = states[batch_index]
            actions_samples = actions[batch_index]
            returns_samples = returns[batch_index]
            advants_samples = advants[batch_index]
            oldvalue_samples = old_values[batch_index].detach()

            # Clipped value loss: keep the new estimate within clip_param of
            # the old one and take the worse of the two errors.
            values = critic(inputs)
            clipped_values = oldvalue_samples + torch.clamp(
                values - oldvalue_samples, -clip_param, clip_param
            )
            critic_loss1 = criterion(clipped_values, returns_samples)
            critic_loss2 = criterion(values, returns_samples)
            critic_loss = torch.max(critic_loss1, critic_loss2).mean()

            # Clipped policy loss.
            loss, ratio, entropy = surrogate_loss(
                actor, advants_samples, inputs,
                old_policy.detach(), actions_samples, batch_index
            )
            clipped_ratio = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param)
            clipped_loss = clipped_ratio * advants_samples
            actor_loss = -torch.min(loss, clipped_loss).mean()

            loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

            # One backward pass fills gradients for both networks, so both
            # optimizers must be zeroed first; otherwise critic gradients
            # accumulate across every minibatch.
            actor_optim.zero_grad()
            critic_optim.zero_grad()
            loss.backward()
            actor_optim.step()
            critic_optim.step()

def get_gae(rewards, masks, values, gamma, lamda):
    """Compute discounted returns and normalised GAE advantages.

    Args:
        rewards: sequence of per-step rewards.
        masks: sequence of per-step flags; a 0 at step ``t`` cuts the
            bootstrap from step ``t + 1`` (episode boundary).
        values: tensor of value estimates indexed by step.
        gamma: discount factor.
        lamda: GAE smoothing factor.

    Returns:
        ``(returns, advants)``, both 1-D tensors shaped like ``rewards``.
        ``advants`` is mean-centred and, when it has more than one element and
        a non-zero standard deviation, divided by that standard deviation.
    """
    rewards = torch.Tensor(rewards)
    masks = torch.Tensor(masks)
    returns = torch.zeros_like(rewards)
    advants = torch.zeros_like(rewards)

    running_returns = 0
    previous_value = 0
    running_advants = 0

    # Walk backwards so each step can reuse the accumulated quantities of the
    # step that follows it.
    for t in reversed(range(len(rewards))):
        running_returns = rewards[t] + gamma * running_returns * masks[t]
        returns[t] = running_returns

        # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t).
        running_delta = rewards[t] + gamma * previous_value * masks[t] - values.data[t]
        previous_value = values.data[t]

        running_advants = running_delta + gamma * lamda * running_advants * masks[t]
        advants[t] = running_advants

    centered = advants - advants.mean()
    # Dividing by a zero std (constant advantages) or a NaN std (a single
    # sample) would turn every advantage into NaN, so only rescale when the
    # spread is well defined.
    if advants.numel() > 1:
        spread = advants.std()
        if spread > 0:
            return returns, centered / spread
    return returns, centered

def surrogate_loss(actor, advants, states, old_policy, actions, batch_index):
    """Evaluate the unclipped PPO surrogate for one minibatch.

    The actor is re-run on ``states`` to get the current log-probability of
    the stored ``actions``; that is compared with the log-probabilities
    recorded before the update, selected from ``old_policy`` via
    ``batch_index``.

    Returns:
        ``(surrogate, ratio, entropy)``: the ratio-weighted advantages, the
        probability ratio itself, and the value of ``get_entropy`` for the
        current policy outputs.
    """
    mean, scale = actor(states)
    log_ratio = log_prob_density(actions, mean, scale) - old_policy[batch_index]
    ratio = log_ratio.exp()
    weighted_advantage = ratio * advants
    return weighted_advantage, ratio, get_entropy(mean, scale)

In [25]:
# Hyperparameters. These would normally be command-line arguments; they are
# hard-coded here for notebook use.
env_name = 'CartPole-v1'
load_model = None   # checkpoint file name under ./save_model to resume from, or None
seed = 42           # single source of truth for every RNG seeded below
render = False
gamma = 0.99        # discount factor
lamda = .98         # GAE smoothing factor
hidden_size = 64

learning_rate = 3e-4
clip_param = .2
discrim_update_num = 2
actor_critic_update_num = 10
l2_rate = 1e-3            # weight decay applied to the critic optimizer
total_sample_size = 256   # minimum number of environment steps collected per iteration
batch_size = 16
# Discriminator training is suspended once both accuracies exceed these thresholds.
suspend_accu_exp = .9
suspend_accu_gen = .9
max_iter_num = 400
logdir = 'logs'

env = gym.make(env_name)

# Seed every source of randomness used in training; np.random drives the
# minibatch shuffling in train_actor_critic.
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Take the state size from the environment instead of hard-coding it.
num_inputs = env.observation_space.shape[0]
num_actions = 1
# ZFilter comes from utils.zfilter.
# NOTE(review): presumably a running mean/std state normaliser that clips to
# +/-5 (the checkpoint code stores its n / mean / sum_square) — confirm.
running_state = ZFilter((num_inputs,), clip=5)

print('state size:', num_inputs)
print('action size:', num_actions)

# Networks and their optimizers.
actor = Actor(num_inputs, num_actions, hidden_size)
critic = Critic(num_inputs, hidden_size)
discrim = Discriminator(num_inputs + num_actions, hidden_size)

actor_optim = optim.Adam(actor.parameters(), lr=learning_rate)
critic_optim = optim.Adam(critic.parameters(), lr=learning_rate,
                          weight_decay=l2_rate)
discrim_optim = optim.Adam(discrim.parameters(), lr=learning_rate)

state size: 4
action size: 1


In [26]:
#train_discrim_flag = False

In [27]:
# Expert demonstrations are loaded earlier in the notebook.
print("demonstrations.shape", demonstrations.shape)

writer = SummaryWriter(logdir)

# Optionally resume from a checkpoint written by a previous run.
if load_model is not None:
    saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(load_model))
    ckpt = torch.load(saved_ckpt_path)

    actor.load_state_dict(ckpt['actor'])
    critic.load_state_dict(ckpt['critic'])
    discrim.load_state_dict(ckpt['discrim'])

    # Restore the running state-normalisation statistics as well.
    running_state.rs.n = ckpt['z_filter_n']
    running_state.rs.mean = ckpt['z_filter_m']
    running_state.rs.sum_square = ckpt['z_filter_s']

    print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

episodes = 0
train_discrim_flag = True
# `render` is taken from the configuration cell rather than being overridden here.

try:
    for iteration in range(max_iter_num):
        # ---- 1. Collect at least total_sample_size steps with the current policy.
        actor.eval()
        critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < total_sample_size:
            state = running_state(env.reset())
            score = 0

            for _ in range(10000):
                if render:
                    env.render()

                steps += 1

                # Rollouts are data collection only; no autograd graph needed.
                with torch.no_grad():
                    mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                # NOTE(review): get_action lives in utils.utils. If it returns
                # a continuous sample, a discrete-action environment such as
                # CartPole may reject it — confirm the action format.
                next_state, reward, done, _ = env.step(action)
                # The policy is trained on the discriminator's reward; the
                # environment's own `reward` is not used.
                irl_reward = get_reward(discrim, state, action)

                mask = 0 if done else 1  # 0 marks the final step of an episode
                memory.append([state, action, irl_reward, mask])

                state = running_state(next_state)
                score += irl_reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iteration, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iteration)

        # ---- 2. Update the discriminator until it separates the two sources well.
        actor.train()
        critic.train()
        discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, discrim_update_num, clip_param)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > suspend_accu_exp and learner_acc > suspend_accu_gen:
                print("Now it will only train the policy, seeing as it is good enough at finding the differences between learner and expert trajectories.")
                train_discrim_flag = False

        # ---- 3. PPO-style policy/value update on the IRL rewards.
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, actor_critic_update_num, batch_size, clip_param)

        # ---- 4. Checkpoint every 100 iterations. The previous `if iter % 100:`
        # test was inverted and saved on every iteration *except* multiples of 100.
        if iteration % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'score': score_avg
            }, filename=ckpt_path)
finally:
    # Flush and close the TensorBoard writer even if training is interrupted.
    writer.close()

demonstrations.shape (239724, 5)
0:: 28 episode score is 6.55


  app.launch_new_instance()


Expert: 27.45% | Learner: 49.05%




1:: 56 episode score is 6.62
Expert: 49.51% | Learner: 45.42%


KeyboardInterrupt: 

In [28]:
# NOTE(review): in the visible notebook `a` is only assigned by the evaluation
# loop further down, so on a fresh kernel this cell fails with a NameError.
a

array([-0.12179299], dtype=float32)

A high reward signal for getting stuck might seem bad, but the discriminator never provides a negative reward value. Instead, it penalizes this behavior through the fact that the loss is an average over states: the network's J (loss) is going to be much lower, and therefore less learning signal (in terms of gradient magnitude) is sent back. This is a good thing, and it means more information is propagated backward for shorter episodes.

Now that the discriminator has finished training, let's compare how its reward function looks next to the actual one.

How should I do this? 

Run trajectories below, print out IRL reward next to reward


In [24]:
# NOTE(review): leftover post-mortem debugging session; remove this cell (and
# its captured output) before sharing the notebook.
%debug

> [0;32m/Users/noahkasmanoff/anaconda3/lib/python3.7/site-packages/torch/nn/functional.py[0m(1674)[0;36mlinear[0;34m()[0m
[0;32m   1672 [0;31m    [0;32mif[0m [0minput[0m[0;34m.[0m[0mdim[0m[0;34m([0m[0;34m)[0m [0;34m==[0m [0;36m2[0m [0;32mand[0m [0mbias[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1673 [0;31m        [0;31m# fused op is marginally faster[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1674 [0;31m        [0mret[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0maddmm[0m[0;34m([0m[0mbias[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mweight[0m[0;34m.[0m[0mt[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1675 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1676 [0;31m        [0moutput[0m [0;34m=[0m [0minput[0m[0;34m.[0m[0mmatmul[0m[0;34m([0m[0mweight[0m[0;34m.[0m[0mt[0m[0;34m([0m[0;34m)[0m[0;34

In [39]:


# NOTE(review): this rebinds the global `env` to MountainCarContinuous-v0,
# while the configuration cell builds the actor with num_inputs = 4 for
# CartPole-v1 — confirm which environment the evaluation below is meant for.
env = gym.make('MountainCarContinuous-v0')
state = env.reset()

In [40]:
# Roll out one episode with the trained actor, printing the environment's
# true reward next to each action.
# NOTE(review): training passes states through `running_state` before the
# actor sees them, but this loop feeds raw states — confirm whether the same
# normalisation should be applied here.
state = env.reset()
done = False
try:
    while not done:
        # Inference only; skip building an autograd graph.
        with torch.no_grad():
            mu, std = actor(torch.Tensor(state))
        # Squash the sampled action into [-1, 1] before stepping.
        a = np.tanh(get_action(mu, std))
        state, reward, done, _ = env.step(a)
        print(reward, a)
        env.render()
finally:
    # Always release the render window, even if the rollout is interrupted.
    env.close()

-0.010128668594895009 [-0.3182557]
-0.011433949327145943 [-0.33814123]
-0.07358932520333497 [-0.8578422]
-0.006230006735599147 [0.24959981]
-0.09467084502820314 [-0.97298944]
-7.010148077788186e-08 [0.00083727]
-0.028237040310546392 [-0.53138536]
-0.09006050744808505 [-0.94900215]
-0.002587011107891968 [-0.16084188]
-0.07979764717165382 [-0.8932953]
-0.05838156776769985 [0.7640783]
-0.055779981136567264 [-0.74685997]
-0.08811914108885191 [-0.93871796]
-0.008268106184808577 [-0.28754315]
-0.016793513053393473 [-0.4097989]
-0.09886795624406695 [-0.9943237]
-0.047269370481064145 [0.68752724]
-0.04067368082551184 [-0.6377592]
-0.03914723665379292 [-0.6256775]
-0.012498435972990142 [-0.35353127]
-0.012469840642325725 [-0.35312662]
-0.0933506723110952 [-0.9661815]
-0.01942942981258966 [-0.44078827]
-0.03227584412184683 [-0.56811833]
-0.02422802491247902 [-0.49221972]
-0.000492437684466851 [0.0701739]
-0.07767747418260029 [-0.88134825]
-0.040068434205073805 [0.6329963]
-0.012853319351984284 [

In [None]:
# Scratch cell (never executed): draw a random action to inspect the action
# format of whichever environment `env` currently refers to.
env.action_space.sample()

In [None]:
# Scratch cell (never executed): show the last action taken by the evaluation loop.
a