# Implementation for:

- Behavioral Cloning (BC) <a href="https://arxiv.org/abs/1608.00627v1">paper link</a>

## Author: Montaser Mohammedalamen

## Installation 

In [None]:
! pip install gym
! pip install box2d-py
! pip install torchvision

## Import libraries 

In [None]:
import numpy as np 
import gym
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from IPython import display
from matplotlib import style
plt.style.use("ggplot")

In [None]:
# Create the Gym environment that the cloned policy is later evaluated in.
env_name = 'BipedalWalker-v2'
env = gym.make(env_name)

# First dimension of each space; these set the policy network's input and output widths.
state_space_size = env.observation_space.shape[0]
action_space_size = env.action_space.shape[0]

In [None]:
# Load the recorded expert demonstrations as float tensors.
# (BC uses both states and actions; BCO would use the states only.)
# `to_input` below unpacks both tensors as 3-D: (episodes, timesteps, features).
expert_states = torch.tensor(np.load("states_expert_walker2d.npy"), dtype=torch.float)
expert_actions = torch.tensor(np.load("actions_expert_walker2d.npy"), dtype=torch.float)

for _label, _tensor in (("expert_states", expert_states), ("expert_actions", expert_actions)):
    print(_label, _tensor.shape)

In [None]:
def to_input (states, actions,  n=2, compare=1):
    '''
    Data preparation and filtering.

    Slides a window of ``n`` consecutive states over every episode, flattens
    each window into one row and pairs it with the action taken at the first
    step of the window. A state whose entries are all equal to ``-compare`` is
    treated as padding, and every window containing such a state is dropped.

    Inputs:
    states: expert states as tensor, shape (episodes, timesteps, state_size)
    actions: expert actions as tensor, shape (episodes, timesteps, action_size)
    n: window size (how many states needed to predict the next action)
    compare: padding marker; a state filled with ``-compare`` is filtered out
    return:
    output_states: filtered state windows as float tensor, shape (count, state_size*n)
    output_actions: matching actions as float tensor, shape (count, action_size)
    raises:
    ValueError: if ``n`` is smaller than 1
    '''
    if n < 1:
        raise ValueError("window size n must be >= 1, got {}".format(n))

    ep, t, state_size = states.shape
    _, _, action_size = actions.shape

    n_windows = t - n + 1
    if ep == 0 or n_windows <= 0:
        # Episodes shorter than the window (or no episodes) yield no samples.
        return (torch.zeros((0, state_size*n), dtype = torch.float),
                torch.zeros((0, action_size), dtype = torch.float))

    # (ep, t): True where the whole state vector equals the padding value.
    is_padding = (states == -compare).all(dim=-1)
    # (ep, n_windows): True if ANY of the n states in the window is padding.
    # The original check only looked at positions j and j+1, which is right
    # for n=2 only (it indexed out of range for n=1 and missed padding for n>2).
    window_has_padding = is_padding.unfold(1, n, 1).any(dim=-1)
    keep = ~window_has_padding

    # unfold -> (ep, n_windows, state_size, n); move the window axis in front of
    # the feature axis so flattening matches states[i, j:j+n].view(-1).
    windows = states.unfold(1, n, 1).permute(0, 1, 3, 2).reshape(ep, n_windows, n*state_size)

    # Boolean indexing keeps the (episode, timestep) row-major order of the loop version.
    output_states = windows[keep].to(torch.float)
    output_actions = actions[:, :n_windows][keep].to(torch.float)

    return output_states, output_actions

In [None]:
# Select a random contiguous run of expert trajectories and convert it into
# (state-window, action) training pairs.
number_expert_trajectories = 50
n_available_trajectories = expert_states.shape[0]
if n_available_trajectories < number_expert_trajectories:
    raise ValueError(
        "requested {} expert trajectories but only {} are available".format(
            number_expert_trajectories, n_available_trajectories
        )
    )
# Valid start indices are 0 .. (available - requested) inclusive. randint's upper
# bound is exclusive, hence the +1; without it the last start is never drawn and
# the call fails outright when available == requested.
a = np.random.randint(n_available_trajectories - number_expert_trajectories + 1)
print(a)
expert_state, expert_action = to_input(
    expert_states[a : a + number_expert_trajectories],
    expert_actions[a : a + number_expert_trajectories],
    n=2,
    compare=5,  # states filled with -5 are treated as padding by to_input
)
print("expert_state", expert_state.shape)
print("expert_action", expert_action.shape)

# 1- Behavioral Cloning (BC)

In [None]:
# Build one [state | action] table for supervised learning, shuffle its rows and
# split it 70% / 30% into training and testing sets.
# Only the first `state_space_size` columns of each window are used as the input state.
new_data = np.concatenate([expert_state[:, :state_space_size], expert_action], axis=1)
np.random.shuffle(new_data)  # in-place shuffle along the first axis (rows)
new_data = torch.tensor(new_data, dtype=torch.float)

n_samples = int(new_data.shape[0] * 0.7)
training_set, testing_set = new_data[:n_samples], new_data[n_samples:]

print("training_set", training_set.shape)
print("testing_set", testing_set.shape)

In [None]:
# Behavioral Cloning policy network, loss function and optimizer.
def _build_mlp(layer_sizes):
    """Stack Linear layers of the given widths, with a ReLU after every layer except the last."""
    modules = []
    last_layer = len(layer_sizes) - 2
    for layer_idx, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        modules.append(nn.Linear(n_in, n_out))
        if layer_idx < last_layer:
            modules.append(nn.ReLU())
    return nn.Sequential(*modules)


# state -> 40 -> 80 -> 120 -> 100 -> 40 -> 20 -> action
bc_walker = _build_mlp([state_space_size, 40, 80, 120, 100, 40, 20, action_space_size])

criterion = nn.MSELoss()
# NOTE: the training cell re-assigns learning_rate and re-creates the optimizer before use.
learning_rate = 0.01
optimizer = torch.optim.Adam(bc_walker.parameters(), lr=learning_rate)

In [None]:
# Train the BC policy by mini-batch regression of expert actions from states,
# and record the held-out loss after every epoch.
loss_list = []   # mean training MSE per epoch
test_loss = []   # testing-set MSE per epoch
batch_size = 256
n_epoch = 100
learning_rate = 0.001
optimizer = torch.optim.Adam(bc_walker.parameters(), lr = learning_rate)

if training_set.shape[0] == 0:
    # Without this the per-epoch average below divides by zero batches.
    raise ValueError("training_set is empty; nothing to train on")

# The test split never changes, so slice it once instead of every epoch.
test_inputs  = testing_set[:, :state_space_size]
test_targets = testing_set[:, state_space_size:]

for itr in range(n_epoch):
    total_loss = 0
    b = 0  # number of mini-batches processed in this epoch
    for batch in range(0, training_set.shape[0], batch_size):
        batch_rows = training_set[batch : batch + batch_size]
        data   = batch_rows[:, :state_space_size]
        y      = batch_rows[:, state_space_size:]
        y_pred = bc_walker(data)
        loss   = criterion(y_pred, y)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        b += 1
    print("[EPOCH]: %i, [MSE LOSS]: %.6f" % (itr+1, total_loss / b))
    # wait=True defers clearing until the next output arrives, so the line stays visible.
    display.clear_output(wait=True)
    loss_list.append(total_loss / b)
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        test_loss.append(criterion(bc_walker(test_inputs), test_targets).item())

In [None]:
# Plot the per-epoch loss on the held-out testing set.
# torch.save(bc_walker, "bc_walker_n=2") # uncomment to save the model 
fig, ax = plt.subplots()
ax.plot(test_loss, label="Testing Loss")
ax.set_xlabel("iterations")
ax.set_ylabel("loss")
ax.legend()
plt.show()

## Compare inferred actions with the expert's actions

In [None]:
# Spot-check one held-out sample: predicted action vs. expert action, then their MSE.
p = 87 # select any point to test the model
sample_state = testing_set[p, :state_space_size]
sample_action = testing_set[p, state_space_size:]
predicted_action = bc_walker(sample_state)
print( predicted_action )
print(sample_action)
criterion(predicted_action, sample_action).item()

## Test BC model in Environment

In [None]:
################################## parameters ##################################
n=2 # window size
n_iterations = 5 # number of evaluation runs (one per random seed)
n_ep = 1000 # number of episodes per run
max_steps = 500 # max timesteps per episode
gamma = 1.0 # discount factor
seeds = [684, 559, 629, 192, 835] # random seeds for testing
################################## parameters ##################################

if n_iterations > len(seeds):
    raise ValueError(
        "n_iterations ({}) exceeds the number of seeds ({})".format(n_iterations, len(seeds))
    )

seed_reward_mean = []  # per seed: running mean of episode returns
seed_reward  = []      # per seed: raw episode returns
for itr in range (n_iterations):
    ################################## interact with env ##################################
    G = []       # discounted return of every episode for this seed
    G_mean = []  # running mean of G after each episode
    seed = int(seeds[itr])
    env.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    for ep in range (n_ep):
        state = env.reset()
        rewards = []
        for t in range (max_steps):
            # Pure inference: no autograd graph is needed for acting.
            with torch.no_grad():
                action = bc_walker(torch.tensor(state, dtype=torch.float))
            # Clip the policy output to [-1, 1] before stepping the environment.
            action = np.clip(action.numpy(), -1,1)
            next_state , r, done, _   = env.step(action)
            rewards.append(r)
            state = next_state
            if done:
                break
        R = sum(reward * gamma**i for i, reward in enumerate(rewards))
        G.append(R)
        G_mean.append(np.mean(G))
        # Report the episode return and the running mean under their own labels
        # (the original printed the episode return labelled as "Mean Reward").
        print("ep = {} , Reward = {:.6f} , Mean Reward = {:.6f}".format(ep, R, G_mean[-1]))
        display.clear_output(wait=True)
    seed_reward.append(G)
    seed_reward_mean.append(G_mean)
    # Overall reward for this seed is the mean episode return, not the mean of the
    # running-mean curve (which over-weights early episodes).
    print("Itr = {} overall reward  = {:.6f} ".format(itr, np.mean(G)))
    print("Interacting with environment finished")
env.close()
# np.save("reward_mean_walker_bc1_expert_states={}".format(new_data.shape[0]), seed_reward_mean) #uncomment to save reward over 5 random seeds

In [None]:
# Stack the per-seed running-mean curves and aggregate across seeds (axis 0).
seed_reward_mean_bc = np.array(seed_reward_mean)
mean_bc = seed_reward_mean_bc.mean(axis=0)
std_bc = seed_reward_mean_bc.std(axis=0)

## Expert

In [None]:
# Saved reward curves of the TRPO expert; aggregate along axis 0.
TRPO_mean = np.load("reward_mean_expert_walker_TRPO.npy")
mean_expert = TRPO_mean.mean(axis=0)
std_expert = TRPO_mean.std(axis=0)

## Random

In [None]:
# Saved reward curves of the random policy; aggregate along axis 0.
random_mean = np.load("reward_mean_walker_random.npy")
mean_random = random_mean.mean(axis=0)
std_random = random_mean.std(axis=0)

In [None]:
# Scaled performance
def scaled (x, min_value, max_value):
    """Min-max scale ``x`` so that ``min_value`` maps to 0 and ``max_value`` maps to 1.

    Values outside ``[min_value, max_value]`` are not clipped.
    """
    offset = x - min_value
    value_range = max_value - min_value
    return offset / value_range

# Final BC mean reward, rescaled so the random policy's final value maps to 0
# and the expert's final value maps to 1.
bc_score  = scaled( mean_bc[-1] , mean_random[-1] , mean_expert[-1] )

## Compare BC VS Expert VS Random

In [None]:
def _plot_mean_with_std(mean, std, label):
    """Plot a mean curve with a shaded +/- one standard deviation band."""
    # Derive the x axis from the curve itself instead of a hard-coded episode
    # count, so the plot keeps working when the number of episodes changes.
    x = np.arange(len(mean))
    plt.plot(x, mean, "-", label=label)
    plt.fill_between(x, mean + std, mean - std, alpha=0.2)


_plot_mean_with_std(mean_expert, std_expert, "Expert")
_plot_mean_with_std(mean_bc, std_bc, "BC")
_plot_mean_with_std(mean_random, std_random, "Random")

plt.xlabel("Episodes")
plt.ylabel("Mean Reward")
plt.title("Random VS Expert VS BC in Walker")
plt.legend()
plt.show()