## Import

In [32]:
from qibullet import SimulationManager
from qibullet import NaoVirtual , NaoFsr
import matplotlib.pyplot as plt
import numpy as np
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import pickle
import random
import matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

<contextlib.ExitStack at 0x7f740dd65970>

In [33]:
if torch.cuda.is_available():
  device = "cuda" 
else:
  device = "cpu"
print(device, " in use")

cpu  in use


## Nao Bullet

In [34]:

# Launch Simulation Environment
vis = True
simulation_manager = SimulationManager()
nao_sim = simulation_manager.launchSimulation(gui=False, auto_step=True)
#nao_sim_vis = simulation_manager.launchSimulation(gui=False, auto_step=True)
simulation_manager.setGravity(nao_sim, [0.0, 0.0, -9.81])
nao = None

# Utility Functions
# Joints to monitor
legJoints = [ "LHipRoll", "LHipPitch", "LKneePitch", "LAnklePitch", "LAnkleRoll",
              "RHipRoll", "RHipPitch", "RKneePitch", "RAnklePitch", "RAnkleRoll"]

min_values_tensor = torch.tensor([-0.37, -1.53, -0.09, -1.18, -0.39 , -0.79, -1.53, -0.10, -1.18, -0.76], dtype=torch.float32)
max_values_tensor = torch.tensor([ 0.79, 0.48, 2.11, 0.92, 0.76,  0.37, 0.48, 2.12, 0.93, 0.39], dtype=torch.float32)

min_values = [-0.38, -1.539, -0.099,	-1.19 , -0.399 ,	-0.792, -1.536, -0.105, -1.188, -0.769]
max_values = [ 0.795, 0.488, 2.118,0.923, 0.77,  0.38, 0.49, 2.13, 0.95,0.399]
bucket_width = 0.1
# Compute the number of buckets for each element of the array
num_buckets = np.ceil((np.array(max_values) - np.array(min_values)) / bucket_width).astype(int)

# Create the list of bucket edges for each element of the array
bucket_edges_list = [np.linspace(start=min_value, stop=max_value, num=num_bucket+1) for min_value, max_value, num_bucket in zip(min_values, max_values, num_buckets)]


nao_prev_position_X = 0
nao_prev_position_Y = 0

prev_weight_cntr = 0
def env_Reward():
    global prev_weight_cntr
    x, y, z = nao.getPosition()
    terminated = False
    delX = 5*(nao_prev_position_X - x )
    delY = 5*(nao_prev_position_Y - y )
    weight = -(nao.getTotalFsrValues(NaoFsr.LFOOT) + nao.getTotalFsrValues(NaoFsr.RFOOT) )

    # if robot fallen down
    if weight == 0: 
        reward = -1
        prev_weight_cntr += 1
    else:
        reward = delX
        prev_weight_cntr = 0

    if prev_weight_cntr == 5:
        terminated = True
        prev_weight_cntr = 0
    return reward, terminated

def env_state():
    return nao.getAnglesPosition(legJoints)
    
def env_action(angles, speed):
    nao.setAngles(legJoints, angles ,speed)

def env_itrm_step(itr):
    for _ in range(itr):
        simulation_manager.stepSimulation(nao_sim)

def env_reset():
    global nao_prev_position_X , nao_prev_position_Y
    global nao
    simulation_manager.resetSimulation(nao_sim)
    simulation_manager.setGravity(nao_sim, [0.0, 0.0, -9.81])
    nao = simulation_manager.spawnNao( 
    nao_sim,
    translation=[0, 0, 0],quaternion=[0, 0, 0, 1],
    spawn_ground_plane=True)
    env_itrm_step(100)
    nao_prev_position_X, nao_prev_position_Y, _ = nao.getPosition()

def env_stop():
    simulation_manager.stopSimulation(nao_sim)
    # wait for limited iterations 

def env_step(angles):
    env_action(angles, 0.5)
    #env_itrm_step(50)
    time.sleep(0.3)
    reward, terminated = env_Reward()
    next_state = env_state()
    return next_state, reward, terminated

def env_Q_state():
    my_array = env_state()
    # Define a function to assign each element of the array to a bucket
    def assign_bucket(value):
        index = np.where(value == my_array)[0][0]  # Find the index of the value in the array
        bucket_edges = bucket_edges_list[index]   # Get the corresponding bucket edges
        bucket_index = np.searchsorted(bucket_edges, value, side='right') - 1  # Find the bucket index
        return bucket_index

    my_array = np.array(my_array)

    bucket_indices = [assign_bucket(value) for value in my_array]
    return bucket_indices
    
env_reset()


In [27]:
env_stop()

## Agent

In [41]:

## Agent 
class walk_Agent:
  def __init__(self, observation_space, action_space , epsilon, epsilon_decay):
    self.observation_space = observation_space
    self.action_space = action_space
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_array = list()
    self.epsilon_min = 0.35

  def step(self, obs, policy, Q):
    self.epsilon = max ( self.epsilon * self.epsilon_decay , self.epsilon_min)
    self.epsilon_array.append(self.epsilon)

    #obs = np.array(obs)
    #obs = list( obs.flatten())
    if policy == "e-greedy":
      if np.random.uniform() > self.epsilon:
        next_obs = []
        act = []
        for i in range(self.observation_space):
          idx = np.argmax( [ Q[3*i] , Q[3*i + 1], Q[3*i + 2] ])
          if idx == 0:
            next_obs.append( obs[i] + 0.1)
            act.append(0)
          elif idx == 1:
            next_obs.append( obs[i] )
            act.append(1)
          elif idx == 2:
            next_obs.append( obs[i] - 0.1)
            act.append(2)
          
      else:
        act = []
        next_obs = []
        for elem in obs:
          elem += random.choice([ 0.1, 0, -0.1])
          if elem == 0.1:
            act.append(0)
          elif elem == 0:
            act.append(1)
          else:
            act.append(2)
          next_obs.append(elem)
        
      return next_obs , act

    elif policy == "random":
      return np.random.choice(self.action_space.n)

    elif policy == "optimal":
      return np.argmax(Q)
  
  def give_epsilon_history(self):
    return self.epsilon_array

## DQN

In [42]:
### Paramters
epsilon = 1
gamma = 0.9
epsilon_decay = 0.999

observation_space = 10
action_space= 20

## Environment Initialisation
agent = walk_Agent(observation_space, action_space, epsilon, epsilon_decay)

env_reset()
terminated = False

# BEGIN_YOUR_CODE
observation = env_Q_state()
observation = np.array([observation], dtype='float32')

# Define Action Value model
class DQN_AV(nn.Module):
    def __init__(self):
        super().__init__()
        self.network_arch = nn.Sequential(
            nn.Linear(10, 64),
            nn.ReLU(),
            nn.Linear(64, 30)
        )

    def forward(self, x):
        out = self.network_arch(x)
        return out

AV = DQN_AV().to(device)
#print(model)


# Define Target Action model
class DQN_TA(nn.Module):
    def __init__(self):
        super().__init__()
        self.network_arch = nn.Sequential(
            nn.Linear(10, 64),
            nn.ReLU(),
            nn.Linear(64, 30)
        )

    def forward(self, x):
        out = self.network_arch(x)
        return out

TA = DQN_TA().to(device)
#print(model)

TA.load_state_dict(AV.state_dict())

input = torch.from_numpy(observation)
print("Observation : " , observation)

out = AV(input)
print("Q Values from Action Value : ", out.detach().numpy())

out = TA(input)
print("Q Values from Target Action : ", out.detach().numpy())

Observation :  [[ 5. 17.  0. 13.  3.  6. 17.  0. 13.  9.]]
Q Values from Action Value :  [[ 7.5337780e-01 -1.7502315e+00 -1.1084392e+00  2.6178451e+00
  -3.0137923e+00  2.6336172e+00  6.4629722e-01  1.6161565e+00
  -1.5202831e+00  2.9719837e+00  1.2684149e+00  3.1629076e+00
   2.1339784e+00 -1.4091963e+00  5.4755741e-01 -3.1435776e+00
   1.7057283e+00 -2.1219053e+00 -6.3218683e-01  7.8435099e-01
  -1.8975660e+00  1.4886105e+00 -6.9488382e+00  1.5880821e+00
  -7.3648386e+00 -8.9088053e-01  5.6162828e-01 -2.2720780e+00
  -3.9497465e-03 -3.6295075e+00]]
Q Values from Target Action :  [[ 7.5337780e-01 -1.7502315e+00 -1.1084392e+00  2.6178451e+00
  -3.0137923e+00  2.6336172e+00  6.4629722e-01  1.6161565e+00
  -1.5202831e+00  2.9719837e+00  1.2684149e+00  3.1629076e+00
   2.1339784e+00 -1.4091963e+00  5.4755741e-01 -3.1435776e+00
   1.7057283e+00 -2.1219053e+00 -6.3218683e-01  7.8435099e-01
  -1.8975660e+00  1.4886105e+00 -6.9488382e+00  1.5880821e+00
  -7.3648386e+00 -8.9088053e-01  5.61628

## Train

In [43]:
Q = np.zeros((10, 30))
QT = np.zeros((10, 30))

### Paramters
epsilon = 1
gamma = 0.9
epsilon_decay = 0.99999

policy = "e-greedy"

D = list()
avg_reward = 0
Cum_reward_arr = list()
no_of_timestep = list()

## Environment Initialisation
agent = walk_Agent(observation_space, action_space, epsilon, epsilon_decay)


## Reset
env_reset()
obs = env_Q_state()
obs = np.array([obs], dtype='float32')
terminated = False


## Parameters
miniBatch = 32
C = 10
episodes = 10
M = 500
learning_rate = 0.01

optimizer = optim.AdamW(AV.parameters(), lr= learning_rate, amsgrad=True)

#fig, ax = plt.subplots()
#ax.set_xlabel("iteration")
#ax.set_ylabel("Reward")

x = 0
for m in range(M):
  
  #print("--------------------------------------------------")
  #print("Iteration: " , m, " is in process")
  x = len(Cum_reward_arr)
  for episode in range(episodes):
    itr = 0
    total_reward = 0
    with torch.no_grad():
      while not terminated:
        itr = itr +1
        
        obs = np.array(obs)
        obs = list( obs.flatten())
        Q = AV(torch.tensor(obs, dtype=torch.float))
        
        ## Take Action
        action , act = agent.step(obs, policy, list( Q.numpy() ))
        
        ## Observe reward and state
        new_obs, reward, terminated = env_step(list(action))

        ## Store in D 
        D.append([obs, action, act , reward, new_obs, terminated])

        ## Update State
        obs = new_obs
        
        ## Cumulative reward
        total_reward += reward
        
        if(terminated):
            no_of_timestep.append(itr)
            break
      
      env_reset()
      obs = env_Q_state()
      obs = np.array([obs], dtype='float32')
      terminated = False
      Cum_reward_arr.append(total_reward)
      avg_reward =  0.05 * total_reward + (1-0.05) * avg_reward


  totol_loop = len(D)//miniBatch
  
  
# --------------------------------------------------#
  for k in range(totol_loop):
    ## Sample random from miniBatch
    err = 0
    state_arr = list()
    action_arr = list()
    reward_arr = list()
    next_arr= list()
    exp = list()

    for i in range(miniBatch):
      idx = random.randint(1, len(D) -1)
      st, a, act,  r, st1, terminated = D[idx]

      state_arr.append(st)
      action_arr.append(act)
      #reward_arr.append(r)

      if terminated:
        y = r * torch.ones(10)
      else:
        QT = TA(torch.tensor(st1, dtype=torch.float)) # 30
        grouped_tensor = QT.view(-1, 3)
        # Find maximum value in each group
        max_values = grouped_tensor.max(dim=1).values
        y = r + gamma*max_values

      exp.append(y)  # 10 - 32
    
    state_arr = torch.tensor(state_arr , dtype=torch.float)
    
    #action_arr = torch.tensor(action_arr, dtype=torch.int64)
    #reward_arr = torch.tensor(reward_arr, dtype=torch.float)

    #state_arr = state_arr.view(32)
    #action_arr = action_arr.view(32, 10)

    #print(state_arr.shape)
    #print(action_arr.shape)

    state_action_values = AV(state_arr) # 30 - 32

    out_tensor = torch.empty(10)
    for i in range (10):
      out_tensor[i] = state_action_values[3*i + action_arr[i]]
    
    expected_state_action_values = torch.tensor(exp, dtype=torch.float)

    ## Gradient descent on Q

    # Compute Huber loss
    criterion = nn.MSELoss()
    loss = criterion(out_tensor, expected_state_action_values)
    
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(AV.parameters(), 100)
    optimizer.step()


    ## Update after Every C step
    if k%C == 0:
      TA.load_state_dict(AV.state_dict())
  
  if (m+1)% 10 == 0:
    plt.plot(Cum_reward_arr)
    plt.plot(avg_reward)
    plt.pause(0.001)
      

TypeError: unsupported operand type(s) for +: 'int' and 'list'

In [55]:
env_state()

[0.11887248450722195,
 0.12742912178680993,
 -0.09231670170576244,
 0.0860346155204499,
 -0.11101797276179945,
 -0.11902312619501815,
 0.1272078410658313,
 -0.09237377788730246,
 0.0864412977180074,
 0.111324775703716]

In [50]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.fc = nn.Linear(10, 10)

    def forward(self, x):
        x = self.fc(x)
        x = torch.sign(x)
        return x

model = MyModel()

In [53]:
model(torch.randn(1, 10))

tensor([[ 1., -1., -1., -1., -1.,  1., -1.,  1., -1.,  1.]],
       grad_fn=<SignBackward0>)

## THis is directly learning policy not value

In [54]:
import torch
import torch.nn as nn

class AV(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(10, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x)) 
        x = self.custom_activation(x)
        return x

    def custom_activation(self, x):
        x = torch.where(x < -0.33, torch.tensor(-1.0), x)
        x = torch.where((x >= -0.33) & (x <= 0.33), torch.tensor(0.0), x)
        x = torch.where(x > 0.33, torch.tensor(1.0), x)
        return x

model = MyModel()

In [58]:
model(torch.randn(1, 10))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], grad_fn=<WhereBackward0>)