In [2]:
!pip install gym[classic_control]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0 (from gym[classic_control])
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
  Attempting uninstall: pygame
    Found existing installation: pygame 2.3.0
    Uninstalling pygame-2.3.0:
      Successfully uninstalled pygame-2.3.0
Successfully installed pygame-2.1.0


In [70]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import deque
import gym

In [71]:
## Defining the Soft Q-Learning (SQL) network architecture

class Soft_Q_Net(nn.Module):
  """Soft Q-network for a discrete action space.

  A three-layer MLP maps an observation to one soft Q value per action.
  The soft state value and the maximum-entropy policy are derived from
  those Q values with temperature ``alpha``:

      V(s)    = alpha * log(sum_a exp(Q(s, a) / alpha))
      pi(a|s) = exp((Q(s, a) - V(s)) / alpha) = softmax(Q(s, .) / alpha)

  Args:
    observation_dim: size of the observation vector (input features).
    action_dim: number of discrete actions (output features).
    alpha: entropy/temperature coefficient.
  """

  def __init__(self, observation_dim, action_dim, alpha):
    super(Soft_Q_Net, self).__init__()
    self.alpha = alpha
    self.observation_dim = observation_dim
    self.action_dim = action_dim
    # network architecture: observation -> 64 -> 256 -> one Q value per action
    self.FC1 = nn.Linear(self.observation_dim, 64)    # input layer
    self.FC2 = nn.Linear(64, 256)                     # hidden layer
    self.FC3 = nn.Linear(256, self.action_dim)        # output layer

  def forward_pass(self, observation):
    """Return the soft Q values for `observation` (last dim = observation_dim)."""
    x = F.relu(self.FC1(observation))
    x = F.relu(self.FC2(x))
    return self.FC3(x)

  def act(self, observation):
    """Sample one action from the maximum-entropy policy, without tracking gradients.

    `.item()` is called on the sampled action, so `observation` must describe
    exactly one state (e.g. shape (1, observation_dim)).

    Returns:
      (q_val, v_val, pi_maxent, action): the soft Q values, the soft V value,
      the action probabilities and the sampled action as a Python int.
    """
    with torch.no_grad():
      q_val = self.forward_pass(observation)
      v_val = self.get_V_val(q_val)
      # softmax(Q / alpha) equals exp((Q - V) / alpha) but never overflows,
      # so no NaN patching is needed for large Q values.
      pi_maxent = F.softmax(q_val / self.alpha, dim=-1)
      if not torch.isfinite(pi_maxent).all():
        # The Q values themselves are non-finite (diverged network): keep
        # acting with a uniform policy instead of crashing in Categorical.
        pi_maxent = torch.full_like(pi_maxent, 1.0 / pi_maxent.shape[-1])
      action = torch.distributions.Categorical(probs=pi_maxent).sample().item()
    return q_val, v_val, pi_maxent, action

  def get_V_val(self, q_value):
    """Return the soft value alpha * logsumexp(Q / alpha) over the last dim (kept as size 1)."""
    # torch.logsumexp subtracts the maximum internally; the naive
    # log(sum(exp(.))) overflows to inf once Q / alpha gets large.
    return self.alpha * torch.logsumexp(q_value / self.alpha, dim=-1, keepdim=True)

In [72]:
class buffer_memory(object):
  """Bounded FIFO replay memory of (obs, action, reward, next_obs, done) transitions."""

  def __init__(self, memory_size):
    # deque(maxlen=...) drops the oldest transition once the memory is full
    self.memory_size = memory_size
    self.memory = deque(maxlen=memory_size)

  def store(self, observation, action, reward, next_observation, done):
    """Append one transition; both observations get a leading batch axis of size 1."""
    transition = [
      np.expand_dims(observation, axis=0),
      action,
      reward,
      np.expand_dims(next_observation, axis=0),
      done,
    ]
    self.memory.append(transition)

  def sample(self, batch_size):
    """Draw `batch_size` distinct transitions uniformly at random.

    Returns (observations, actions, rewards, next_observations, dones): the two
    observation entries are arrays stacked along axis 0, the rest are tuples.
    """
    picked = random.sample(self.memory, batch_size)
    obs_rows, actions, rewards, next_obs_rows, dones = zip(*picked)
    return (
      np.concatenate(obs_rows, axis=0),       # batch_size stored rows joined into one 2d array
      actions,
      rewards,
      np.concatenate(next_obs_rows, axis=0),
      dones,
    )



In [73]:
def train(buffer, target_model, eval_model, gamma, optimizer, batch_size, loss_fn, count, update_freq):
  """Run one soft Q-learning update of `eval_model` on a sampled minibatch.

  Args:
    buffer: replay memory; `buffer.sample(batch_size)` must return
      (observations, actions, rewards, next_observations, dones).
    target_model: network providing the bootstrap value V(s'); it is never
      updated by the optimizer, only overwritten with `eval_model`'s weights.
    eval_model: network being optimised.
    gamma: discount factor.
    optimizer: optimizer over `eval_model`'s parameters.
    batch_size: number of transitions to sample.
    loss_fn: criterion called as `loss_fn(prediction, target)` and expected to
      return a scalar; if None, the mean squared error is used.
    count: running step counter used to schedule the target-network sync.
    update_freq: the target network is synced whenever `count % update_freq == 0`.

  Returns:
    The scalar minibatch loss (computed before the optimizer step), detached
    from the autograd graph.
  """
  # collect a batch of random samples
  observations, actions, rewards, next_observations, dones = buffer.sample(batch_size)

  # convert each array/tuple to a tensor
  observations = torch.FloatTensor(observations)
  actions = torch.LongTensor(actions)
  rewards = torch.FloatTensor(rewards)
  next_observations = torch.FloatTensor(next_observations)
  dones = torch.FloatTensor(dones)

  # Q(s_t, a_t): Q value of the action that was actually taken in each transition
  q_vals = eval_model.forward_pass(observations)
  q_vals = q_vals.gather(1, actions.unsqueeze(1)).squeeze(1)

  # Bootstrap target r + gamma * (1 - done) * V_target(s_{t+1}). It is a constant
  # w.r.t. the optimised parameters, so it is built without an autograd graph.
  with torch.no_grad():
    next_q_vals = target_model.forward_pass(next_observations)
    next_v_vals = target_model.get_V_val(next_q_vals)
    expected_q_vals = rewards + gamma * (1 - dones) * next_v_vals.squeeze(-1)

  # calculate loss
  if loss_fn is None:
    loss = (expected_q_vals - q_vals).pow(2).mean()
  else:
    loss = loss_fn(q_vals, expected_q_vals)

  optimizer.zero_grad()         # clear gradients left from the previous step
  loss.backward()               # compute gradients of the loss w.r.t. eval_model
  optimizer.step()              # perform a single parameter update

  if count % update_freq == 0:  # periodically copy eval_model's weights into target_model
    target_model.load_state_dict(eval_model.state_dict())

  # detached so callers that keep or log the value do not hold on to the graph
  return loss.detach()

In [80]:
# --- learning ---
gamma = 0.99            # discount rate
learning_rate = 1e-4    # optimizer step size
alpha = 4               # entropy/temperature coefficient

# --- replay / update schedule ---
batch_size = 32         # transitions per training minibatch
capacity = 5_000        # maximum number of transitions kept in the buffer memory
update_freq = 200       # environment steps between target-network refreshes

# --- run control ---
episode = 1_000         # total number of episodes to run
render = False          # whether to render the CartPole window

In [75]:
# CartPole-v1 accessed through `.unwrapped`, i.e. the bare environment without the wrappers gym.make adds
env = gym.make('CartPole-v1').unwrapped
observation_dim = env.observation_space.shape[0]    # length of the observation vector
action_dim = env.action_space.n                     # number of discrete actions
print('{} || {}'.format(observation_dim, action_dim))

4 || 2


  deprecation(
  deprecation(


In [81]:
# Build the target network first and the evaluation network second, then copy the
# target's freshly initialised weights and biases into the evaluation network so
# both start from identical parameters.
target_net = Soft_Q_Net(observation_dim=observation_dim, action_dim=action_dim, alpha=alpha)
eval_net = Soft_Q_Net(observation_dim=observation_dim, action_dim=action_dim, alpha=alpha)
eval_net.load_state_dict(target_net.state_dict())

# sanity check: weight shapes of the first two linear layers
print(target_net.FC1.weight.shape)
print(target_net.FC2.weight.shape)

torch.Size([64, 4])
torch.Size([256, 64])


In [82]:
# replay memory holding at most `capacity` transitions
buffer = buffer_memory(memory_size=capacity)
# mean-squared-error criterion handed to train()
loss_fn = nn.MSELoss()
# Adam over the evaluation network's parameters only
optimizer = torch.optim.Adam(params=eval_net.parameters(), lr=learning_rate)

In [83]:
# Main training loop: act with the evaluation network, store every transition in
# the replay buffer and run one train() update per environment step once the
# buffer holds more than `batch_size` transitions.
count = 0               # total number of environment steps taken so far
weight_reward = None    # exponentially weighted average of the episode rewards
loss = None             # most recent training loss (None until the first update)

for i in range(episode):
  # newer gym versions return (observation, info) from reset(); older ones return the observation only
  reset_out = env.reset()
  obs = reset_out[0] if isinstance(reset_out, tuple) else reset_out
  reward_total = 0      # total reward collected in this episode
  if render:
    env.render()
  while True:
    obs_2d_tensor = torch.FloatTensor(np.expand_dims(obs, 0))               # 1d observation -> batch of one -> float tensor
    q_val, v_val, pi_maxent, action = eval_net.act(obs_2d_tensor)           # sample an action from the agent's policy
    count += 1
    # the 5-value step result is (observation, reward, terminated, truncated, info)
    next_obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated                                          # the episode ends in either case
    # Only a true termination may cut off the bootstrap term in train();
    # a time-limit truncation must not.
    buffer.store(obs, action, reward, next_obs, terminated)
    reward_total += reward
    obs = next_obs
    if render:
      env.render()
    if len(buffer.memory) > batch_size:                                     # train once enough transitions are stored
      loss = train(buffer, target_net, eval_net, gamma, optimizer, batch_size, loss_fn, count, update_freq)
    if done:
      if weight_reward is None:                                             # first finished episode seeds the average
        weight_reward = reward_total
      else:
        weight_reward = 0.99 * weight_reward + 0.01 * reward_total
      if (i+1) % 10 == 0:
        loss_value = float('nan') if loss is None else float(loss)          # no update may have happened yet
        print('episode: {}\treward: {}\tweight_reward: {:.3f}\tepisode loss: {:.3f}'.format(i+1, reward_total, weight_reward, loss_value))
      break


episode: 10	reward: 50.0	weight_reward: 14.756	episode: 12.042
episode: 20	reward: 14.0	weight_reward: 15.179	episode: 13.304
episode: 30	reward: 30.0	weight_reward: 16.046	episode: 3.320
episode: 40	reward: 18.0	weight_reward: 16.707	episode: 5.389
episode: 50	reward: 21.0	weight_reward: 17.260	episode: 0.886
episode: 60	reward: 26.0	weight_reward: 18.058	episode: 27.478
episode: 70	reward: 19.0	weight_reward: 17.893	episode: 31.516
episode: 80	reward: 14.0	weight_reward: 18.150	episode: 43.040
episode: 90	reward: 16.0	weight_reward: 18.129	episode: 28.592
episode: 100	reward: 22.0	weight_reward: 18.593	episode: 2.018
episode: 110	reward: 49.0	weight_reward: 20.065	episode: 48.784
episode: 120	reward: 35.0	weight_reward: 20.414	episode: 29.981
episode: 130	reward: 20.0	weight_reward: 22.226	episode: 9.838
episode: 140	reward: 42.0	weight_reward: 24.606	episode: 42.610
episode: 150	reward: 69.0	weight_reward: 26.311	episode: 8.045
episode: 160	reward: 38.0	weight_reward: 28.311	episode

In [12]:
len(buffer.memory)

2162

In [13]:
buffer.memory[10]

[array([[ 0.0013592 ,  0.01534796, -0.02282282, -0.17622472]],
       dtype=float32),
 1,
 1.0,
 array([[ 0.00166616,  0.21078897, -0.02634731, -0.47601923]],
       dtype=float32),
 False]

In [50]:
buffer.memory[0][0]

  and should_run_async(code)


array([[-0.03908823,  0.04060626, -0.00110443,  0.01397937]],
      dtype=float32)

In [14]:
sampl_batch = random.sample(buffer.memory, 10)

In [15]:
sampl_batch[:5]

[[array([[ 0.03625979,  0.1662466 ,  0.02094655, -0.2348625 ]],
        dtype=float32),
  0,
  1.0,
  array([[ 0.03958472, -0.02916827,  0.0162493 ,  0.06435332]],
        dtype=float32),
  False],
 [array([[-0.15765925, -0.43256417,  0.1352532 ,  0.7636303 ]],
        dtype=float32),
  0,
  1.0,
  array([[-0.16631053, -0.6292639 ,  0.15052581,  1.0956285 ]],
        dtype=float32),
  False],
 [array([[ 0.04245607, -0.55654204,  0.01548218,  0.9868609 ]],
        dtype=float32),
  1,
  1.0,
  array([[ 0.03132522, -0.36163083,  0.03521939,  0.69908065]],
        dtype=float32),
  False],
 [array([[ 0.0777887 ,  0.38953334, -0.08042865, -0.59207433]],
        dtype=float32),
  1,
  1.0,
  array([[ 0.08557937,  0.58568364, -0.09227013, -0.9089692 ]],
        dtype=float32),
  False],
 [array([[-0.02357133,  0.42358264, -0.02605627, -0.5804774 ]],
        dtype=float32),
  1,
  1.0,
  array([[-0.01509967,  0.61905986, -0.03766582, -0.88125336]],
        dtype=float32),
  False]]

In [16]:
observations, actions, rewards, next_observations, dones = zip(*sampl_batch)

In [17]:
dones

(False, False, False, False, False, False, False, False, False, False)

In [18]:
observations

(array([[ 0.03625979,  0.1662466 ,  0.02094655, -0.2348625 ]],
       dtype=float32),
 array([[-0.15765925, -0.43256417,  0.1352532 ,  0.7636303 ]],
       dtype=float32),
 array([[ 0.04245607, -0.55654204,  0.01548218,  0.9868609 ]],
       dtype=float32),
 array([[ 0.0777887 ,  0.38953334, -0.08042865, -0.59207433]],
       dtype=float32),
 array([[-0.02357133,  0.42358264, -0.02605627, -0.5804774 ]],
       dtype=float32),
 array([[ 0.06150035,  0.15353021, -0.01669684, -0.276384  ]],
       dtype=float32),
 array([[ 0.0156896 ,  0.19253561,  0.00867473, -0.25639054]],
       dtype=float32),
 array([[ 0.01074169, -0.03272189,  0.04716135,  0.04909102]],
       dtype=float32),
 array([[-0.01720894, -0.00964926, -0.08673678, -0.15370457]],
       dtype=float32),
 array([[ 0.00637893, -0.13761407, -0.06826991,  0.05828309]],
       dtype=float32))

In [65]:
observations[0]

  and should_run_async(code)


array([[ 0.07052451,  0.43556803, -0.09915239, -0.6625491 ]],
      dtype=float32)

In [19]:
observation = np.concatenate(observations, 0)
observation

array([[ 0.03625979,  0.1662466 ,  0.02094655, -0.2348625 ],
       [-0.15765925, -0.43256417,  0.1352532 ,  0.7636303 ],
       [ 0.04245607, -0.55654204,  0.01548218,  0.9868609 ],
       [ 0.0777887 ,  0.38953334, -0.08042865, -0.59207433],
       [-0.02357133,  0.42358264, -0.02605627, -0.5804774 ],
       [ 0.06150035,  0.15353021, -0.01669684, -0.276384  ],
       [ 0.0156896 ,  0.19253561,  0.00867473, -0.25639054],
       [ 0.01074169, -0.03272189,  0.04716135,  0.04909102],
       [-0.01720894, -0.00964926, -0.08673678, -0.15370457],
       [ 0.00637893, -0.13761407, -0.06826991,  0.05828309]],
      dtype=float32)

In [64]:
actions

(1, 1, 1, 1, 1, 0, 0, 1, 1, 1)

In [29]:
action = torch.LongTensor(actions)
action

  and should_run_async(code)


tensor([0, 0, 1, 1, 1, 1, 1, 0, 1, 1])

In [70]:
observation[0]

  and should_run_async(code)


array([ 0.07052451,  0.43556803, -0.09915239, -0.6625491 ], dtype=float32)

In [20]:
observation_tensor = torch.FloatTensor(observation)
observation_tensor

tensor([[ 0.0363,  0.1662,  0.0209, -0.2349],
        [-0.1577, -0.4326,  0.1353,  0.7636],
        [ 0.0425, -0.5565,  0.0155,  0.9869],
        [ 0.0778,  0.3895, -0.0804, -0.5921],
        [-0.0236,  0.4236, -0.0261, -0.5805],
        [ 0.0615,  0.1535, -0.0167, -0.2764],
        [ 0.0157,  0.1925,  0.0087, -0.2564],
        [ 0.0107, -0.0327,  0.0472,  0.0491],
        [-0.0172, -0.0096, -0.0867, -0.1537],
        [ 0.0064, -0.1376, -0.0683,  0.0583]])

In [23]:
q_vals = eval_net.forward_pass(observation_tensor)

  and should_run_async(code)


In [24]:
q_vals

tensor([[0.0873, 0.0421],
        [0.0607, 0.0558],
        [0.0572, 0.0555],
        [0.1014, 0.0523],
        [0.1045, 0.0474],
        [0.0863, 0.0464],
        [0.0893, 0.0428],
        [0.0708, 0.0329],
        [0.0786, 0.0423],
        [0.0624, 0.0382]], grad_fn=<AddmmBackward0>)

In [26]:
# soft state values for the sampled batch (the method defined on Soft_Q_Net is get_V_val)
v_vals = eval_net.get_V_val(q_vals)
v_vals

  and should_run_async(code)


tensor([[2.8374],
        [2.8309],
        [2.8290],
        [2.8495],
        [2.8486],
        [2.8390],
        [2.8387],
        [2.8245],
        [2.8331],
        [2.8229]], grad_fn=<MulBackward0>)

In [43]:
v_vals.squeeze(-1)

tensor([2.8374, 2.8309, 2.8290, 2.8495, 2.8486, 2.8390, 2.8387, 2.8245, 2.8331,
        2.8229], grad_fn=<SqueezeBackward1>)

In [33]:
action

tensor([0, 0, 1, 1, 1, 1, 1, 0, 1, 1])

In [32]:
action.unsqueeze(1) # unsqueeze add extra dim

tensor([[0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1]])

In [36]:
action.unsqueeze(0)

tensor([[0, 0, 1, 1, 1, 1, 1, 0, 1, 1]])

In [41]:
action.unsqueeze(1)

tensor([[0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1]])

In [42]:
q_vals.gather(1, action.unsqueeze(1))

tensor([[0.0873],
        [0.0607],
        [0.0555],
        [0.0523],
        [0.0474],
        [0.0464],
        [0.0428],
        [0.0708],
        [0.0423],
        [0.0382]], grad_fn=<GatherBackward0>)

In [38]:
q_vals.gather(1, action.unsqueeze(1)).squeeze(1)

tensor([0.0873, 0.0607, 0.0555, 0.0523, 0.0474, 0.0464, 0.0428, 0.0708, 0.0423,
        0.0382], grad_fn=<SqueezeBackward1>)

In [39]:
q_val = q_vals.gather(1, action.unsqueeze(1)).squeeze(1)
q_val

  and should_run_async(code)


tensor([0.0873, 0.0607, 0.0555, 0.0523, 0.0474, 0.0464, 0.0428, 0.0708, 0.0423,
        0.0382], grad_fn=<SqueezeBackward1>)

In [45]:
q_vals

tensor([[0.0873, 0.0421],
        [0.0607, 0.0558],
        [0.0572, 0.0555],
        [0.1014, 0.0523],
        [0.1045, 0.0474],
        [0.0863, 0.0464],
        [0.0893, 0.0428],
        [0.0708, 0.0329],
        [0.0786, 0.0423],
        [0.0624, 0.0382]], grad_fn=<AddmmBackward0>)

In [46]:
q_vals.detach?

In [52]:
# Step-by-step replica of the loss computation inside train(), on a batch of 10.
observations, actions, rewards, next_observations, dones = buffer.sample(10)

# convert each array/tuple to a tensor
observations = torch.FloatTensor(observations)
actions = torch.LongTensor(actions)
rewards = torch.FloatTensor(rewards)
next_observations = torch.FloatTensor(next_observations)
dones = torch.FloatTensor(dones)

q_vals = eval_net.forward_pass(observations)                                      # Q values of every action for each sampled observation (eval network)
next_q_vals = target_net.forward_pass(next_observations)                          # Q values of every action for each next observation (target network)
next_v_vals = target_net.get_V_val(next_q_vals)                                   # soft V value of each next observation (target network)

q_vals = q_vals.gather(1, actions.unsqueeze(1)).squeeze(1)                        # keep only the Q value of the action actually taken
expected_q_vals = rewards + gamma * (1 - dones) * next_v_vals.squeeze(-1)         # bootstrap target; zero future value where done

# calculate loss: mean squared error against the detached target
loss = (expected_q_vals.detach() - q_vals).pow(2)
loss = loss.mean()

  and should_run_async(code)


In [57]:
optimizer.zero_grad()   # must be called; the bare attribute only references the method and clears nothing

In [53]:
loss

tensor(12.8311, grad_fn=<MeanBackward0>)

In [54]:
loss.backward()   # compute gradients so that the following optimizer.step() has something to apply

In [60]:
optimizer.step()