In [1]:
!pip install gymnasium

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gymnasium
  Downloading gymnasium-0.28.1-py3-none-any.whl (925 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.5/925.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting jax-jumpy>=1.0.0 (from gymnasium)
  Downloading jax_jumpy-1.0.0-py3-none-any.whl (20 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, jax-jumpy, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.28.1 jax-jumpy-1.0.0


In [2]:
!pip install gymnasium[mujoco]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mujoco>=2.3.2 (from gymnasium[mujoco])
  Downloading mujoco-2.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.2->gymnasium[mujoco])
  Downloading glfw-2.5.9-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.8/207.8 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, mujoco
Successfully installed glfw-2.5.9 mujoco-2.3.5


In [None]:
import gymnasium as gym
import numpy as np
import torch
from torch import nn
from torch.nn import functional
import random
from collections import deque

In [14]:
'''
Adding super(classname, self).__init__() in the __init__ method of a class allows you to invoke the constructor of the superclass. This ensures that the
initialization code defined in the superclass is executed before any additional initialization code in the subclass.
'''

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import deque
import gym


class ReplayBuffer(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)

    def store(self, observation, action, reward, next_observation, done):
        self.memory.append([observation, action, reward, next_observation, done])

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        observations, actions, rewards, next_observations, dones = zip(*batch)
        return np.concatenate(observations), np.concatenate(actions), np.array(rewards), np.concatenate(next_observations), np.array(dones)

    def __len__(self):
        return len(self.memory)

class SoftQNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, alpha):
        super(SoftQNetwork, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.alpha = alpha
        self.fc1 = nn.Linear(self.state_dim + self.action_dim, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, observation):
        #x = torch.cat([state, action], dim=-1)
        x = F.relu(self.fc1(observation))
        x = self.fc2(x)
        return x

    #def forward(self, observation):
    #    x = self.fc1(observation)
    #    x = F.relu(x)
    #    x = self.fc2(x)
    #    x = F.relu(x)
    #    x = self.fc3(x)
    #    return x

    def act(self, state):
        with torch.no_grad():
            q_value = self.forward(state)
            v = self.getV(q_value)
            pi_maxent = torch.exp((q_value - v) / self.alpha)
            pi_maxent = pi_maxent / pi_maxent.sum(dim=-1, keepdim=True)
            action = torch.tensor(np.random.uniform(low=-1.0, high=1.0, size=(self.action_dim,)), dtype=torch.float32)
        return q_value, v, pi_maxent, action

    def getV(self, q_value):
        v = self.alpha * torch.log((1 / self.alpha * q_value).exp().sum(dim=-1, keepdim=True))
        return v


def train(buffer, target_model, eval_model, gamma, optimizer, batch_size, loss_fn, count, update_freq):
    observations, actions, rewards, next_observations, dones = buffer.sample(batch_size)

    observations = torch.FloatTensor(observations)
    actions = torch.FloatTensor(actions)
    rewards = torch.FloatTensor(rewards)
    next_observations = torch.FloatTensor(next_observations)
    dones = torch.FloatTensor(dones)

    q_values = eval_model.forward(observations)
    next_q_values = target_model.forward(next_observations)
    next_v_values = target_model.getV(next_q_values)
    q_value = torch.gather(q_values, 1, actions.unsqueeze(1)).squeeze(1)
    expected_q_value = rewards + gamma * (1 - dones) * next_v_values.squeeze(-1)

    loss = (expected_q_value.detach() - q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if count % update_freq == 0:
        target_model.load_state_dict(eval_model.state_dict())

In [15]:
if __name__ == '__main__':
    gamma = 0.99           # discount rate
    learning_rate = 1e-4   # learning rate
    batch_size = 32
    update_freq = 200
    capacity = 5000
    render = False
    episode = 1000
    alpha = 4

    env = gym.make('Pusher-v4')
    env = env.unwrapped
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    target_net = SoftQNetwork(state_dim, action_dim, alpha)
    eval_net = SoftQNetwork(state_dim, action_dim, alpha)
    eval_net.load_state_dict(target_net.state_dict())

    optimizer = torch.optim.Adam(eval_net.parameters(), lr=learning_rate)
    buffer = ReplayBuffer(capacity)
    loss_fn = nn.MSELoss()
    count = 0
    weight_reward = None

    for i in range(episode):
        obs = env.reset()
        reward_total = 0

        if render:
            env.render()

        while True:
            q_value, v, pi_maxent, action = eval_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
            count += 1

            next_obs, reward, done, info, _= env.step(action.numpy())
            buffer.store(obs, action.numpy(), reward, next_obs, done)
            reward_total += reward
            obs = next_obs

            if render:
                env.render()

            if len(buffer.memory) > batch_size:
                train(buffer, target_net, eval_net, gamma, optimizer, batch_size, loss_fn, count, update_freq)

            if done:
                if not weight_reward:
                    weight_reward = reward_total
                else:
                    weight_reward = 0.99 * weight_reward + 0.01 * reward_total

                if (i + 1) % 10 == 0:
                    print('Episode: {}\tReward: {}\tWeighted Reward: {:.3f}'.format(i + 1, reward_total, weight_reward))
                break


RuntimeError: ignored

In [None]:
class ReplayMemory(object):
  def __init__(self, capacity):
    self.capacity = capacity
    self.memory = deque(maxlen=self.capacity)

  def store(self, observation, action, reward, next_observation, terminated):
    observation = np.expand_dims(observation, 0)
    next_observation = np.expand_dims(next_observation, 0)
    self.memory.append([observation, action, reward, next_observation, terminated])

In [None]:
learning_rate = 1e-4
capacity = 50000
render = False
episode = 100
alpha = 4

env = gym.make("MountainCar-v0")
env = env.unwrapped
obs_dim = env.observation_space.shape[0]    # observation size 23
act_dim = env.action_space.n    #   action size 7

In [None]:
env.observation_space

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)

In [None]:
env.action_space

Discrete(3)

In [None]:
target_nn = SoftQLearning(obs_dim, act_dim, alpha)
eval_nn = SoftQLearning(obs_dim, act_dim, alpha)

eval_nn.load_state_dict(target_nn.state_dict())
optimizer = torch.optim.Adam(eval_nn.parameters(), lr=learning_rate)

In [None]:
buffer_mem = ReplayMemory(capacity)
loss_fn = nn.MSELoss()
count = 0

In [None]:
action[1]

IndexError: ignored

In [None]:
weight_reward = None
for i in range(episode)[:5]:
  obs = env.reset()
  total_reward = 0
  if render:
    env.render()
  while True:
    q_value, v, pi_maxent, pi_maxent_, dist, action = eval_nn.act(torch.FloatTensor(np.expand_dims(obs[0], 0)))
    count += 1
    print(q_value, '||', v, '||', pi_maxent, '||', pi_maxent_, '||', dist, '||', action, '\n---------------------------------------\n')
    #print(action.shape)
    count += 1
    next_obs, reward, terminated, truncated, info = env.step(action)
    buffer_mem.store(obs, action, reward, next_obs, terminated)
    total_reward += reward
    obs = next_obs
    if terminated:
      break
    print('\n---------------------------------------\n', count, total_reward)

tensor([[0.0479, 0.0010, 0.0489]]) || tensor([[4.4271]]) || tensor([[0.3346, 0.3307, 0.3347]]) || tensor([[0.3346, 0.3307, 0.3347]]) || Categorical(probs: torch.Size([1, 3])) || 2 
---------------------------------------


---------------------------------------
 2 -1.0


  a = asanyarray(a)


RuntimeError: ignored

In [None]:
env.step?

In [None]:
from torch.distributions import Categorical

# Define the action probabilities
action_probs = torch.tensor([0.1, 0.2, 0.15, 0.05, 0.25, 0.1, 0.15])

# Create a Categorical distribution
dist = Categorical(action_probs)

# Sample an action
action = dist.sample()

print(action.item())  # Print the sampled action

0


In [None]:
dist

Categorical(probs: torch.Size([7]))

In [None]:
import torch
import torch.distributions as D

# Define mean and standard deviation of the distribution
mean = torch.tensor([0.0, 0.0])  # Mean of the distribution
std_dev = torch.tensor([1.0, 1.0])  # Standard deviation of the distribution

# Create a Normal distribution with the specified mean and standard deviation
dist = D.Normal(mean, std_dev)

# Sample an action from the distribution
action = dist.sample()

# Print the sampled action
print(action)

tensor([-0.2513, -1.4424])


In [None]:
dist

Normal(loc: torch.Size([2]), scale: torch.Size([2]))

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import deque
#import gym


class replay_buffer(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)

    def store(self, observation, action, reward, next_observation, done):
        observation = np.expand_dims(observation, 0)
        next_observation = np.expand_dims(next_observation, 0)
        self.memory.append([observation, action, reward, next_observation, done])

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        observations, actions, rewards, next_observations, dones = zip(* batch)
        return np.concatenate(observations, 0), actions, rewards, np.concatenate(next_observations, 0), dones

    def __len__(self):
        return len(self.memory)


class soft_q_net(nn.Module):
    def __init__(self, observation_dim, action_dim, alpha):
        super(soft_q_net, self).__init__()
        self.observation_dim = observation_dim
        self.action_dim = action_dim
        self.alpha = alpha
        self.fc1 = nn.Linear(self.observation_dim, 64)
        self.fc2 = nn.Linear(64, 256)
        self.fc3 = nn.Linear(256, self.action_dim)

    def forward(self, observation):
        x = self.fc1(observation)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x

    def act(self, observation):
        with torch.no_grad():
            q_value = self.forward(observation)
            v = self.getV(q_value)
            pi_maxent = torch.exp((q_value - v) / self.alpha)
            pi_maxent = pi_maxent / pi_maxent.sum(dim=-1, keepdim=True)

            #if pi_maxent[0][0]==float('nan') or pi_maxent[0][1]==float('nan'):
            #  print(q_value)
            #  #print('q_value: ', q_value, '\nv: ', v, '\npi_maxent: ', pi_maxent)
            #  act1 = np.random.uniform(0, 1)
            #  act2 = 1 - act1
            #  pi_maxent[0][0] = act1
            #  pi_maxent[0][1] = act2
            #  dist = torch.distributions.Categorical(pi_maxent)
            #else:
            #  dist = torch.distributions.Categorical(pi_maxent)
            dist = torch.distributions.Categorical(pi_maxent)
            action = dist.sample().item()
        return action

    def getV(self, q_value):
        v = self.alpha * torch.log((1 / self.alpha * q_value).exp().sum(dim=-1, keepdim=True))
        return v


def train(buffer, target_model, eval_model, gamma, optimizer, batch_size, loss_fn, count, update_freq):
    observation, action, reward, next_observation, done = buffer.sample(batch_size)

    observation = torch.FloatTensor(observation)
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    next_observation = torch.FloatTensor(next_observation)
    done = torch.FloatTensor(done)

    q_values = eval_model.forward(observation)
    next_q_values = target_model.forward(next_observation)
    next_v_values = target_model.getV(next_q_values)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    expected_q_value = reward + gamma * (1 - done) * next_v_values.squeeze(-1)

    #loss = loss_fn(q_value, expected_q_value.detach())
    loss = (expected_q_value.detach() - q_value).pow(2)
    loss = loss.mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if count % update_freq == 0:
        target_model.load_state_dict(eval_model.state_dict())


if __name__ == '__main__':
    gamma = 0.99
    learning_rate = 1e-4
    batch_size = 32
    update_freq = 200
    capacity = 50000
    render = False
    episode = 1000 #100000
    alpha = 4

    #env = gym.make('CartPole-v0')
    env = gym.make('Pusher-v4')
    env = env.unwrapped
    observation_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    target_net = soft_q_net(observation_dim, action_dim, alpha)
    eval_net = soft_q_net(observation_dim, action_dim, alpha)
    eval_net.load_state_dict(target_net.state_dict())
    optimizer = torch.optim.Adam(eval_net.parameters(), lr=learning_rate)
    buffer = replay_buffer(capacity)
    loss_fn = nn.MSELoss()
    count = 0

    weight_reward = None
    for i in range(episode):
        obs = env.reset()
        reward_total = 0
        if render:
            env.render()
        while True:
            action = eval_net.act(torch.FloatTensor(np.expand_dims(obs[0], 0)))
            count += 1
            next_obs, reward, done, info, _ = env.step(action)
            buffer.store(obs, action, reward, next_obs, done)
            reward_total += reward
            obs = next_obs
            if render:
                env.render()
            if len(buffer.memory) > batch_size:
                train(buffer, target_net, eval_net, gamma, optimizer, batch_size, loss_fn, count, update_freq)

            if done:
                if not weight_reward:
                    weight_reward = reward_total
                else:
                    weight_reward = 0.99 * weight_reward + 0.01 * reward_total
                if (i+1) % 10 == 0:
                    print('episode: {}\treward: {}\tweight_reward: {:.3f}'.format(i+1, reward_total, weight_reward))
                break


ValueError: ignored

In [17]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class SoftQNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(SoftQNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def store(self, transition):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        states, actions, rewards, next_states, dones = zip(*[self.buffer[i] for i in indices])
        return (
            torch.FloatTensor(states),
            torch.FloatTensor(actions),
            torch.FloatTensor(rewards),
            torch.FloatTensor(next_states),
            torch.FloatTensor(dones)
        )

    def __len__(self):
        return len(self.buffer)


def soft_q_update(buffer, q_network, target_q_network, optimizer, gamma, alpha, batch_size):
    states, actions, rewards, next_states, dones = buffer.sample(batch_size)

    values = q_network(states, actions)
    next_values = target_q_network(next_states, actions)
    expected_values = rewards + gamma * (1 - dones) * next_values.detach()

    q_loss = F.mse_loss(values, expected_values)

    optimizer.zero_grad()
    q_loss.backward()
    optimizer.step()

    soft_update(q_network, target_q_network, alpha)


def soft_update(source_network, target_network, alpha):
    for target_param, source_param in zip(target_network.parameters(), source_network.parameters()):
        target_param.data.copy_(alpha * source_param.data + (1 - alpha) * target_param.data)


def train_soft_q_learning(env, q_network, target_q_network, optimizer, buffer, gamma, alpha, batch_size, max_episodes):
    total_steps = 0

    for episode in range(1, max_episodes + 1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()

        while not done:
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)

            buffer.store((state, action, reward, next_state, done))

            episode_reward += reward
            episode_steps += 1
            total_steps += 1

            if len(buffer) >= batch_size:
                soft_q_update(buffer, q_network, target_q_network, optimizer, gamma, alpha, batch_size)

            state = next_state

        print(f"Episode {episode}: Reward: {episode_reward}, Steps: {episode_steps}")

    env.close()


if __name__ == "__main__":
    env = gym.make("Pusher-v4")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    hidden_dim = 128
    capacity = 10000
    batch_size = 128
    gamma = 0.99
    alpha = 0.5
    max_episodes = 1000

    q_network = SoftQNetwork(state_dim, action_dim, hidden_dim)
    target_q_network = SoftQNetwork(state_dim, action_dim, hidden_dim)
    target_q_network.load_state_dict(q_network.state_dict())

    optimizer = optim.Adam(q_network.parameters(), lr=0.001)
    buffer = ReplayBuffer(capacity)

    train_soft_q_learning(env, q_network, target_q_network, optimizer, buffer, gamma, alpha, batch_size, max_episodes)


Episode 1: Reward: -148.00540704526807, Steps: 100


  torch.FloatTensor(states),
  q_loss = F.mse_loss(values, expected_values)


Episode 2: Reward: -148.21274155954217, Steps: 100
Episode 3: Reward: -151.33541357304625, Steps: 100
Episode 4: Reward: -142.94611849432843, Steps: 100
Episode 5: Reward: -143.69700038713898, Steps: 100
Episode 6: Reward: -152.30443042529657, Steps: 100
Episode 7: Reward: -146.09847616771478, Steps: 100
Episode 8: Reward: -136.29330986773854, Steps: 100
Episode 9: Reward: -133.26754849333935, Steps: 100
Episode 10: Reward: -147.2130658055694, Steps: 100
Episode 11: Reward: -154.66053185864877, Steps: 100
Episode 12: Reward: -149.73284543011607, Steps: 100
Episode 13: Reward: -137.8332685335201, Steps: 100
Episode 14: Reward: -139.98263710736353, Steps: 100
Episode 15: Reward: -147.56984162676372, Steps: 100
Episode 16: Reward: -153.33147186329222, Steps: 100
Episode 17: Reward: -148.45258005163163, Steps: 100
Episode 18: Reward: -155.03151763761386, Steps: 100
Episode 19: Reward: -157.50368878114577, Steps: 100
Episode 20: Reward: -160.29162392869267, Steps: 100
Episode 21: Reward: -1

KeyboardInterrupt: ignored