In [3]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn.utils import clip_grad_norm_
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv


In [4]:
# Define your generator model
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.fc1 = nn.Linear(100, 128)
        self.fc2 = nn.Linear(128, 8)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        return x

# Define your supervised model


class Supervised(nn.Module):
    def __init__(self):
        super(Supervised, self).__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x


In [5]:
# Define your reinforcement learning environment
class RL_Environment(gym.Env):
    def __init__(self, generator_model, supervised_model):
        super(RL_Environment, self).__init__()
        self.generator_model = generator_model
        self.supervised_model = supervised_model
        self.state = torch.randn(100)
        self.action_space = gym.spaces.Discrete(8)
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(100,), dtype=np.float32)

    def step(self, action):
        self.state = torch.randn(100)
        output = self.generator_model(self.state)
        output[action] = torch.normal(mean=output[action], std=0.1)
        reward = self.supervised_model(output)
        done = False
        info = {}
        return self.state, reward, done, info

    def reset(self):
        self.state = torch.randn(100)
        return self.state

In [6]:
# Create the generator, supervised model, and RL environment
generator = Generator()
supervised = Supervised()
env = DummyVecEnv([lambda: RL_Environment(generator, supervised)])

In [8]:
generator

Generator(
  (fc1): Linear(in_features=100, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=8, bias=True)
)

In [7]:
generator.state_dict()

OrderedDict([('fc1.weight',
              tensor([[ 0.0790, -0.0504,  0.0015,  ...,  0.0275,  0.0671, -0.0476],
                      [-0.0224,  0.0156, -0.0068,  ..., -0.0809, -0.0520,  0.0032],
                      [ 0.0374, -0.0997,  0.0708,  ..., -0.0996, -0.0669, -0.0546],
                      ...,
                      [ 0.0011, -0.0827, -0.0155,  ...,  0.0366,  0.0976,  0.0357],
                      [-0.0547,  0.0419, -0.0867,  ...,  0.0893, -0.0919,  0.0829],
                      [-0.0208,  0.0647, -0.0164,  ...,  0.0685, -0.0979, -0.0175]])),
             ('fc1.bias',
              tensor([-0.0380,  0.0879,  0.0666, -0.0063, -0.0430, -0.0944,  0.0605,  0.0428,
                      -0.0398,  0.0429, -0.0575, -0.0127,  0.0801,  0.0998, -0.0654,  0.0181,
                      -0.0815,  0.0084, -0.0704,  0.0989, -0.0003, -0.0740,  0.0155, -0.0348,
                      -0.0128,  0.0215,  0.0387,  0.0007,  0.0400,  0.0487, -0.0617,  0.0661,
                      -0.0243, -0.01

In [9]:
# Train the generator using PPO
model = PPO('MlpPolicy', env, verbose=1)

Using cuda device


In [10]:
model.learn(total_timesteps=10000)

-----------------------------
| time/              |      |
|    fps             | 880  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 779          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0054835184 |
|    clip_fraction        | 0.0129       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.08        |
|    explained_variance   | -1.28        |
|    learning_rate        | 0.0003       |
|    loss                 | 27.1         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0216      |
|    value_loss           | 72.8         |
------------------------------------------
----------------

<stable_baselines3.ppo.ppo.PPO at 0x7f1f787f77f0>

In [17]:
model.policy

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=8, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)

In [22]:
generator_model = model.policy.to('cpu').mlp_extractor.to('cpu')
action_net = model.policy.to('cpu').action_net.to('cpu')

In [15]:
generator_model

MlpExtractor(
  (policy_net): Sequential(
    (0): Linear(in_features=100, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
  )
  (value_net): Sequential(
    (0): Linear(in_features=100, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
  )
)

In [25]:
random_input = torch.randn(100)
action_net(generator_model(random_input)[0])

tensor([ 0.1221,  0.2582,  0.2215, -0.4020,  0.2164,  0.2142, -0.3613, -0.1808],
       grad_fn=<AddBackward0>)

In [None]:
import torch
from stable_baselines3 import PPO

# Create an instance of the PPO model and train it
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

# Retrieve the generator from the policy
policy = model.policy
mlp_extractor = policy.mlp_extractor
action_net = policy.action_net

# Generate an input tensor with dimensions 1 x 100
input_tensor = torch.randn(1, 100)


In [27]:


# Pass the input tensor through the mlp_extractor to get the output tensor
mlp_output = mlp_extractor(input_tensor.to(device))

# Pass the mlp_output through the action_net to get the action logits
action_logits = action_net(mlp_output.to('cpu'))

# Apply the softmax function to the action logits to get the action probabilities
action_probabilities = torch.softmax(action_logits, dim=1)

# Print the output tensor and action probabilities
print("Action logits dimensions:", action_logits.shape)
print("Action probabilities dimensions:", action_probabilities.shape)


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)