In [3]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import import_ipynb


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np

In [2]:
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [3]:
import gym_examples

In [4]:
from gymnasium.wrappers import FlattenObservation, NormalizeObservation

In [5]:
from gymnasium.spaces.utils import unflatten


In [6]:
import gymnasium as gym
import numpy as np


class NormalizeFloatObservation(gym.Wrapper):
    def __init__(self, env, binary_indices, float_indices, epsilon=1e-8):
        """
        A custom Gymnasium wrapper to normalize only float values in the observation space.

        Args:
            env (gym.Env): The environment to wrap.
            binary_indices (list): Indices of binary features in the observation.
            float_indices (list): Indices of float features in the observation.
            epsilon (float): A small value to prevent division by zero during normalization.
        """
        super().__init__(env)
        self.binary_indices = binary_indices
        self.float_indices = float_indices
        self.epsilon = epsilon

        # Initialize normalization statistics for float values
        self.float_mean = np.array([5.0, 0.0])#np.zeros(len(float_indices))
        self.float_std =np.array([10, 4000])# np.ones(len(float_indices))

    def reset(self, **kwargs):
        """
        Resets the environment and normalizes the observation.

        Returns:
            np.ndarray: The normalized observation.
            dict: Additional information from the environment reset.
        """
        obs, info = self.env.reset(**kwargs)
        return self._normalize(obs), info

    def step(self, action):
        """
        Steps through the environment and normalizes the observation.

        Args:
            action: The action to take in the environment.

        Returns:
            tuple: Normalized observation, reward, termination flag, truncation flag, and additional info.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        return self._normalize(obs), reward, terminated, truncated, info

    def _normalize(self, obs):
        """
        Normalizes only the float values in the observation.

        Args:
            obs (np.ndarray): The raw observation.

        Returns:
            np.ndarray: The normalized observation.
        """
        normalized_obs = obs.copy()

        # Normalize float values using mean and std
        normalized_obs[self.float_indices] = (
            (obs[self.float_indices] - self.float_mean) /
            (self.float_std + self.epsilon)
        )

        # Binary values remain unchanged
        return normalized_obs

    def update_statistics(self, observations):
        """
        Updates normalization statistics (mean and std) based on a batch of observations.

        Args:
            observations (np.ndarray): A batch of observations to calculate statistics from.
                                        Assumes shape (batch_size, num_features).
        """
        # Extract only float values from batch observations
        float_values = observations[:, self.float_indices]

        # Update mean and standard deviation for float features
        self.float_mean = np.mean(float_values, axis=0)
        self.float_std = np.std(float_values, axis=0)

    def unnormalize(self, normalized_obs):
        """
        Unnormalizes previously normalized float values.

        Args:
            normalized_obs (np.ndarray): The normalized observation.

        Returns:
            np.ndarray: The unnormalized observation.
        """
        unnormalized_obs = normalized_obs.copy()

        # Reverse normalization for float values
        unnormalized_obs[self.float_indices] = (
            normalized_obs[self.float_indices] * (self.float_std + self.epsilon) +
            self.float_mean
        )

        return unnormalized_obs

    def reset_normalization(self):
        """
        Resets normalization statistics (mean and std).
        """
        self.float_mean = np.zeros(len(self.float_indices))
        self.float_std = np.ones(len(self.float_indices))



In [7]:
env = gym.make('gym_examples/TraceControlEnv-v0', delay=2)

100%|██████████| 2/2 [00:00<00:00,  2.34it/s]


In [8]:
env_orig = gym.make('gym_examples/TraceControlEnv-v0', delay=2)

100%|██████████| 2/2 [00:00<00:00,  2.13it/s]


In [9]:
env = FlattenObservation(env)
env = NormalizeFloatObservation(env, [x for x in range(50)], [50, 51])



In [10]:
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

<matplotlib.pyplot._IonContext at 0x7fbba01b5a30>

In [11]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [12]:

class DQN(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 512)
        self.ln1 = nn.LayerNorm(512)  # Layer Normalization after layer 1
        self.layer2 = nn.Linear(512, 512)
        self.ln2 = nn.LayerNorm(512)  # Layer Normalization after layer 2
        self.layer3 = nn.Linear(512, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = self.ln1(x)  # Apply Layer Norm after activation of layer 1
        x = F.relu(self.layer2(x))
        x = self.ln2(x)  # Apply Layer Norm after activation of layer 2
        return self.layer3(x)

In [13]:
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

In [14]:
# BATCH_SIZE is the number of transitions sampled from the replay buffer
# GAMMA is the discount factor as mentioned in the previous section
# EPS_START is the starting value of epsilon
# EPS_END is the final value of epsilon
# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay
# TAU is the update rate of the target network
# LR is the learning rate of the ``AdamW`` optimizer
BATCH_SIZE = 128
GAMMA = 0.5#0.4
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 5000
TAU = 0.001
LR = 1e-2

# Get number of actions from gym action space
n_actions = env.action_space.n
# Get the number of state observations
state, info = env.reset()
n_observations = len(state)

policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(100000)


steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    #print(eps_threshold)
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return the largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            
            denormalized_obs = env.unnormalize(state.cpu().squeeze(0).numpy())
            unflattened_obs = unflatten(env_orig.observation_space, denormalized_obs)
           # print(unflattened_obs, policy_net(state),  policy_net(state).max(1).indices.view(1, 1))
            return policy_net(state).max(1).indices.view(1, 1)
    else:
        print('r')
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)


episode_aois = []


def plot_aois(show_result=False):
  #  plt.figure(1)
    aois_t = torch.tensor(episode_aois, dtype=torch.float)
    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('AoI')
    plt.plot(aois_t.numpy())
    # Take 100 episode averages and plot them too
    if len(aois_t) >= 100:
        means = aois_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())
    plt.show()

    #plt.pause(0.001)  # pause a bit so that plots are updated
    #if is_ipython:
   #     if not show_result:
     #       display.display(plt.gcf())
     #       display.clear_output(wait=True)
     #   else:
    #        display.display(plt.gcf())
    
episode_vals = []
episode_durations = []

def plot_vals(show_result=False):
  #  plt.figure(1)
    vals_t = torch.tensor(episode_vals, dtype=torch.float)
    if show_result:
        plt.title('Result')
    else:
       # plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Value')
    display.clear_output(wait=True)
    plt.plot(-vals_t.numpy())
    plt.yscale('log')
    #plt.plot(vals_t.numpy())
    # Take 100 episode averages and plot them too
    if len(vals_t) >= 10:
        means = vals_t.unfold(0, 10, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(9), means))
        plt.plot(-means.numpy())
    #plt.show()
   # plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
          #  display.clear_output(wait=True)
        else:
            display.display(plt.gcf())
            
def plot_durations(show_result=False):
  #  plt.figure(1)
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    if show_result:
        plt.title('Result')
    else:
        #plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Value')
    display.clear_output(wait=True)
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())
    #plt.show()

   # plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
          #  display.clear_output(wait=True)
        else:
            display.display(plt.gcf())

  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")


obs after reset  {'val': -8.0, 'bw': 4.0, 'acc_history': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0])}


In [15]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1).values
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
   # print(np.mean(expected_state_action_values.cpu().numpy()))
    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
    return np.mean(expected_state_action_values.detach().cpu().numpy())

In [17]:
env.set_cost(2)#2

  logger.warn(


In [18]:
env.set_cost2(10)#10

  logger.warn(


In [19]:
env.set_cost3(0)

  logger.warn(


In [20]:
env.set_cost4(0)#10

  logger.warn(


In [21]:
q_vals_means = []


In [22]:
GAMMA = 0.5#0.4


In [1]:
if torch.cuda.is_available() or torch.backends.mps.is_available():
    num_episodes = 30
else:
    num_episodes = 50
for i_episode in range(num_episodes):
    # Initialize the environment and get its state
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    reward_accum = 0
    q_vals = []
    for t in count():
        action = select_action(state)
       # print(action)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([np.float32(reward)], device=device)
        reward_accum += reward
       
        done = terminated or truncated
        if t > 1000:
            done = 1
        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        q_val = optimize_model()
        q_vals.append(q_val)

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        if done:
            q_vals_means.append(np.mean(q_vals))
            #if np.abs(np.mean(q_vals)) < 1000:
            #    break
            plt.plot(q_vals_means)
            plt.savefig('loss.png', dpi=300)
            plt.show()
            episode_vals.append(reward_accum/t)
            #episode_aois.append(reward_accum / (t+1))
            episode_durations.append(t + 1)
           # print(observation)
            #denormalized_obs = observation * np.sqrt(env.obs_rms.var + 1e-8)  + env.obs_rms.mean
            #print(denormalized_obs, denormalized_obs.shape, env_orig.observation_space.shape)
            print(observation)
            denormalized_obs = env.unnormalize(observation)
            print(denormalized_obs)
            unflattened_obs = unflatten(env_orig.observation_space, denormalized_obs)
            print(unflattened_obs)
            #print(unflatten(env_orig.observation_space, observation))
            #plot_aois()
           # plot_vals()
            print('tot duration ', t)
            print('tot val ', reward_accum)
            break

print('Complete')
#plot_vals(show_result=True)


NameError: name 'torch' is not defined

In [None]:
torch.save(target_net.state_dict(), 'trained_model_pid.pth')