In [1]:
# {'pkts_dropped': 0.0,
#  'pkts_transmitted': 75.0,
#  'timestamp': '1626810037874447104',
#  'obj': 'simulation_pedestrian1',
#  'pos_x': '11.417197',
#  'pos_y': '37.027515',
#  'pos_z': '7.4369965',
#  'orien_x': '-0.0',
#  'orien_y': '0.0',
#  'orien_z': '0.9999752',
#  'orien_w': '0.0070461035',
#  'linear_acc_x': '',
#  'linear_acc_y': '',
#  'linear_acc_z': '',
#  'linear_vel_x': '',
#  'linear_vel_y': '',
#  'linear_vel_z': '',
#  'angular_acc_x': '',
#  'angular_acc_y': '',
#  'angular_acc_z': '',
#  'angular_vel_x': '',
#  'angular_vel_y': '',
#  'angular_vel_z': '',
#  'pkts_buffered': 0.0,
#  'bit_rate': 4949598.859792932,
#  'chosen_ue': 'simulation_pedestrian1',
#  'packets': 14627.0,
#  'channel_mag': array(0.00890296)}

In [2]:
import copy
import math
import random
import warnings
from collections import namedtuple, deque
from itertools import count

import gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from torch.autograd import Variable
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm as notebook_tqdm

# Set up matplotlib: detect an inline (notebook) backend from its name and, if
# one is active, import IPython's display helpers.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch matplotlib to interactive mode.
plt.ion()

In [3]:
# Device the network is moved to and that DQN.forward moves its inputs to (CPU only).
device = torch.device("cpu")

# Gym Environment

In [4]:
import caviar_tools
from beamselect_env import BeamSelectionEnv

### Hyper Params

In [5]:
# Reward scheme handed to the training environment.
# NOTE(review): training runs with 'test' here and the recorded run below ends
# with "Overall Train reward = 0.00" -- this may be meant to be 'train'; confirm
# against BeamSelectionEnv before relying on the training signal.
reward_type = 'test'    # 'test' or 'train'
# Episode ranges for training and validation, given as [start, end].
epi = [0,10] # [start, end]
epi_val = [490,499]


# The validation environment is built without an explicit reward type, so it
# uses whatever default BeamSelectionEnv defines.
gym_env = BeamSelectionEnv(epi, reward_type)
gym_env_val = BeamSelectionEnv(epi_val)

# Step counts for each episode range; they bound the training and validation loops.
n_steps = caviar_tools.linecount(epi)
n_steps_val = caviar_tools.linecount(epi_val)



In [6]:
n_steps

68700

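Observation space: X, Y, Z, pkts_dropped, pkts_transmitted, pkts_buffered, bit_rate. Note that only 7 features are listed here, while the network and the training/validation loops below use 11-dimensional states; the remaining features are not documented in this notebook.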

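Action space: [3, 64] -> [UE, possible beams]. The policy outputs a single flattened index in [0, 192), which is decoded as UE = index // 64 and beam = index % 64 before being passed to the environment.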

## Replay Memory

In [7]:
# Transition = namedtuple('Transition',
#                         ('state', 'action', 'next_state', 'reward'))


# class ReplayMemory(object):

#     def __init__(self, capacity):
#         self.memory = deque([],maxlen=capacity)

#     def push(self, *args):
#         """Save a transition"""
#         self.memory.append(Transition(*args))

#     def sample(self, batch_size):
#         return random.sample(self.memory, batch_size)

#     def __len__(self):
#         return len(self.memory)

## DQN

In [8]:
class DQN(nn.Module):
    """Fully connected network that maps a state vector to a probability
    distribution over the flattened action space.

    The body is a stack of five `Linear -> ELU` blocks with hidden widths
    16, 32, 64 and 256; a softmax over the last dimension turns the final
    activations into probabilities (an ELU is applied after the last linear
    layer as well, before the softmax).

    Args:
        inputs: number of features in a state vector.
        outputs: number of actions (defaults to 64 * 3).
    """

    def __init__(self, inputs:int=7, outputs:int=64*3):
        super().__init__()

        # Consecutive pairs of widths describe one Linear -> ELU block each.
        widths = (inputs, 16, 32, 64, 256, outputs)
        blocks = [
            self.create_linear(n_in, n_out)
            for n_in, n_out in zip(widths[:-1], widths[1:])
        ]
        self.linear = nn.Sequential(*blocks)

    def create_linear(self,inp:int,out:int)-> nn.Module:
        """Build one `Linear(inp, out)` layer followed by an ELU activation."""
        layers = [nn.Linear(inp, out), nn.ELU()]
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return action probabilities for `x`; each row sums to 1."""
        activations = self.linear(x.to(device))
        return nn.functional.softmax(activations, dim=-1)

        

In [9]:
class Policy():
    """Rollout buffer for the policy-gradient update.

    Holds the log-probabilities and rewards collected since the last update,
    plus the running history of per-update reward sums and losses.
    """

    def __init__(self):
        # Log-probabilities of the actions sampled since the last update.
        # Starts as an empty 1-D float tensor so new entries can be
        # concatenated onto it. A plain tensor is used instead of the
        # deprecated torch.autograd.Variable wrapper.
        self.policy_history = torch.empty(0, requires_grad=True)
        # Rewards collected since the last update.
        self.reward_episode = []
        # Overall reward and loss history (one entry per update).
        self.reward_history = []
        self.loss_history = []

In [10]:
# Global rollout buffer that select_action appends to and update_policy consumes.
policy = Policy()

In [11]:
# Sanity check: a network with the default 7 inputs maps one random state to
# one row of 192 action probabilities.
tmp = DQN()
sample = torch.rand((1,7))
tmp(sample).shape

torch.Size([1, 192])

## Hyperparams

In [12]:
# BATCH_SIZE is only recorded through writer.add_hparams in this notebook; the
# code that sampled batches (replay memory) is commented out.
BATCH_SIZE = 128
# Discount factor applied to future rewards in update_policy.
GAMMA = 0.99
# EPS_START = 0.9
# EPS_END = 0.3

# # It depends on the overall number of steps; the basic intuition is that
# # once steps_done == EPS_DECAY the probability of choosing a
# # random action is 33%, considering EPS_END is zero.
# # As for ep = [0,10]; approx ep is 80k therefore exploration can be reduced to 33% around 50k
# # Also because of this factor, smoothed accuracy matters more for training than seeing the average
# EPS_DECAY = n_steps*0.3

# TARGET_UPDATE is only recorded through writer.add_hparams; the target network
# it referred to is commented out.
TARGET_UPDATE = 1000
# Validation runs every VAL_STEP training steps (and on the last step).
VAL_STEP = 30000

## Action

In [13]:
# Policy network: 11 state features in, 192 (= 64 * 3) flattened actions out.
policy_net = DQN(11, 192).to(device)
# target_net = DQN(11, 192).to(device)
# target_net.load_state_dict(policy_net.state_dict())
# target_net.eval()

# RMSprop with the library's default hyperparameters.
optimizer = optim.RMSprop(policy_net.parameters())
# memory = ReplayMemory(10000)

# n_actions and steps_done are only referenced by the commented-out
# epsilon-greedy code in select_action.
n_actions = 64*3
steps_done = 0

def select_action(state:torch.Tensor,val:bool=False):
    """Choose a flattened action index for `state` using `policy_net`.

    Args:
        state: tensor passed straight to `policy_net`; the loops in this
            notebook pass a (1, 11) float tensor.
        val: when falsy, sample an action from the network's output
            distribution and append its log-probability to
            `policy.policy_history`; when truthy, return the most probable
            action and record nothing.

    Returns:
        Tensor of action indices, one per row of `state`.
    """
    if val:
        # Greedy evaluation needs no gradients.
        with torch.no_grad():
            return policy_net(state).max(dim=1).indices

    # Sampling must run with autograd enabled: the stored log-probabilities are
    # the only path from the loss back to the network weights. Computing them
    # under torch.no_grad() would leave the network without any gradient.
    action_dist = Categorical(policy_net(state))
    action_value = action_dist.sample()
    log_prob = action_dist.log_prob(action_value)

    # Add log probability of our chosen action to our history
    if policy.policy_history.dim() != 0:
        policy.policy_history = torch.cat([policy.policy_history, log_prob])
    else:
        policy.policy_history = log_prob
    return action_value

    # if val:
    #     with torch.no_grad():
    #         # t.max(1) will return largest column value of each row.
    #         # second column on max result is index of where max element was
    #         # found, so we pick action with the larger expected reward.
    #         flattened_action = policy_net(state).max(dim = 1).indices
    #         return torch.tensor([[flattened_action]], device=device, dtype=torch.long)
    # else:
    #     global steps_done
    #     sample = random.random()
    #     eps_threshold = EPS_END + (EPS_START - EPS_END) * \
    #         math.exp(-1. * steps_done / EPS_DECAY)
    #     steps_done += 1
    #     if sample > eps_threshold:
    #         with torch.no_grad():
    #             # t.max(1) will return largest column value of each row.
    #             # second column on max result is index of where max element was
    #             # found, so we pick action with the larger expected reward.
    #             flattened_action = policy_net(state).max(dim = 1).indices
    #             return torch.tensor([[flattened_action]], device=device, dtype=torch.long)
    #     else:
    #         flattened_action = random.randrange(n_actions)
    #         return torch.tensor([[flattened_action]], device=device, dtype=torch.long)


In [14]:
# Tensorboard
log_dir = './mini_logs/policy_grad' 

writer = SummaryWriter(log_dir=log_dir)

## Optimize Model

In [15]:
def update_policy(n_step:int):
    """Run one policy-gradient update from the rewards and log-probabilities
    collected since the previous update, then reset the collection buffers.

    The loss is `-sum(log_prob_t * G_t)`, where `G_t` is the discounted return
    from step t (discount `GAMMA`), standardized across the collected window.

    Args:
        n_step: global step used as the x-axis value for the logged loss.
    """
    # Discount future rewards back to the present using gamma. Returns are
    # accumulated back-to-front and reversed once, avoiding repeated
    # insertions at the head of the list.
    running = 0
    returns = []
    for r in reversed(policy.reward_episode):
        running = r + GAMMA * running
        returns.append(running)
    returns.reverse()

    # Scale returns. With a single sample the standard deviation is undefined
    # (nan), which would poison the loss, so standardization is skipped.
    returns = torch.as_tensor(np.asarray(returns), dtype=torch.float32)
    if returns.numel() > 1:
        eps = float(np.finfo(np.float32).eps)
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Calculate loss. Returns are constants here: gradients must reach the
    # network only through the stored log-probabilities.
    loss = -torch.sum(policy.policy_history * returns, -1)

    # Update network weights
    if loss.requires_grad:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    else:
        # No autograd graph behind the log-probabilities: nothing can be
        # learned from this window, so say so instead of silently doing a
        # no-op step.
        warnings.warn(
            'update_policy: policy_history carries no autograd graph, so the '
            'optimizer step was skipped; make sure select_action records '
            'log-probabilities with gradients enabled.'
        )

    writer.add_scalar('Loss',loss.item(),n_step)

    # Save and initialize episode history counters
    policy.loss_history.append(loss.item())
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = torch.empty(0, requires_grad=True)
    policy.reward_episode = []

In [16]:
# def optimize_model():
#     if len(memory) < BATCH_SIZE:
#         return
#     transitions = memory.sample(BATCH_SIZE)
#     # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
#     # detailed explanation). This converts batch-array of Transitions
#     # to Transition of batch-arrays.
#     batch = Transition(*zip(*transitions))

#     # Compute a mask of non-final states and concatenate the batch elements
#     # (a final state would've been the one after which simulation ended)
#     non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
#                                           batch.next_state)), device=device, dtype=torch.bool)
#     non_final_next_states = torch.cat([s for s in batch.next_state
#                                                 if s is not None])
#     state_batch = torch.cat(batch.state)
#     action_batch = torch.cat(batch.action)
#     reward_batch = torch.cat(batch.reward)

#     # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
#     # columns of actions taken. These are the actions which would've been taken
#     # for each batch state according to policy_net
#     state_action_values = policy_net(state_batch).gather(1, action_batch)

#     # Compute V(s_{t+1}) for all next states.
#     # Expected values of actions for non_final_next_states are computed based
#     # on the "older" target_net; selecting their best reward with max(1)[0].
#     # This is merged based on the mask, such that we'll have either the expected
#     # state value or 0 in case the state was final.
#     next_state_values = torch.zeros(BATCH_SIZE, device=device)
#     next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
#     # Compute the expected Q values
#     expected_state_action_values = (next_state_values * GAMMA) + reward_batch
#     # Compute Huber loss
#     criterion = nn.SmoothL1Loss()
#     loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

#     # Optimize the model
#     optimizer.zero_grad()
#     loss.backward()
#     for param in policy_net.parameters():
#         param.grad.data.clamp_(-1, 1)
#     optimizer.step()

## Training loop

In [17]:
def val(train_step:int,gym_env_val:gym.Env,n_steps_val:int=n_steps_val,
        writer:SummaryWriter=writer):
    """Run the greedy policy on the validation environment and log the mean reward.

    Args:
        train_step: training step used as the x-axis value for the logged scalar.
        gym_env_val: environment to step through. It is closed before this
            function returns, so callers that want to validate again should
            pass a copy.
        n_steps_val: number of environment steps to run.
        writer: TensorBoard writer that receives the 'Val Reward' scalar.

    Returns:
        The reward averaged over `n_steps_val` steps.
    """
    # The first action is chosen from an all-zero state.
    state = torch.zeros((1,11), dtype=torch.float32)
    running_reward = 0.0
    for _ in notebook_tqdm(range(n_steps_val),desc='Val'):
        # Greedy action; pass a real boolean rather than a truthy string.
        action = select_action(state, val=True)

        # Decode the flattened index into [UE, beam] and step the environment.
        flat_action = action.item()
        state, reward, done, _ = gym_env_val.step([flat_action//64, flat_action%64])

        # Reshape the returned array into the (1, n) float tensor the network
        # is fed with (assumes the environment returns a 1-D array).
        state = state.astype(np.float32).reshape(1, state.shape[0])
        state = torch.tensor(state)
        running_reward += reward

    # Guard against an empty validation range.
    mean_reward = running_reward / max(n_steps_val, 1)
    writer.add_scalar('Val Reward',mean_reward,train_step)
    gym_env_val.close()
    print(f'Validation Reward {mean_reward}')
    return mean_reward


In [18]:
# Train the policy for n_steps environment steps: act, step the environment,
# log the reward, update the policy every UPDATE_EVERY steps and validate
# every VAL_STEP steps (and on the last step).
UPDATE_EVERY = 10  # environment steps collected between two policy updates

# The first action is chosen from an all-zero state.
state = torch.zeros((1,11), dtype=torch.float32)
ovr_reward = 0.0
for episode in notebook_tqdm(range(n_steps),desc='Train'):
    # Sample an action from the current policy.
    action = select_action(state)

    # Decode the flattened index into [UE, beam] and step the environment.
    flat_action = action[0].item()
    state, reward, done, _ = gym_env.step([flat_action//64, flat_action%64])

    # Reshape the returned array into the (1, n) float tensor the network is
    # fed with (assumes the environment returns a 1-D array).
    state = state.astype(np.float32).reshape(1, state.shape[0])
    state = torch.tensor(state)

    # Save reward
    policy.reward_episode.append(reward)
    step_reward = reward.item()
    writer.add_scalar('Rewards',step_reward,episode)
    ovr_reward += step_reward

    if (episode + 1) % UPDATE_EVERY == 0:
        update_policy(episode)

    # Validate on a copy: val() closes the environment it is given.
    if episode % VAL_STEP == 0 or episode == n_steps-1:
        val(episode, copy.deepcopy(gym_env_val))


writer.add_hparams(
    {
        'BATCH_SIZE' : BATCH_SIZE,
        'GAMMA' : GAMMA,
        'TARGET_UPDATE' :TARGET_UPDATE
    },
    {
        'Overall Reward':ovr_reward,
        'Average Reward': ovr_reward/n_steps
    }
)
# Make sure everything logged so far reaches the event file.
writer.flush()

print(f'Overall Train reward = {ovr_reward:.2f}. ' \
    f'Average Reward = {ovr_reward/n_steps:.4f}')
gym_env.close()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


Train:   0%|          | 0/68700 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


Val:   0%|          | 0/62598 [00:00<?, ?it/s]

Validation Reward -0.16733134193588364


Val:   0%|          | 0/62598 [00:00<?, ?it/s]

Validation Reward -0.1673658889273159


Val:   0%|          | 0/62598 [00:00<?, ?it/s]

Validation Reward -0.1672124497354775


Val:   0%|          | 0/62598 [00:00<?, ?it/s]

Validation Reward -0.16717067486864717
Overall Train reward = 0.00. Average Reward = 0.0000
