In [None]:
# {'pkts_dropped': 0.0,
#  'pkts_transmitted': 75.0,
#  'timestamp': '1626810037874447104',
#  'obj': 'simulation_pedestrian1',
#  'pos_x': '11.417197',
#  'pos_y': '37.027515',
#  'pos_z': '7.4369965',
#  'orien_x': '-0.0',
#  'orien_y': '0.0',
#  'orien_z': '0.9999752',
#  'orien_w': '0.0070461035',
#  'linear_acc_x': '',
#  'linear_acc_y': '',
#  'linear_acc_z': '',
#  'linear_vel_x': '',
#  'linear_vel_y': '',
#  'linear_vel_z': '',
#  'angular_acc_x': '',
#  'angular_acc_y': '',
#  'angular_acc_z': '',
#  'angular_vel_x': '',
#  'angular_vel_y': '',
#  'angular_vel_z': '',
#  'pkts_buffered': 0.0,
#  'bit_rate': 4949598.859792932,
#  'chosen_ue': 'simulation_pedestrian1',
#  'packets': 14627.0,
#  'channel_mag': array(0.00890296)}

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.distributions import Categorical
from torch.autograd import Variable

import tqdm

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import copy
import tqdm

import gym

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

In [None]:
device = torch.device("cpu")

# Gym Environment

In [None]:
import caviar_tools
from beamselect_env import BeamSelectionEnv

### Hyper Params

In [None]:
reward_type = 'test'    # 'test' or 'train'
epi = [0,10]            # [start, end] training episodes, end inclusive
epi_val = [490,499]     # [start, end] validation episodes


# Training environment uses the explicit reward type; the validation
# environment relies on BeamSelectionEnv's default for that argument.
gym_env = BeamSelectionEnv(epi, reward_type)
gym_env_val = BeamSelectionEnv(epi_val)

# n_steps_per_epi[k] is the cumulative number of steps up to and including
# the k-th training episode. Iterate from epi[0] (not 0) so the boundaries
# stay aligned with the environment when training does not start at episode 0.
n_steps_per_epi = []
cumulative_steps = 0
for i in range(epi[0], epi[1]+1):
    cumulative_steps += caviar_tools.linecount([i, i])
    n_steps_per_epi.append(cumulative_steps)

# Total number of training steps and of validation steps.
n_steps = cumulative_steps
n_steps_val = caviar_tools.linecount(epi_val)

In [None]:
print(n_steps, n_steps_per_epi)

In [None]:
n_steps

Observation Space : X,Y,Z,pkts_dropped,pkts_transmitted,pkts_buffered,bit_rate

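Action Space : [3,64] -> [UE index, beam index], i.e. 3 UEs x 64 candidate beams = 192 discrete actions. A flat action `a` is decoded as UE = `a // 64`, beam = `a % 64`.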

## Replay Memory

In [None]:
# Transition = namedtuple('Transition',
#                         ('state', 'action', 'next_state', 'reward'))


# class ReplayMemory(object):

#     def __init__(self, capacity):
#         self.memory = deque([],maxlen=capacity)

#     def push(self, *args):
#         """Save a transition"""
#         self.memory.append(Transition(*args))

#     def sample(self, batch_size):
#         return random.sample(self.memory, batch_size)

#     def __len__(self):
#         return len(self.memory)

## A2C

In [None]:
class A2C(nn.Module):
    """Actor-critic network with a shared trunk and two heads.

    The trunk maps ``inputs`` features to a 256-wide representation. The
    actor head turns that representation into a probability distribution
    over ``outputs`` discrete actions; the critic head produces a single
    state-value estimate.

    Args:
        inputs: Number of input features per state.
        outputs: Number of discrete actions.
    """

    def __init__(self, inputs:int=7, outputs:int=64*3):
        super().__init__()

        # Shared feature extractor: inputs -> 16 -> 32 -> 64 -> 256
        self.affine = nn.Sequential(
            self.create_linear(inputs, 16),
            self.create_linear(16, 32),
            self.create_linear(32, 64),
            self.create_linear(64, 256)
        )

        # Actor head: 256 -> outputs (softmax is applied in forward)
        self.actor_linear = nn.Sequential(
            self.create_linear(256, outputs)
        )

        # Critic head: 256 -> 1
        self.critic_linear = nn.Sequential(
            self.create_linear(256, 1)
        )

    def create_linear(self,inp:int,out:int)-> nn.Module:
        """Return a ``Linear(inp, out)`` layer followed by an ELU activation."""
        return nn.Sequential(
            nn.Linear(inp, out),
            nn.ELU()
        )

    def forward(self, x):
        """Compute action probabilities and the state value.

        Args:
            x: Tensor whose last dimension has ``inputs`` features.

        Returns:
            Tuple ``(x_action, x_critic)``: probabilities over the ``outputs``
            actions (summing to 1 along the last dimension) and the critic
            output with a trailing dimension of size 1.
        """
        # Move the input to wherever this module's weights live instead of
        # relying on a notebook-level global.
        x = x.to(next(self.parameters()).device)
        features = self.affine(x)

        # Actor: softmax must be taken over the actor head's output, not over
        # the trunk features (which would yield 256 "actions" and leave the
        # actor head untrained).
        action_scores = self.actor_linear(features)
        x_action = torch.softmax(action_scores, dim=-1)

        # Critic
        x_critic = self.critic_linear(features)

        return x_action,x_critic

        

## Storage of actions and policy

In [None]:
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

In [None]:
class Policy:
    """Rollout buffer holding what the agent collected since the last update.

    Attributes are plain lists that start empty and are filled by the
    surrounding notebook code.
    """

    def __init__(self):
        # Entries recorded for each sampled action (select_action appends
        # SavedAction tuples here).
        self.saved_actions = []
        # Reward history matching the recorded actions.
        self.rewards = []

In [None]:
policy = Policy()

In [None]:
tmp = A2C()
sample = torch.rand((1,7))
out = tmp(sample)
print(out[0].shape,out[1].shape)

## Hyperparams

In [None]:
# NOTE(review): BATCH_SIZE is only logged via writer.add_hparams in the visible
# notebook; the actor-critic update does not sample mini-batches.
BATCH_SIZE = 128
# Discount factor used when accumulating returns in update_policy.
GAMMA = 0.99
# EPS_START = 0.9
# EPS_END = 0.3

# # Leftover epsilon-greedy schedule (disabled).
# # It depends on the overall number of steps. The basic intuition is that
# # once steps_done == EPS_DECAY the probability of choosing a
# # random action is 33%, considering EPS_END is zero.
# # For ep = [0,10] there are approx. 80k steps, so exploration can be reduced to 33% around 50k.
# # Because of this factor, smoothed accuracy matters more for training than the plain average.
# EPS_DECAY = n_steps*0.3

# NOTE(review): TARGET_UPDATE is only logged via writer.add_hparams in the
# visible notebook; no target network is used.
TARGET_UPDATE = 1000
# Validation is run every VAL_STEP training steps (and on the last step).
VAL_STEP = 60000

## Action

In [None]:
# Actor-critic network: 11 input features per state, 192 (= 64*3) outputs.
policy_net = A2C(11, 192).to(device)

# Adam over all network parameters (shared trunk, actor head and critic head).
optimizer = optim.Adam(policy_net.parameters(),lr=3e-2)

# NOTE(review): n_actions and steps_done are not referenced by any other
# visible cell — presumably leftovers from an earlier epsilon-greedy version.
n_actions = 64*3
steps_done = 0

def select_action(state:torch.Tensor,val:bool=False):
    """Choose an action for ``state`` with the current policy network.

    Args:
        state: Batch of states passed straight to ``policy_net``.
        val: When falsy (training), sample from the policy distribution and
            record the log-probability and state value in
            ``policy.saved_actions``. When truthy (validation), pick the most
            probable action without tracking gradients or recording anything.

    Returns:
        Tensor of flat action indices, one per row of ``state``.
    """
    if val:
        with torch.no_grad():
            # policy_net returns (action_probs, state_value); only the
            # probabilities are needed for the greedy choice.
            action_probs, _ = policy_net(state)
            return action_probs.argmax(dim=1)

    action_probs, state_value = policy_net(state)
    action_dist = Categorical(action_probs)
    action_value = action_dist.sample()

    # Keep the log-probability of the sampled action together with the
    # critic's estimate so update_policy can build the losses later.
    policy.saved_actions.append(
        SavedAction(action_dist.log_prob(action_value), state_value)
    )
    return action_value


In [None]:
# Tensorboard
log_dir = './mini_logs/AC' 

writer = SummaryWriter(log_dir=log_dir)

## Optimize Model

In [None]:
def update_policy(n_step:int):
    """Run one actor-critic update from the buffered rollout.

    Builds discounted, normalised returns from ``policy.rewards``, combines
    them with the log-probabilities and value estimates stored in
    ``policy.saved_actions`` into a summed actor + critic loss, takes one
    optimizer step, logs the loss to tensorboard and clears both buffers.

    Args:
        n_step: Global step under which the loss is logged.
    """
    # The rollout buffers live on `policy`, not on the network module.
    saved_actions = policy.saved_actions
    if not policy.rewards or not saved_actions:
        # Nothing was collected since the last update; torch.stack below
        # would fail on empty lists.
        return

    # Discounted returns, accumulated back-to-front and then reversed
    # (avoids the quadratic cost of repeated insert(0, ...)).
    R = 0.0
    returns = []
    for r in reversed(policy.rewards):
        R = float(r) + GAMMA * R
        returns.append(R)
    returns.reverse()

    returns = torch.tensor(returns, dtype=torch.float32,
                           device=saved_actions[0].value.device)
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + np.finfo(np.float32).eps)
    else:
        # std() of a single element is NaN; centring alone keeps the loss finite.
        returns = returns - returns.mean()

    policy_losses = [] # actor (policy) loss per step
    value_losses = [] # critic (value) loss per step
    for (log_prob, value), R in zip(saved_actions, returns):
        # The advantage uses the detached value so the actor loss does not
        # backpropagate into the critic head.
        advantage = R - value.item()

        # actor (policy) loss
        policy_losses.append(-log_prob * advantage)

        # critic (value) loss using smooth L1; flatten both sides so input
        # and target shapes match exactly.
        value_losses.append(F.smooth_l1_loss(value.reshape(-1), R.reshape(-1)))

    # reset gradients
    optimizer.zero_grad()

    # sum up all the values of policy_losses and value_losses
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

    writer.add_scalar('Loss',loss.item(),n_step)

    # perform backprop
    loss.backward()
    optimizer.step()

    # reset rewards and action buffer
    del policy.rewards[:]
    del policy.saved_actions[:]

## Training loop

In [None]:
def val(train_step:int,gym_env_val:gym.Env,n_steps_val:int=n_steps_val,
        writer:SummaryWriter=writer):
    """Run a greedy validation pass and log the mean reward.

    Starts from an all-zero ``(1, 11)`` state, steps ``gym_env_val`` for
    ``n_steps_val`` steps using the greedy action from ``select_action``,
    then logs and prints the average reward. The environment is closed
    before returning, so callers should pass a copy they do not need again.

    Args:
        train_step: Training step under which the result is logged.
        gym_env_val: Environment to step; it is not reset here.
        n_steps_val: Number of validation steps to run.
        writer: Tensorboard writer receiving the 'Val Reward' scalar.
    """
    state = torch.zeros((1,11), dtype=torch.float32)
    running_reward = 0.0
    try:
        for _step in tqdm.tqdm_notebook(range(n_steps_val),desc='Val'):
            # Pass a real boolean (the string 'True' only worked by truthiness).
            action = select_action(state, val=True)
            flat_action = action.item()

            # Decode the flat action into [UE index, beam index] (64 beams per UE).
            state, reward, done, _ = gym_env_val.step([flat_action//64, flat_action%64])

            state = state.astype(np.float32).reshape(1, state.shape[0])
            state = torch.tensor(state)
            running_reward += reward
    finally:
        # Release the environment even if a step raises.
        gym_env_val.close()

    # Guard against an empty validation range.
    mean_reward = running_reward/n_steps_val if n_steps_val else 0.0
    writer.add_scalar('Val Reward',mean_reward,train_step)
    print(f'Validation Reward {mean_reward}')


In [None]:
# Training starts from an all-zero state; every subsequent state comes from
# the environment.
state = torch.zeros((1,11), dtype=torch.float32)
ovr_reward = 0.0
episode_cnt = 0
for episode in tqdm.tqdm_notebook(range(n_steps),desc='Train'):
    action = select_action(state)
    flat_action = action.item()

    # Step through environment using chosen action, decoded as
    # [UE index, beam index] (64 beams per UE).
    state, reward, done, _ = gym_env.step([flat_action//64, flat_action%64])

    state = state.astype(np.float32).reshape(1, state.shape[0])
    state = torch.tensor(state)

    # Save reward (the buffer attribute is `rewards`)
    policy.rewards.append(reward)
    writer.add_scalar('Rewards',reward.item(),episode)
    ovr_reward+=reward.item()

    # n_steps_per_epi holds cumulative step counts, so an episode ends once
    # the number of steps taken (episode + 1) reaches the current boundary.
    # Comparing the zero-based index itself fired one step late and never
    # triggered an update for the final episode.
    # NOTE(review): assumes caviar_tools.linecount returns the number of
    # environment steps per episode — confirm.
    steps_taken = episode + 1
    if episode_cnt < len(n_steps_per_epi) and steps_taken >= n_steps_per_epi[episode_cnt]:
        update_policy(episode)
        # Skip past any boundaries already reached (e.g. empty episodes).
        while episode_cnt < len(n_steps_per_epi) and steps_taken >= n_steps_per_epi[episode_cnt]:
            episode_cnt += 1

    # Validate on a fresh copy so the original validation env stays usable.
    if (episode)%VAL_STEP == 0 or (episode == n_steps-1):
        val(episode, copy.deepcopy(gym_env_val))


# Record the run's hyperparameters next to its headline metrics in tensorboard.
writer.add_hparams(
    hparam_dict={
        'BATCH_SIZE' : BATCH_SIZE,
        'GAMMA' : GAMMA,
        'TARGET_UPDATE' :TARGET_UPDATE
    },
    metric_dict={
        'Overall Reward':ovr_reward,
        'Average Reward': ovr_reward/n_steps
    },
)

# Summarise the training run and release the training environment.
print(
    f'Overall Train reward = {ovr_reward:.2f}. '
    f'Average Reward = {ovr_reward/n_steps:.4f}'
)
gym_env.close()