In [14]:
import math
import random
import numpy as np
from collections import namedtuple
#from itertools import count
import scipy.io as sio

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


# --- Experiment configuration ------------------------------------------------
GOAL = 250            # not referenced by any cell visible in this notebook
NUM_FILES = 1         # number of files tracked by the simulator
DELTA_VALUE = 5       # number of queue columns kept per file
# The simulator's state concatenates one AoI entry and DELTA_VALUE queue
# entries per file, hence NUM_FILES * (DELTA_VALUE + 1) network inputs.
NUM_STATE = NUM_FILES*(DELTA_VALUE+1)
A_HAT = 50            # upper bound at which the simulator caps the AoI
N_USERS = 2           # `n` of the binomial request-arrival draw
ETA_value = 5         # cost added whenever an update action is taken
RANDOM_SEED = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seed every RNG the notebook draws from, not just Python's `random`:
#   * random    -> epsilon-greedy exploration and replay sampling
#   * numpy     -> np.random.binomial arrivals inside the simulator
#   * torch     -> network weight initialisation
# NOTE(review): CUDA kernels may still be non-deterministic; this only fixes
# the seeds.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# One replay-memory record: (s, a, s', r).
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    """Bounded ring buffer of ``Transition`` records.

    The buffer grows until it holds ``capacity`` records; after that every
    new record overwrites the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer that will hold at most ``capacity`` records."""
        self.capacity = capacity
        self.memory = []
        self.position = 0  # index of the slot the next push() writes to

    def push(self, *args):
        """Store ``Transition(*args)`` in the next slot, wrapping around when full."""
        slot = self.position
        still_growing = len(self.memory) < self.capacity
        if still_growing:
            # Reserve the slot first so the indexed write below is always valid.
            self.memory.append(None)
        self.memory[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored records chosen uniformly at random."""
        picked = random.sample(self.memory, batch_size)
        return picked

    def __len__(self):
        """Number of records currently stored (never more than ``capacity``)."""
        stored = self.memory
        return len(stored)

class DQN(nn.Module):
    """Fully connected Q-network: inputs -> 64 -> 32 -> 16 -> outputs.

    Every layer except the last is followed by a ReLU; the last layer emits
    one raw value per action.
    """

    def __init__(self, inputs, outputs):
        """Build the four linear layers.

        inputs:  number of features per sample after flattening.
        outputs: number of values produced per sample.
        """
        super().__init__()
        # Attribute names and creation order are kept: they determine the
        # state_dict keys and the order in which weights are initialised.
        self.inlayer = nn.Linear(inputs, 64)
        self.hidlayer1 = nn.Linear(64, 32)
        self.hidlayer2 = nn.Linear(32, 16)
        self.outlayer = nn.Linear(16, outputs)

    def forward(self, x):
        """Flatten each sample of ``x`` to one row and return the output-layer values."""
        batch = x.size(0)
        hidden = x.view(batch, -1)
        for layer in (self.inlayer, self.hidlayer1, self.hidlayer2):
            hidden = F.relu(layer(hidden))
        return self.outlayer(hidden)

cuda


In [15]:
# --- Training hyperparameters -------------------------------------------------
BATCH_SIZE = 1000      # transitions per optimisation step; no step runs until memory holds this many
EPS_START = 0.9        # exploration probability at step 0
EPS_END = 0.0          # exploration probability the schedule decays towards
EPS_DECAY = 100        # decay constant (in action-selection steps) of the exponential schedule
TARGET_UPDATE = 10     # the target network is synced every TARGET_UPDATE episodes

# Action 0 leaves every file alone; action k (1..NUM_FILES) updates file k-1.
n_actions = 1 + NUM_FILES

# Two identically shaped networks, created in the order policy -> target.
policy_net, target_net = (DQN(NUM_STATE, n_actions).to(device) for _ in range(2))
# The target network starts as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# RMSprop with its default hyperparameters, acting on the policy network only.
optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(100000)

In [16]:
# Number of times select_action() has been called; drives the epsilon schedule.
steps_done = 0


def select_action(state):
    """Pick an action epsilon-greedily and advance the exploration schedule.

    The exploration probability is
    ``EPS_END + (EPS_START - EPS_END) * exp(-steps_done / EPS_DECAY)``.

    state: input accepted by ``policy_net`` (a single-row batch in the callers
           visible in this notebook).
    Returns a ``(1, 1)`` ``torch.long`` tensor holding the action index.
    """
    global steps_done
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if draw <= eps_threshold:
        # Explore: uniform random action.
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

    # Exploit: index of the largest policy-network output for this state.
    with torch.no_grad():
        _, greedy = policy_net(state).max(1)
        return greedy.view(1, 1)

In [17]:
def optimize_model(reference_state):
    """Run one optimisation step on ``policy_net`` from a replay minibatch.

    Does nothing until ``memory`` holds at least ``BATCH_SIZE`` transitions.
    The regression target for each sampled transition is

        reward + max_a Q_target(next_state, a) - max_a Q_target(reference_state, a)

    i.e. there is no discount factor; the value of ``reference_state`` is
    subtracted instead.
    NOTE(review): this looks like a relative / average-reward Q-learning
    target -- confirm against the intended algorithm.

    reference_state: single-row state tensor fed to ``target_net``; its max
                     output is broadcast over the whole batch.
    Returns None.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    # Turn a list of Transition records into one Transition of tuples.
    batch = Transition(*zip(*transitions))

    # As pushed by the training loop: states are (1, n) rows, actions (1, 1),
    # rewards (1,), so concatenation yields (B, n), (B, 1) and (B,).
    state_batch = torch.cat(batch.state)
    next_state_batch = torch.cat(batch.next_state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) of the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Targets come from the frozen target network; no graph is needed.
    with torch.no_grad():
        next_state_values = target_net(next_state_batch).max(1)[0]
        reference_value = target_net(reference_state).max(1)[0]
    expected_state_action_values = next_state_values + reward_batch - reference_value

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()

    # Clamp every gradient element to [-1, 1]; parameters without a gradient
    # are skipped instead of raising.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [18]:
class AoI_simulator:
    """Simulator with a per-file age counter and a sliding request queue.

    State held per file:
      * ``current_AoI``   -- shape (N_files, 1); starts at 1, is reset to 1
        when the file is updated and otherwise grows by 1 per step, capped at
        ``A_hat``.
      * ``current_queue`` -- shape (N_files, Delta); shifts one column to the
        left every step, and its last column receives a fresh
        ``Binomial(N_users, P_arrive)`` draw.

    ``state`` is the (1, N_files * (Delta + 1)) row vector made of all AoI
    entries followed by the flattened queue.
    NOTE(review): "AoI" presumably stands for Age of Information and the
    queue for pending user requests -- confirm with the model description.
    """

    def __init__(self, N_files=1, A_hat=50, eta=20, Delta= 4, P_arrive= 0.3, N_users = 4):
        """Initialise ages to 1 and the queue to zeros plus one arrival draw per file.

        N_files:  number of files.
        A_hat:    cap applied to every age counter.
        eta:      cost added to a step in which a file is updated.
        Delta:    number of queue columns per file.
        P_arrive: success probability of the binomial arrival draw.
        N_users:  number of trials of the binomial arrival draw.
        """
        self.N_files = N_files
        self.A_hat = A_hat
        self.eta = eta
        self.Delta = Delta
        self.P_arrive = P_arrive
        self.N_users = N_users
        self.current_AoI = np.ones([self.N_files,1])
        self.current_queue = np.zeros([self.N_files, self.Delta])

        for i_file in range(self.N_files):
            self.current_queue[i_file,-1] = np.random.binomial(self.N_users, self.P_arrive)
        self.state = self._observe()

    def _observe(self):
        """Return the (1, N_files*(Delta+1)) row: all ages, then the flattened queue."""
        return np.concatenate((self.current_AoI.reshape((1,-1)), self.current_queue.reshape((1,-1))), axis=1)

    def step(self, action):
        """Advance one time step.

        action: 0 updates nothing; ``k`` (1..N_files) updates file ``k-1``.

        The step cost is, summed over files,
        ``AoI * queue[:, 0] / (P_arrive * N_users)`` evaluated on the values
        held before the step, plus ``eta`` if a file is updated.

        Returns ``(state, -cost)``: the new state row and the negated cost
        (a length-1 array whenever ``N_files >= 1``).
        """
        reward = 0
        next_AoI = np.zeros_like(self.current_AoI)
        next_queue = np.zeros_like(self.current_queue)
        # Mean of the binomial arrival draw. Uses this instance's own N_users
        # rather than a notebook-level global, so the normalisation always
        # matches the arrival process actually simulated.
        mean_arrivals = self.P_arrive*self.N_users

        for i_file in range(self.N_files):
            # Slide the queue one column to the left and draw new arrivals.
            next_queue[i_file,:-1] = self.current_queue[i_file,1:]
            next_queue[i_file, -1] = np.random.binomial(self.N_users, self.P_arrive)
            reward = reward + self.current_AoI[i_file]*self.current_queue[i_file,0]/mean_arrivals

            if action == i_file+1:
                reward = reward + self.eta
                next_AoI[i_file] = 1
            else:
                next_AoI[i_file] = min(self.A_hat, self.current_AoI[i_file]+1)

        self.current_AoI = next_AoI
        self.current_queue = next_queue
        self.state = self._observe()

        return (self.state, -reward)



In [19]:
# --- Training -----------------------------------------------------------------
num_episodes = 200
# Steps per episode. NOTE: this rebinds the name `T`, which the import cell
# bound to torchvision.transforms; the name is kept because later cells read it.
T = 300
# Per-episode average reward per step and per file (convergence curve).
epi_val = np.zeros(num_episodes)
for i_episode in range(num_episodes):
    total_reward = 0
    aoi_sim = AoI_simulator(N_files=NUM_FILES, Delta=DELTA_VALUE, A_hat=A_HAT, P_arrive = 0.3, N_users = N_USERS, eta = ETA_value)

    # Build the (1, NUM_STATE) float32 row straight from the simulator's array
    # on the target device (no list-of-ndarray detour, no extra device copy).
    state = torch.tensor(aoi_sim.state, dtype=torch.float32, device=device).reshape(1, -1)
    # The episode's initial state serves as the reference state of the target.
    reference_state = state

    for t in range(T):
        # Select and perform an action
        action = select_action(state)
        next_state, reward = aoi_sim.step(action.item())
        step_reward = float(reward.squeeze())
        total_reward += step_reward
        next_state = torch.tensor(next_state, dtype=torch.float32, device=device).reshape(1, -1)
        reward = torch.tensor([step_reward], dtype=torch.float32, device=device)

        # Store the transition, advance, and take one optimisation step.
        memory.push(state, action, next_state, reward)
        state = next_state
        optimize_model(reference_state)

    # Sync the target network (and report progress) every TARGET_UPDATE episodes.
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
        print(total_reward/(T*NUM_FILES))
    epi_val[i_episode] = total_reward/(T*NUM_FILES)
print('Complete')


-4.844444444444448
-21.094444444444445
-2.827777777777776
-2.7833333333333337
-3.1055555555555556
-2.822222222222222
-2.938888888888888
-2.9333333333333322
-2.86111111111111
-3.2388888888888903
-3.183333333333333
-2.938888888888888
-3.0888888888888886
-3.1333333333333324
-2.994444444444444
-3.2277777777777774
-3.222222222222223
-3.161111111111111
-3.005555555555556
-2.8722222222222222
Complete


In [20]:
# --- Evaluation rollout ---------------------------------------------------------
T = 10000
# Kept so the name is reset for any later cell; nothing accumulates into it here.
total_reward = 0
AoI_vec = np.zeros([NUM_FILES, T])   # per step: AoI * head-of-queue count, per file
request_vec = np.zeros(T)            # per step: head-of-queue count summed over files
update_vec = np.zeros(T)             # per step: index of the action taken (0 = no update)

aoi_sim = AoI_simulator(N_files=NUM_FILES, Delta=DELTA_VALUE, A_hat=A_HAT, P_arrive = (0.3), N_users = N_USERS, eta = ETA_value)
# Initial (1, NUM_STATE) float32 state row, created directly on the device.
state = torch.tensor(aoi_sim.state, dtype=torch.float32, device=device).reshape(1, -1)

for t in range(T):
    # Select and perform an action.
    # NOTE(review): select_action() still applies the epsilon schedule and
    # increments steps_done; with EPS_END = 0.0 it is effectively greedy after
    # training -- confirm that is the intended evaluation policy.
    action = select_action(state)
    action_index = action.item()
    next_state, reward = aoi_sim.step(action_index)
    next_state = torch.tensor(next_state, dtype=torch.float32, device=device).reshape(1, -1)
    state = next_state
    # Statistics are read from the simulator after the step has been applied.
    AoI_vec[:,t] = aoi_sim.current_AoI.squeeze()*aoi_sim.current_queue[:,0].squeeze()
    request_vec[t] = np.sum(aoi_sim.current_queue[:,0])
    # Store the plain Python int, not the (possibly CUDA) tensor.
    update_vec[t] = action_index
    

In [21]:
# Total of the recorded AoI * request products divided by the total number of
# recorded requests.
average_aoi = AoI_vec.sum() / request_vec.sum()
# Fraction of the T evaluation steps in which a non-zero (update) action was taken.
mu = (update_vec > 0).sum() / T
# Combined cost: average AoI plus the update cost weighted by the update rate.
total_cost = average_aoi + ETA_value * mu

In [23]:
# Collect the convergence curve and the evaluation metrics for export.
adict = {
    'convergence_vec': epi_val,
    'total_cost': total_cost,
    'average_aoi': average_aoi,
    'mu': mu,
}

# The output name encodes the experiment settings so runs do not overwrite each other.
run_tag = (ETA_value, RANDOM_SEED, DELTA_VALUE, N_USERS)
file_name = 'AoI_Eta%d_Random%d_Delta%d_N_User%d.mat' % run_tag
sio.savemat(file_name, adict)