In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

import tqdm

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import copy
from torch.multiprocessing import Pipe
import pandas as pd

import gym
import caviar_tools
from beamselect_env import BeamSelectionEnv

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()
device = torch.device("cpu")

In [2]:
epi_val = [500,509]
gym_env_ind_val = list()
for i in range(epi_val[0],epi_val[1]+1):
    gym_env_ind_val.append(BeamSelectionEnv([i,i]))

n_steps_epi_val = list()
for i in range(epi_val[0],epi_val[1]+1):
    n_steps_epi_val.append(caviar_tools.linecount([i,i]))

# n_steps = sum(n_steps_epi)
n_steps_val = sum(n_steps_epi_val)



In [3]:
from agents_cur import *
from utils_cur import *

import numpy as np
import pickle

In [4]:
train_method = 'ICM'
env_id = None #BreakoutNoFrameskip-v4
env_type = 'beamselect'

In [5]:
model_path = './model_state_dict/model_curiosity'
icm_path = './model_state_dict/icm_curiosity'

In [6]:
lam = 0.95
num_worker = 1

num_step = int(512)

ppo_eps = float(0.1)
epoch = int(3)
mini_batch = int(8)
BATCH_SIZE = int(num_step * num_worker / mini_batch) #16
learning_rate = float(1e-4)
entropy_coef = float(0.001)
gamma = float(0.8)
eta = float(1)

clip_grad_norm = float(0.5)

pre_obs_norm_step = int(3000)#int(10000) Not much effect

HISTORY_SIZE = BATCH_SIZE
STATES_USED = 13

In [7]:
input_size = [BATCH_SIZE,STATES_USED]  
output_size = 192 #64*3

In [8]:
f = open('obs_rms.pkl', 'rb')
obs_rms = pickle.load(f)

In [9]:
agent = ICMAgent

agent = agent(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=BATCH_SIZE,
        ppo_eps=ppo_eps,
        eta=eta,
        use_cuda=False,
        use_gae=False,
        use_noisy_net=False
    )

In [10]:
states = np.zeros([1, BATCH_SIZE, 1,STATES_USED])

In [11]:
agent.model.load_state_dict(torch.load(model_path))
agent.icm.load_state_dict(torch.load(icm_path))

<All keys matched successfully>

In [12]:
Transition = namedtuple('Transition',
                        ('state'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size:int=BATCH_SIZE):
        return random.sample(self.memory, BATCH_SIZE)

    def __len__(self):
        return len(self.memory)

In [13]:
history = ReplayMemory(HISTORY_SIZE)

for i in range(HISTORY_SIZE):
    history.push(np.zeros((STATES_USED, )))

In [14]:
def run(action,env:gym.envs,history:ReplayMemory,df):
    s, reward, done, info = env.step([action//64, action%64])
    
    df = df.append({
        'UE':action//64,
        'Beam':action%64,
        'Reward':reward
    },ignore_index=True)
    
    # print(type(s), s.shape)
    history.push(s.astype(np.float))
    
    return [np.array(history.sample(BATCH_SIZE)), reward, done, done, reward,df]

In [15]:
def val(epi_val:list,gym_env_ind:list=None):
    
    with torch.no_grad():
        
        history = ReplayMemory(HISTORY_SIZE)
        for i in range(HISTORY_SIZE):
            history.push(np.zeros((STATES_USED, )))
            
        f = open('obs_rms.pkl', 'rb')
        obs_rms = pickle.load(f)
        
        rall = 0
        rd = False
        intrinsic_reward_list = []
        states = np.zeros([1, BATCH_SIZE, 1,STATES_USED])
        
        ovr_val = 0
        csv_pd = list()
        for episode in tqdm.notebook.tqdm(range(epi_val[1]-epi_val[0]+1),desc='Val'):
            epi_pd = pd.DataFrame(columns=['UE','Beam','Reward'])
            
            if len(gym_env_ind) == 0:
                env =  BeamSelectionEnv([episode,episode])
            else:
                env = (gym_env_ind[episode])
                
            rall=0
            for steps in range(n_steps_epi_val[episode]):
                
                actions, value, policy = agent.get_action(
                    (states - obs_rms.mean) / np.sqrt(obs_rms.var)
                )
                
                next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [],\
                    [], [], []
                
                for action in actions:
                    s, r, d, rd, lr,epi_pd = run(action,env,history,epi_pd)
                    rall += r
                    next_states.append(s)
                
                next_states = np.stack(next_states)
                states = next_states[:, :, :, :]
                
            print(f'Running avg for episode {epi_val[0]+episode} is {rall/steps}')
            ovr_val +=rall
            csv_pd.append(epi_pd)
        
        print(f'Overall mean reward is {ovr_val/n_steps_val} ')
        return csv_pd

In [16]:
csv_pd = val(epi_val,gym_env_ind_val)

Val:   0%|          | 0/10 [00:00<?, ?it/s]

Running avg for episode 500 is -0.1411112611270135
Running avg for episode 501 is -0.1761471468740525
Running avg for episode 502 is -0.1503640912378339
Running avg for episode 503 is -0.15393024553394868
Running avg for episode 504 is -0.14676127333914663
Running avg for episode 505 is -0.17192274101291485
Running avg for episode 506 is -0.1426154395151532
Running avg for episode 507 is -0.14492012816123556
Running avg for episode 508 is -0.16391595449056262
Running avg for episode 509 is -0.14430467986662124
Overall mean reward is -0.15302723135354662 


In [20]:
import os
os.makedirs('./eval',exist_ok=True)
for i in range(len(n_steps_epi_val)):
    csv_pd[i].to_csv(f'./eval/{i}.csv')

In [21]:
len(csv_pd[0])

7326