In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import A2C
from stable_baselines.common.env_checker import check_env
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.evaluation import evaluate_policy
import tensorflow as tf
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.bench.monitor import Monitor
from stable_baselines.common.callbacks import EveryNTimesteps

In [None]:
import gym
import w_mac
from collections import defaultdict
import matplotlib as plt
import networkx as nx
import dill
from copy import deepcopy
from ray import tune

In [None]:

# Undirected edge list describing the network topology used by the environment.
# Earlier / alternative topologies, kept for reference:
#   data = [(0,2),(0,1),(1,2),(2,3),(2,4),(3,4)]
#   data = [(0,2),(0,1),(0,3),(1,2),(1,3),(2,3),(2,4),(3,4),(5,2),(5,3),(5,4)]
#   extra edges currently left out of the list below: (4,6),(5,10),(6,10),(9,6),(8,10)
data = [
    (0, 2), (0, 1), (0, 3),
    (1, 2), (1, 3),
    (2, 3), (2, 4),
    (3, 4),
    (5, 2), (5, 3), (5, 4), (5, 6),
    (6, 7), (6, 8),
    (7, 8),
    (8, 9),
    (9, 10),
    (4, 10),
]

# Adjacency map: first endpoint -> list of second endpoints, in edge-list order.
d = defaultdict(list)
for node, dest in data:
    d[node].append(dest)
print(d)

# Build the graph from the adjacency map (edges grouped by first endpoint)
# and draw it.
G = nx.Graph()
G.add_edges_from(
    (source, target)
    for source, targets in d.items()
    for target in targets
)
nx.draw_networkx(G)

In [None]:
# Instantiate the custom 'wmac-graph-v0' environment on the topology graph G.
# NOTE(review): the env id is presumably registered with gym as a side effect
# of `import w_mac` — confirm.
env = gym.make('wmac-graph-v0',graph=G)
# Alternative environment id, currently disabled:
#env = gym.make('wmac-tune-v0')
# Reset once up front; being the cell's last expression, the value returned by
# reset() is shown as the cell output.
env.reset()

In [None]:
# Run Stable Baselines' environment checker on `env` before it is used for
# training, so interface problems surface here rather than mid-training.
check_env(env)

In [None]:
# Fail fast if the environment cannot be serialised with dill.
# NOTE(review): presumably required because Ray Tune has to ship the
# environment captured by the trainable to its workers — confirm.
assert dill.pickles(env)

In [None]:
"""
def evaluate_objective(config):
    tune_env = deepcopy(env)
    tune_agent = A2C("MlpPolicy", tune_env, learning_rate= config["lr"] )
    tune_agent.learn(total_timesteps=10)
    
    mean_reward, std_reward = evaluate_policy(tune_agent, env, n_eval_episodes=10, render=False,
                                                           deterministic=True,
                                                           return_episode_rewards=False)
    print("mr",mean_reward,"sd", std_reward)
    tune.report(mean_reward=mean_reward)
"""

In [None]:
import numpy as np
from typing import Union
from ray.tune import report
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.vec_env import DummyVecEnv, VecEnv, sync_envs_normalization

class OptimizationCallback(BaseCallback):
    """Evaluate the current model and report the results to Ray Tune.

    Each time the callback is stepped, the model being trained is evaluated
    on a private copy of the evaluation environment, and the mean episode
    reward, the mean episode length and the environment's packet-loss figure
    are sent to Ray Tune via ``ray.tune.report``.

    :param eval_env: environment to evaluate on; it is deep-copied and the
        copy is reset, so the object passed in is never stepped by this
        callback.
    :param n_eval_episodes: number of episodes run per evaluation.
    :param deterministic: whether the policy acts deterministically during
        evaluation.
    :param verbose: verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, eval_env: Union[gym.Env, VecEnv],
                 n_eval_episodes: int = 5,
                 deterministic: bool = True,
                 verbose=0):
        super(OptimizationCallback, self).__init__(verbose)
        # Work on an independent copy so evaluation rollouts cannot disturb
        # the state of the environment the caller keeps using.
        self.eval_env = deepcopy(eval_env)
        self.eval_env.reset()
        self.n_eval_episodes = n_eval_episodes
        self.deterministic = deterministic

    def _on_step(self) -> bool:
        """Run one evaluation round and report its metrics to Ray Tune.

        :return: always ``True`` so that training continues. ``BaseCallback``
            expects a boolean here (``False`` requests that training stop);
            previously the method fell off the end and returned ``None``.
        """
        # Observation/reward normalisation sync is intentionally disabled:
        #sync_envs_normalization(self.training_env, self.eval_env)

        # With return_episode_rewards=True, evaluate_policy returns the
        # per-episode rewards and lengths instead of (mean, std).
        episode_rewards, episode_lengths = evaluate_policy(self.model, self.eval_env,
                                                           n_eval_episodes=self.n_eval_episodes,
                                                           render=False,
                                                           deterministic=self.deterministic,
                                                           return_episode_rewards=True)

        mean_reward = np.mean(episode_rewards)
        mean_ep_length = np.mean(episode_lengths)
        # Environment-specific metric read after the evaluation rollouts.
        # NOTE(review): assumes eval_env exposes get_packet_lost() directly,
        # i.e. it is the raw wmac env rather than a VecEnv wrapper — confirm.
        packet_lost = self.eval_env.get_packet_lost()

        report(
            mean_reward=mean_reward,
            mean_ep_length=mean_ep_length,
            packet_lost=packet_lost
        )

        # Explicitly tell the training loop to keep going.
        return True

In [None]:
"""
import ray

object_store_memory = int(0.3 * ray.utils.get_system_memory() // 10 ** 9 * 10 ** 9)
ray.init(
            ignore_reinit_error=True,
            num_cpus = 10,
            local_mode = True,
            _plasma_directory="/tmp",
            object_store_memory=object_store_memory,
        )
"""

In [None]:
from ray import tune
from ray.tune.suggest.ax import AxSearch
from ray.tune.schedulers import ASHAScheduler
# Search space handed to AxSearch: every hyperparameter is a continuous
# "range" parameter described by its [lower, upper] bounds.
parameters = [
    {"name": name, "type": "range", "bounds": [lower, upper]}
    for name, lower, upper in (
        ("learning_rate", 3e-5, 3e-3),
        ("gamma", 0.99, 1.0),
        ("max_grad_norm", 0.3, 7.0),
    )
]

        
def evaluate_objective(config):
    """Ray Tune trainable: train one A2C agent with the sampled hyperparameters.

    Training runs on a deep copy of the module-level ``env`` for 1000000
    timesteps. Every 10000 timesteps an ``OptimizationCallback`` evaluates
    the agent over 5 deterministic episodes and reports the metrics to Tune.

    :param config: mapping providing ``gamma``, ``max_grad_norm`` and
        ``learning_rate`` for this trial.
    """
    trial_env = deepcopy(env)

    # Periodic evaluation + reporting hook for this trial.
    periodic_eval = EveryNTimesteps(
        n_steps=10000,
        callback=OptimizationCallback(trial_env, 5, True),
    )

    # Pick out exactly the hyperparameters A2C is tuned on.
    tuned_names = ("gamma", "max_grad_norm", "learning_rate")
    hyperparams = {name: config[name] for name in tuned_names}

    agent = A2C("MlpPolicy", trial_env, **hyperparams)
    agent.learn(total_timesteps=1000000, callback=periodic_eval)
    
# Ax proposes configurations from `parameters`, maximising the reported
# mean_reward.
ax_search = AxSearch(space=parameters, mode="max", metric="mean_reward")

# ASHA compares trials on mean_reward and stops weak ones early: a trial gets
# at least 10 and at most 100 units of `training_iteration`.
asha_scheduler = ASHAScheduler(
    metric="mean_reward",
    mode="max",
    time_attr="training_iteration",
    grace_period=10,
    max_t=100,
    reduction_factor=3,
    brackets=1,
)

# Launch 5 trials of `evaluate_objective`, each allotted 8 CPUs.
analysis = tune.run(
    evaluate_objective,
    search_alg=ax_search,
    scheduler=asha_scheduler,
    num_samples=5,
    resources_per_trial={"cpu": 8},
)

In [None]:
# Show the hyperparameter configuration whose trial achieved the highest
# mean_reward.
best_config = analysis.get_best_config(metric="mean_reward", mode="max")
print(best_config)
