## This notebook trains a defender agent with PPO

This notebook offers example code on how to train a defender agent on the ESS environment with PPO. Note that for the code to work correctly, you'll need the modified versions of gym and OpenAI baselines installed (we recommend installing them in a virtual environment).

Links to modified gym/baselines:


https://github.com/rubai5/baselines


https://github.com/rubai5/gym


### Imports

In [2]:
import os, sys
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
%matplotlib inline
from datetime import datetime
import pickle

from mpi4py import MPI
import os.path as osp
import gym, logging
from baselines import logger

from baselines.ppo1 import pposgd_simple_generalization, mlp_policy
import baselines.common.tf_util as U
from copy import deepcopy

### Game Parameters
The ESS game has a huge number of possible states. The gym environment provides several ways of sampling from these states; here, we set the parameters that mix these sampling distributions as desired.

In [3]:
# parameters
name = "ErdosGame-v0"  # environment id handed to gym.make in the training cell
seed = 101             # NOTE: seeding inside train() is commented out, so this is not consumed there

# game specific parameters
K = 15
potential = 0.9

# sampling probabilities, must sum to 1
unif_prob = 0.0
geo_prob = 1.0
diverse_prob = 0.0
state_unif_prob = 0.0 # can only use if K is small < 10 -- try to use previous methods instead

# Compare with a tolerance rather than `== 1`: sums of binary floats such as
# 0.1 + 0.2 + 0.7 are not exactly 1.0 and would trip an exact-equality check.
assert abs(unif_prob + geo_prob + diverse_prob + state_unif_prob - 1.0) < 1e-9, "probabilites don't sum to 1"

# attacker plays adversarially?
adverse_set_prob = 0.0
disj_supp_prob = 0.5

# high one
high_one_prob = 0.0

# upper limits for start state sampling (both derived from K)
geo_high = K - 2
unif_high = max(3, K - 3)

# putting into names_and_args argument; the training cell unpacks this dict
# as keyword arguments to gym.make
names_and_args = {
    "K": K,
    "potential": potential,
    "unif_prob": unif_prob,
    "geo_prob": geo_prob,
    "diverse_prob": diverse_prob,
    "state_unif_prob": state_unif_prob,
    "high_one_prob": high_one_prob,
    "adverse_set_prob": adverse_set_prob,
    "disj_supp_prob": disj_supp_prob,
    "geo_high": geo_high,
    "unif_high": unif_high,
}

### Model Parameters

In [4]:
# Architecture of the MLP policy built by policy_fn.
HID_SIZE = 300        # passed to MlpPolicy as hid_size
NUM_HID_LAYERS = 2    # passed to MlpPolicy as num_hid_layers

### Policy Net, Train and Test function

In [5]:
# functions to initialize environment and train model

def policy_fn(name, ob_space, ac_space):
    """Construct an ``MlpPolicy`` sized by the notebook-level model parameters.

    Args:
        name: name forwarded to ``MlpPolicy``.
        ob_space: observation space forwarded to ``MlpPolicy``.
        ac_space: action space forwarded to ``MlpPolicy``.

    Returns:
        The ``mlp_policy.MlpPolicy`` instance, built with ``HID_SIZE`` and
        ``NUM_HID_LAYERS``.
    """
    net_shape = dict(hid_size=HID_SIZE, num_hid_layers=NUM_HID_LAYERS)
    return mlp_policy.MlpPolicy(
        name=name, ob_space=ob_space, ac_space=ac_space, **net_shape
    )
    
def make_policies(ob_space, ac_space, policy_func):
    """Create the pair of policies ("pi" and "old_pi") from one factory.

    Args:
        ob_space: observation space handed to ``policy_func``.
        ac_space: action space handed to ``policy_func``.
        policy_func: callable ``(name, ob_space, ac_space) -> policy``.

    Returns:
        A ``(pi, oldpi)`` tuple; ``pi`` is built first, under the name "pi",
        then ``oldpi`` under the name "old_pi".
    """
    # The tuple order fixes the construction order: "pi" before "old_pi".
    return tuple(
        policy_func(policy_name, ob_space, ac_space)
        for policy_name in ("pi", "old_pi")
    )

def train(env_train, pi, oldpi, names_and_args, num_timesteps, test_envs):
    """Train ``pi`` on ``env_train`` with ``pposgd_simple_generalization.learn``.

    Args:
        env_train: training environment; it is reset before learning starts.
        pi: policy handed to ``learn``.
        oldpi: second policy handed to ``learn`` alongside ``pi``.
        names_and_args: accepted for call-site compatibility; not used here.
        num_timesteps: forwarded to ``learn`` as ``max_timesteps``.
        test_envs: environments forwarded to ``learn`` as ``test_envs``; each
            one is reset first. May be empty or ``None``.

    Returns:
        The ``(policy_net, info)`` pair returned by ``learn``.
    """
    # NOTE: seeding (set_global_seeds / env.seed with a per-MPI-rank worker
    # seed) is currently disabled, so runs are not seeded from here.

    env_train.reset()
    for env in (test_envs or ()):
        env.reset()

    gym.logger.setLevel(logging.WARN)

    # PPO hyperparameters, passed through to learn() unchanged.
    ppo_kwargs = dict(
        max_timesteps=num_timesteps,
        timesteps_per_batch=100,
        clip_param=0.2,
        entcoeff=0.01,
        optim_epochs=4,
        optim_stepsize=1e-3,
        optim_batchsize=50,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        test_envs=test_envs,
    )
    trained_policy, train_info = pposgd_simple_generalization.learn(
        env_train, pi, oldpi, **ppo_kwargs
    )
    return trained_policy, train_info


def test_policy(num_rounds, policy_net, test_env):
    """Roll out ``policy_net`` on ``test_env`` and summarise one trajectory segment.

    Args:
        num_rounds: multiplier for the rollout horizon.
        policy_net: policy handed to ``traj_segment_generator``.
        test_env: environment to roll out on; ``test_env.observation_space.K``
            is read to size the horizon.

    Returns:
        ``(mean_reward, mean_correct_actions)``: the mean of the segment's
        ``"ep_rets"`` entry, and the value ``compute_correct_actions`` returns
        for the segment's ``"label"`` and ``"ac"`` entries.
    """
    # generate around num_rounds draws
    horizon = test_env.observation_space.K * num_rounds
    seg_gen = pposgd_simple_generalization.traj_segment_generator(
        policy_net, test_env, horizon, stochastic=True
    )

    # Only the first segment yielded by the generator is evaluated.
    results = next(seg_gen)
    mean_reward = np.mean(results["ep_rets"])
    mean_correct_actions = compute_correct_actions(results["label"], results["ac"])
    return mean_reward, mean_correct_actions

def compute_correct_actions(label, ac):
    """Return the fraction of steps on which the chosen action matches its label.

    A row of ``label`` equal to ``[1, 1]`` is counted as correct whatever the
    action was (presumably both actions are acceptable there -- confirm against
    the environment). For every other row, the action is correct when it equals
    the index of the largest label entry.

    Args:
        label: 2-D array-like with one row per step and two entries per row.
        ac: array-like of chosen actions, one per step.

    Returns:
        ``(number of correct steps) / len(label)``.
    """
    # Accept plain Python lists as well as ndarrays; a no-op for ndarrays.
    label = np.asarray(label)
    ac = np.asarray(ac)

    always_correct = np.all(label == [1, 1], axis=1)
    needs_check = ~always_correct

    matches = ac[needs_check] == np.argmax(label[needs_check], axis=1)
    n_correct = np.sum(always_correct) + np.sum(matches)
    return n_correct / len(label)

### A note on sessions
To run most of the baselines code, the session must be explicitly made the default one, i.e. the code has to start with

```python
with sess.as_default():
```

The code is currently set up to initialize `sess = U.single_threaded_session()` as a global variable, and to close the session and reset the graph explicitly to enable restarts, etc. Note that `U.reset()` must be used along with `tf.reset_default_graph()`.

### Functions to load graphs and sessions

In [6]:
# utilities
def reset_session_and_graph():
    """Close the global session (if there is one) and reset the graph state.

    Reads the notebook-level global ``sess``. Closing is best effort so this
    can be called before any session has been created.
    """
    try:
        sess.close()
    except Exception:
        # Best effort: on the first run `sess` does not exist yet (NameError).
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        pass
    # Both resets are needed together (see the note on sessions).
    tf.reset_default_graph()
    U.reset()
    
def save_session(fp):
    """Save the global session ``sess`` to the checkpoint path ``fp``.

    Args:
        fp: checkpoint path; its last five characters must be ".ckpt".

    Raises:
        AssertionError: if ``fp`` does not end with ".ckpt".
    """
    has_ckpt_suffix = fp[-5:] == ".ckpt"
    assert has_ckpt_suffix, "checkpoint name must end with .ckpt"
    # The Saver is created only after the path has been validated.
    tf.train.Saver().save(sess, fp)
    
def load_session_and_graph(fp_meta, fp_ckpt):
    """Import a saved meta graph and restore checkpointed values into ``sess``.

    Args:
        fp_meta: path passed to ``tf.train.import_meta_graph``.
        fp_ckpt: checkpoint path, passed both to the restore call on the
            imported saver and to ``U.load_state``.
    """
    restorer = tf.train.import_meta_graph(fp_meta)
    # Restores into the notebook-level global session `sess`.
    restorer.restore(sess, fp_ckpt)
    U.load_state(fp_ckpt)

In [8]:
# Train network over a number of repeats
# For every (K, potential) combination, train `repeats` independent models,
# checkpoint each one, and pickle the list of per-repeat reward histories.
repeats = 3
SAVE_FP = "/tmp/"  # prefix for all output files (string-concatenated, so keep the trailing slash)

# NOTE: these loops rebind the notebook-level K and potential set in the
# parameters cell, and overwrite the matching entries of names_and_args.
for K in [10]:
    for potential in [0.99]:
        names_and_args["K"] = K
        # keep the K-derived sampling limits in sync with the new K
        names_and_args["geo_high"] = K-2
        names_and_args["unif_high"] = max(3, K-3)
        names_and_args["potential"] = potential
        rewards = []
        test_rewards = []  # never filled in this cell
        for rep in range(repeats):
            # start each repeat from a fresh graph; `sess` must stay a global
            # because reset_session_and_graph and save_session read it
            reset_session_and_graph()
            sess = U.single_threaded_session()

            with sess.as_default():
                erdos_env = gym.make(name, **names_and_args)
                pi, oldpi = make_policies(erdos_env.observation_space, erdos_env.action_space, policy_fn)                
                
                # no test environments are used during training here
                pi, info = train(erdos_env, pi, oldpi, names_and_args, num_timesteps=50000, 
                                 test_envs=[])
                rewards.append(info["rewards"])
                
                # save model
                model_fp = "{}model_K{}_potential{}_rep{}.ckpt".format(
                    SAVE_FP, K, potential, rep)
                save_session(model_fp)

        # save results
        # one pickle per (K, potential); note this name uses %02d / %f formatting
        # (e.g. K10, 0.990000), unlike the "{}"-formatted checkpoint names above
        with open(SAVE_FP+"rewards_K%02d_potential%f.p"%(K, potential), "wb") as f:
            pickle.dump(rewards, f)

Device mapping:



  result = entry_point.load(False)


********** Iteration 0 ************
Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
 -0.001259832 |  -0.006928104 |     0.7764883 | 0.00033481987 |     0.6928104
-0.0151075795 | -0.0068944776 |    0.36004236 |   0.003697483 |    0.68944776
 -0.018646002 | -0.0068140132 |    0.18862972 |  0.0118688475 |     0.6814013
 -0.023204025 | -0.0067879967 |    0.10339517 |   0.014519679 |     0.6787997
Evaluating losses...
   -0.0297212 |   -0.00679687 |     0.0780719 |     0.0135486 |      0.679687
------------------------------
| EpLenMean       | 9.4      |
| EpRewMean       | -1       |
| EpThisIter      | 10       |
| EpisodesSoFar   | 10       |
| TimeElapsed     | 0.399    |
| TimestepsSoFar  | 94       |
| ev_tdlam_before | -2.98    |
| loss_ent        | 0.68     |
| loss_kl         | 0.0135   |
| loss_pol_entpen | -0.0068  |
| loss_pol_surr   | -0.0297  |
| loss_vf_loss    | 0.0781   |
------------------------------
********** Iteration 1 ****

KeyboardInterrupt: 