In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import torch
from scipy import stats
from lib.DataManager import *
from lib.PolicyStats import *
import os
import cma
from cma.constraints_handler import AugmentedLagrangian, PopulationEvaluator
from IPython import display
import pickle

# Constants

In [3]:
#---SET PARAMS---
# Experiment switches.
USE_GRIDWORLD = True           # selects the gridworld dataset and its state count
USE_PDIS = True                # selects PDImportanceSampling over ImportanceSampling

# Search budget and improvement target.
num_train_intervals = 10       # passed as max_updates to the constrained search
percent_increase = 0.01        # fractional improvement demanded over the baseline return

# Environment dimensions: 23 states for gridworld, 18 otherwise.
num_states = 23 if USE_GRIDWORLD else 18
num_actions = 4

# Discount factor and failure probability (confidence level is 1 - delta).
gamma = 0.95
delta = 0.01

In [4]:
# Build the dataset path with os.path.join so it resolves on every platform.
# The previous "data\..." literals depended on the invalid escape sequences
# "\d" and "\g" being kept verbatim and only worked with Windows separators.
path = os.path.join("data", "data.csv")
if(USE_GRIDWORLD):
    # "gridworld_data.csv" in the same folder was previously assigned here and
    # immediately overwritten; only the file below was ever loaded.
    path = os.path.join("data", "gridworld_data_0.6097323533319994.csv")

# Load the recorded trajectories; later cells read traj["return"] from each
# entry and slice the result like a list.
histories = GetHistories(path, gamma)

line 0
line 1000000
line 2000000
line 3000000


In [5]:
# Mean of the "return" field over every recorded trajectory; used below as the
# baseline performance of the data-collecting policy.
return_total = 0
for trajectory in histories:
    return_total = return_total + trajectory["return"]

avg_exploratory_J = return_total / len(histories)
print("Average Baseline Return : " + str(avg_exploratory_J))

Average Baseline Return : 1.1918778099059908


***Set Target***

In [6]:
# Start from the measured baseline return for gridworld, otherwise from the
# fixed reference value 1.41537.
if USE_GRIDWORLD:
    target_performance = avg_exploratory_J
else:
    target_performance = 1.41537

# Raise the target by percent_increase of its magnitude, so the goal moves
# upward even when the baseline is negative.
target_performance = target_performance + abs(target_performance) * percent_increase
print("Target Performance : " + str(target_performance))

Target Performance : 1.2037965880050507


***Split Data***

In [7]:
# 80/20 split in file order: the leading slice becomes the candidate (train)
# set, the remainder the safety (test) set.
split_idx = int(len(histories) * .8)
train, test = histories[:split_idx], histories[split_idx:]
for subset in (train, test):
    print(len(subset))

80000
20000


***Get Exploration Policy***

In [8]:
# Estimate the behaviour (exploration) policy from the training trajectories.
# The printed result is a matrix with one row per state and one column per action.
# NOTE(review): the meaning of the trailing 1000 is not visible in this notebook —
# confirm against GetPolicy in lib.
exploration_policy = GetPolicy(train, num_states, num_actions, 1000)
print(exploration_policy)

[[0.11093975 0.67355548 0.08406882 0.13143595]
 [0.33845079 0.11190407 0.4565064  0.09313874]
 [0.08092026 0.47658339 0.38928016 0.05321619]
 [0.26997617 0.16391482 0.4399108  0.12619821]
 [0.40491451 0.27070029 0.13888351 0.18550169]
 [0.26437797 0.24798767 0.09878455 0.38884981]
 [0.28373503 0.60533749 0.09821795 0.01270952]
 [0.08435094 0.5817363  0.24638354 0.08752922]
 [0.05501383 0.11421107 0.79897272 0.03180237]
 [0.19865733 0.05400194 0.35845996 0.38888077]
 [0.05544299 0.12379174 0.08442272 0.73634255]
 [0.47737619 0.05951284 0.03469548 0.4284155 ]
 [0.07710719 0.50252871 0.18859377 0.23177033]
 [0.15065158 0.10532256 0.52729108 0.21673478]
 [0.16824125 0.03899513 0.06917945 0.72358417]
 [0.32641138 0.24798224 0.19835508 0.2272513 ]
 [0.10837872 0.51283487 0.29642909 0.08235732]
 [0.13401538 0.135192   0.4548163  0.27597632]
 [0.12690596 0.25388157 0.33273322 0.28647925]
 [0.34659794 0.07205988 0.44071458 0.14062761]
 [0.11738858 0.18007116 0.12818093 0.57435933]
 [0.40229665 

***Pick Importance Sampling Function***

In [9]:
# Choose the importance-sampling estimator handed to every evaluation call:
# the per-decision variant when USE_PDIS is set, the ordinary one otherwise.
ISFunc = PDImportanceSampling if USE_PDIS else ImportanceSampling

***Evaluate Current Policy On Candidate/Safety Data***

In [10]:
# P(Je > J_lower_bound) > 1 - delta
# Evaluate the exploration policy against itself: first the bound predicted
# from the training split, then the bound measured on the held-out split.
J_bl_predicted_lower_bound = Safety_Prediction(
    train, exploration_policy, gamma, exploration_policy, ISFunc, delta, len(test)
)
print("Predicted Baseline: " + str(J_bl_predicted_lower_bound))
J_bl_safety_lower_bound = Safety_Test(
    test, exploration_policy, gamma, exploration_policy, ISFunc, delta
)
print("Safety Baseline: " + str(J_bl_safety_lower_bound))

# Sanity check: each lower bound must sit at or below the average return.
bl_pred_looseness = avg_exploratory_J - J_bl_predicted_lower_bound
bl_safety_looseness = avg_exploratory_J - J_bl_safety_lower_bound
print("---distance of average return from lower bounds ---")
print("Looseness Of Prediction : " + str(bl_pred_looseness))
print("Looseness Of Safety : " + str(bl_safety_looseness))

# A negative gap means a "lower" bound exceeded the mean it is bounding.
if any(gap < 0 for gap in (bl_pred_looseness, bl_safety_looseness)):
    raise Exception("Lower Bound Greater Than Average Return!")
    

Predicted Baseline: 1.1449347893199329
Safety Baseline: 1.1555727027611657
---distance of average return from lower bounds ---
Looseness Of Prediction : 0.046943020586057926
Looseness Of Safety : 0.036305107144825044


***Helper Functions***

In [11]:
def policy_softmax(policy):
    """Convert a (states x actions) matrix of scores into a stochastic policy.

    A softmax is applied independently to every row, so each row of the
    result is non-negative and sums to 1.

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but prevents ``np.exp`` from overflowing
    to ``inf`` (which previously produced NaN rows) when scores become large.

    Args:
        policy: 2-D array-like of unnormalised action scores, one row per state.

    Returns:
        A 2-D numpy array of the same shape holding action probabilities.
    """
    scores = np.asarray(policy)
    # Shift each row so its largest entry is 0; exp() then stays within (0, 1].
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    numerators = np.exp(shifted)
    # keepdims lets the row sums broadcast without the transpose round trip.
    return numerators / np.sum(numerators, axis=1, keepdims=True)

In [12]:
# Naive random-search baseline. It performs poorly; kept for reference only.
def random_explore():
    """Hill-climb on the predicted lower bound using Gaussian perturbations.

    Starting from a copy of the exploration policy, 100 candidates are drawn by
    adding standard-normal noise to the current best policy and passing the sum
    through ``policy_softmax``. A candidate replaces the incumbent whenever its
    predicted lower bound is strictly greater than the best bound seen so far.

    Returns:
        The best policy found (also printed before returning).

    NOTE(review): the incumbent bound starts at 0, so candidates with a
    negative bound are never accepted, and the noise is added to probabilities
    rather than to logits before the softmax. Confirm whether both are intended.
    """
    incumbent = exploration_policy.copy()
    incumbent_bound = 0

    for _ in range(100):
        noise = np.random.normal(0, 1, incumbent.shape)
        candidate = policy_softmax(incumbent + noise)

        candidate_bound = Safety_Prediction(train, exploration_policy, gamma, candidate, ISFunc, delta, len(test))
        print("Predicted Lower Bound: ", candidate_bound)
        if candidate_bound > incumbent_bound:
            print("Policy Updated")
            incumbent, incumbent_bound = candidate, candidate_bound
        print("---------------")

    print(incumbent)
    return incumbent

In [13]:
# Unconstrained CMA-ES search. It ignores the safety bound; prefer the constrained variant.
def unconstrained_explore():
    """Maximise the average importance-sampled return with CMA-ES, no constraint.

    The search runs over a flat vector of ``num_states * num_actions`` scores,
    starting at all zeros with initial step size 0.5, until ``es.stop()``
    reports a termination condition.

    Returns:
        The softmax policy decoded from the first solution of one further
        ``es.ask()`` call made after the loop ends.
    """
    def to_policy(flat_params):
        # Flat parameter vector -> (states x actions) probability matrix.
        return policy_softmax(flat_params.reshape(num_states, num_actions))

    def objective(s):
        avgIS = CalcAvgIS(train, exploration_policy, gamma, to_policy(s), ISFunc)
        print(avgIS)
        # CMA-ES minimises, so negate the quantity we want to maximise.
        return - avgIS

    es = cma.CMAEvolutionStrategy([0] * (num_states * num_actions), 0.5)
    while not es.stop():
        solutions = es.ask()
        display.clear_output(True)
        print(to_policy(solutions[0]))
        fitnesses = [objective(candidate) for candidate in solutions]
        es.tell(solutions, fitnesses)

    return to_policy(es.ask()[0])

In [14]:
def inv_barrier_constrained_explore(lower_bound_goal, max_updates=100):
    """CMA-ES policy search with an inverse-barrier penalty on the safety bound.

    Each candidate is a flat vector of ``num_states * num_actions`` scores that
    is decoded into a policy via ``policy_softmax``. Its score, which CMA-ES
    minimises, is

        -avgIS + 1 / max(predicted_lower_bound - lower_bound_goal, EPSILON)

    so the penalty grows as the predicted lower bound approaches the goal from
    above and is capped at ``1 / EPSILON`` once the bound is at or below it.

    Args:
        lower_bound_goal: value the predicted lower bound should exceed.
        max_updates: number of ask/tell iterations after which the loop ends,
            unless ``es.stop()`` triggers first.

    Returns:
        The ``cma.CMAEvolutionStrategy`` instance after the final update.
    """
    EPSILON = 0.001  # floor on the margin; caps the penalty for failing the bound test

    def decode(flat_params):
        # Flat parameter vector -> (states x actions) probability matrix.
        return policy_softmax(flat_params.reshape(num_states, num_actions))

    def barrier_penalty(candidate_policy, avgIS):
        # Penalises candidates whose predicted lower bound is near or below the goal.
        predicted_bound = Safety_Prediction(train, exploration_policy, gamma, candidate_policy, ISFunc, delta, len(test), avgIS)
        margin = max(predicted_bound - lower_bound_goal, EPSILON)
        return 1 / margin

    def optimizing_function(s):
        candidate_policy = decode(s)

        # Computed once and shared between the objective and the constraint.
        avgIS = CalcAvgIS(train, exploration_policy, gamma, candidate_policy, ISFunc)

        objective_score = - avgIS  # negated because CMA-ES minimises
        constraint_score = barrier_penalty(candidate_policy, avgIS)
        score = objective_score + constraint_score
        print("score : " + str(score) + "\n---constraint_score : " + str(constraint_score) + "\n---objective_score : " + str(objective_score))
        return score

    es = cma.CMAEvolutionStrategy([0] * (num_states * num_actions), 0.5)
    update_count = 0
    while not (es.stop() or update_count == max_updates):
        solutions = es.ask()
        display.clear_output(True)
        print("Update : " + str(update_count))
        print(decode(solutions[0]))
        scores = [optimizing_function(candidate) for candidate in solutions]
        es.tell(solutions, scores)
        update_count += 1

    return es

***Explore Policies***

In [None]:
# Run the constrained CMA-ES search for at most num_train_intervals updates,
# aiming for a predicted lower bound above target_performance.
trained_es = inv_barrier_constrained_explore(target_performance, num_train_intervals)
# Decode the first solution of a fresh ask() call into a policy matrix.
# NOTE(review): ask() presumably draws new samples from the search distribution,
# so this is not the distribution mean or the best solution seen — confirm intended.
new_policy = policy_softmax(trained_es.ask()[0].reshape(num_states, num_actions))

Update : 8
[[0.16486415 0.48444028 0.19319869 0.15749688]
 [0.16231031 0.70364011 0.04623684 0.08781273]
 [0.13514506 0.12581907 0.59632964 0.14270624]
 [0.11905269 0.19701756 0.47018401 0.21374574]
 [0.06289964 0.1074163  0.29499863 0.53468543]
 [0.15074364 0.77778623 0.02605973 0.04541039]
 [0.45429758 0.14892127 0.10020478 0.29657638]
 [0.05742512 0.62951193 0.22523862 0.08782433]
 [0.36930494 0.11628769 0.45917816 0.05522921]
 [0.12307883 0.38298684 0.37478353 0.1191508 ]
 [0.13662095 0.24051489 0.22358594 0.39927822]
 [0.41671289 0.13322671 0.19127511 0.25878529]
 [0.27104674 0.18112986 0.13028686 0.41753653]
 [0.07054639 0.09590947 0.81440752 0.01913662]
 [0.50120558 0.09408572 0.28427026 0.12043844]
 [0.24141672 0.27734899 0.12304342 0.35819088]
 [0.21500863 0.22932718 0.09883185 0.45683235]
 [0.18423939 0.09186722 0.56712498 0.15676841]
 [0.17358502 0.71634317 0.06651918 0.04355263]
 [0.38994248 0.02895826 0.31395276 0.2671465 ]
 [0.11876784 0.35236806 0.18134047 0.34752362]
 [

In [None]:
# Report the sum of squared entries of the strategy's `mean` attribute; since the
# search starts from an all-zero vector, this is its squared distance from the start.
print("ES_Convergence : " + str(sum(trained_es.mean**2)))

***Final Results***

In [None]:
# Lower bound for the new policy as predicted from the training split
# (same call shape as the baseline check, with new_policy as the evaluated policy).
J_predicted_lower_bound = Safety_Prediction(train, exploration_policy, gamma, new_policy, ISFunc, delta, len(test))
print("Predicted : " + str(J_predicted_lower_bound))
# Lower bound for the new policy measured on the held-out safety split.
J_safety_lower_bound = Safety_Test(test, exploration_policy, gamma, new_policy, ISFunc, delta)
print("Safety : "  + str(J_safety_lower_bound))

In [None]:
# Assemble the output folder with os.path.join so the separators match the host
# OS. The hard-coded "\\" separators produced one oddly named directory on
# non-Windows systems. The trailing "" keeps the trailing separator that
# folder_name has always carried.
folder_name = os.path.join("policies", "delta_" + str(delta), "")
if(USE_GRIDWORLD):
    folder_name = os.path.join("policies", "gw", "delta_" + str(delta), "")
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(folder_name, exist_ok=True)

# The file name encodes the measured safety lower bound; np.save appends ".npy".
np.save(os.path.join(folder_name, "safety_" + str(J_safety_lower_bound)), new_policy)

In [None]:
# Greedy action (index of the most probable action) for every state.
greed_policy = np.argmax(new_policy, axis=1)

if USE_GRIDWORLD:
    # Pad the 23 state entries with two -1 markers (inserted before original
    # indices 12 and 16) so they fill a 5x5 grid.
    # NOTE(review): presumably the -1 cells are non-state squares such as obstacles — confirm.
    greedy_layout = np.insert(greed_policy, [12, 16], [-1, -1]).reshape((5, 5))
else:
    # Six rows, with the column count inferred from the number of states.
    greedy_layout = greed_policy.reshape(6, -1)
print(greedy_layout)

In [None]:
# Checkpoint name encodes the environment, the estimator and the update budget.
# os.path.join replaces the hard-coded Windows "\\" separator.
pickle_name = os.path.join(
    "pickles",
    "saved-cma-" + ("gw" if USE_GRIDWORLD else "van") + "-" + ("pdis" if USE_PDIS else "is") + "-" + str(num_train_intervals) + ".pkl",
)
# Create the target folder on first use instead of failing in open().
os.makedirs(os.path.dirname(pickle_name), exist_ok=True)

# Context managers guarantee the file is flushed and closed before it is read back.
with open(pickle_name, "wb") as pickle_file:
    pickle.dump(trained_es, pickle_file)

# Round-trip check: reload the checkpoint that was just written.
# Only unpickle files produced by this notebook; pickle.load executes arbitrary
# code from untrusted input.
with open(pickle_name, "rb") as pickle_file:
    es = pickle.load(pickle_file)
print("ES_Convergence : " + str(sum(es.mean**2)))