In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import torch
from scipy import stats
from lib.DataManager import *
from lib.PolicyStats import *
import os
import cma
from cma.constraints_handler import AugmentedLagrangian, PopulationEvaluator
from IPython import display
import pickle

# Constants

In [3]:
#---SET PARAMS---
USE_GRIDWORLD = True
USE_PDIS = False
num_train_intervals = 10
percent_increase = 0.1

num_states = 18
if(USE_GRIDWORLD):
    num_states = 23
num_actions = 4
gamma = 0.95
delta = 0.01 #1 - delta, confidence

In [4]:
path = "data\data.csv"
if(USE_GRIDWORLD):
    path = "data\gridworld_data.csv"

histories = GetHistories(path, gamma)

line 0
line 1000000
line 2000000
line 3000000
line 4000000
line 5000000
line 6000000
line 7000000


In [5]:
avg_exploratory_J = 0
for traj in histories:
    avg_exploratory_J += traj["return"]
    
avg_exploratory_J /= len(histories)
print("Average Baseline Return : " + str(avg_exploratory_J))

Average Baseline Return : -0.9422074904303682


***Set Target***

In [6]:
target_performance = 1.41537
if(USE_GRIDWORLD):
    target_performance = avg_exploratory_J
    
target_performance += abs(target_performance)*percent_increase #% increase
print("Target Performance : " + str(target_performance))

Target Performance : -0.8479867413873314


***Split Data***

In [7]:
split_idx = int(len(histories) * .8)
train = histories[:split_idx]
test = histories[split_idx:]
print(len(train))
print(len(test))

80000
20000


***Get Exploration Policy***

In [8]:
exploration_policy = GetPolicy(train, num_states, num_actions, 1000)
print(exploration_policy)

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


***Pick Importance Sampling Function***

In [9]:
ISFunc = ImportanceSampling
if(USE_PDIS):
    ISFunc = PDImportanceSampling

***Evaluate Current Policy On Candidate/Safety Data***

In [10]:
ConfirmBounds(True, avg_exploratory_J, train, test, exploration_policy, gamma, exploration_policy, ISFunc, delta)

Predicted Baseline: -1.0242265270774407
Safety Baseline: -0.9720581231722768
---distance of return from lower bounds ---
Looseness Of Prediction : 0.08201903664707244
Looseness Of Safety : 0.029850632741908578


***Helper Functions***

In [11]:
def policy_softmax(policy):
    numerators = np.exp(policy)
    return (numerators.T / np.sum(numerators, axis=1)).T

In [12]:
#This sucks, never use it lol
def random_explore():
    best_policy = exploration_policy.copy()
    max_lower_bound = 0

    for i in range(100):
        random_step = np.random.normal(0, 1, best_policy.shape)
        new_policy = policy_softmax(best_policy + random_step)

        J_predicted_lower_bound = Safety_Prediction(train, exploration_policy, gamma, new_policy, ISFunc, delta, len(test))
        print("Predicted Lower Bound: ", J_predicted_lower_bound)
        if(J_predicted_lower_bound > max_lower_bound):
            print("Policy Updated")
            best_policy = new_policy
            max_lower_bound = J_predicted_lower_bound
        print("---------------")

    print(best_policy)
    return best_policy

In [13]:
# This also sucks, use the constrained variant
def unconstrained_explore():
    def objective(s):
        new_policy = policy_softmax(s.reshape(num_states, num_actions))
        avgIS = CalcAvgIS(train, exploration_policy, gamma, new_policy, ISFunc)
        print(avgIS)
        return - avgIS #minimizing
    
    es = cma.CMAEvolutionStrategy(num_states * num_actions * [0], 0.5)
    while not es.stop():
        solutions = es.ask()
        display.clear_output(True)
        print(policy_softmax(solutions[0].reshape(num_states, num_actions)))
        es.tell(solutions, [objective(s) for s in solutions])
        
    return policy_softmax(es.ask()[0].reshape(num_states, num_actions))

In [14]:
def inv_barrier_constrained_explore(lower_bound_goal, max_updates=10):
    #Helper Functions
    #This constraint makes sure our results are passing the safety prediction test
    def constraint(new_policy, avgIS):
        EPSILON = 0.001 #determines penalty for failing lower bound test
        J_predicted_lower_bound = Safety_Prediction(train, exploration_policy, gamma, new_policy, ISFunc, delta, len(test), avgIS)
        return 1 / (max(J_predicted_lower_bound - lower_bound_goal, EPSILON))
    
    #This objective results in maximizing the average importance sampling
    def objective(new_policy, avgIS):
        return - avgIS #minimizing
    
    def optimizing_function(s):
        #softmax generated policy
        new_policy = policy_softmax(s.reshape(num_states, num_actions))
        
        #caches the averageIS so we don't have to recompute
        avgIS = CalcAvgIS(train, exploration_policy, gamma, new_policy, ISFunc)
        
        #computes score from the objective and constraint
        objective_score = objective(new_policy, avgIS)
        constraint_score = constraint(new_policy, avgIS)
        score = objective_score + constraint_score
        print("score : " + str(score) + "\n---constraint_score : " + str(constraint_score) + "\n---objective_score : " + str(objective_score))
        return score
    
    i = 0
    es = cma.CMAEvolutionStrategy(num_states * num_actions * [0], 0.5)
    while (not es.stop() and i != max_updates):
        solutions = es.ask()
        display.clear_output(True)
        print("Update : " + str(i))
        print(policy_softmax(solutions[0].reshape(num_states, num_actions)))
        es.tell(solutions, [optimizing_function(s) for s in solutions])
        i += 1
        
    return es

***Explore Policies***

In [15]:
trained_es = inv_barrier_constrained_explore(target_performance, num_train_intervals)
new_policy = policy_softmax(trained_es.ask()[0].reshape(num_states, num_actions))

Update : 9
[[0.24756525 0.11091496 0.20831065 0.43320913]
 [0.07023039 0.23384795 0.04396644 0.65195522]
 [0.4735708  0.38978473 0.07069771 0.06594676]
 [0.4431821  0.33396297 0.12601004 0.09684489]
 [0.07206047 0.7327267  0.08740366 0.10780917]
 [0.67690333 0.13419453 0.15797351 0.03092863]
 [0.1250434  0.35260408 0.42520694 0.09714558]
 [0.65756555 0.05121622 0.10691529 0.18430294]
 [0.16299782 0.28168684 0.29759491 0.25772044]
 [0.08702494 0.46044608 0.17402696 0.27850201]
 [0.12092118 0.31753348 0.06011329 0.50143206]
 [0.12477153 0.34152171 0.39106637 0.14264039]
 [0.26772484 0.38139972 0.16871681 0.18215863]
 [0.53616967 0.31689048 0.11808321 0.02885664]
 [0.42635483 0.13966308 0.32775126 0.10623083]
 [0.36428052 0.20463437 0.19638667 0.23469844]
 [0.42748939 0.28395662 0.17314548 0.11540851]
 [0.23347382 0.42203264 0.22728923 0.11720431]
 [0.06621269 0.12632012 0.5173068  0.29016039]
 [0.45653394 0.02542966 0.1169286  0.4011078 ]
 [0.48354948 0.23686277 0.11129244 0.16829531]
 [

In [16]:
print("ES_Convergence : " + str(sum(trained_es.mean**2)))

ES_Convergence : 50.569243338700026


***Final Results***

In [21]:
ConfirmBounds(False, target_performance, train, test, exploration_policy, gamma, new_policy, ISFunc, delta)

Value: -0.8479867413873314
Predicted Baseline: -0.1614165548185133
Safety Baseline: 0.014594610851782888
---distance of return from lower bounds ---
Looseness Of Prediction : 0.6865701865688181
Looseness Of Safety : 0.8625813522391144
-------------------------------------------


In [23]:
J_safety_lower_bound = Safety_Test(test, exploration_policy, gamma, new_policy, ISFunc, delta)
SavePolicy(new_policy, J_safety_lower_bound, delta, USE_GRIDWORLD)

In [24]:
greed_policy = np.argmax(new_policy,axis=1)

if(USE_GRIDWORLD):
    print(np.insert(greed_policy, [12,16],[-1,-1]).reshape((5,5)))
else:
    print(greed_policy.reshape(6,-1))

[[ 3  3  1  3  1]
 [ 0  1  0  2  1]
 [ 0  1 -1  1  2]
 [ 2  0 -1  1  1]
 [ 1  0  3  0  3]]


In [None]:
pickle_name = "pickles\\saved-cma-" + ("gw" if USE_GRIDWORLD else "van") + "-" + ("pdis" if USE_PDIS else "is") + "-" + str(num_train_intervals) + ".pkl"
pickle.dump(trained_es, open(pickle_name, "wb"))
es = pickle.load(open(pickle_name, "rb"))
print("ES_Convergence : " + str(sum(es.mean**2)))