In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import numpy as np
import torch
from scipy import stats
from lib.DataManager import *
from lib.PolicyStats import *
import os
import cma
from cma.constraints_handler import AugmentedLagrangian, PopulationEvaluator
from IPython import display
import pickle

# Constants

In [13]:
#---SET PARAMS---
USE_GRIDWORLD = True
USE_PDIS = True
num_train_intervals = 40
percent_increase = 0.1

num_states = 18
if(USE_GRIDWORLD):
    num_states = 23
num_actions = 4
gamma = 0.95
delta = 0.01 #1 - delta, confidence

In [4]:
path = "data\data.csv"
if(USE_GRIDWORLD):
    path = "data\gridworld_data.csv"

histories = GetHistories(path, gamma)

line 0
line 1000000
line 2000000
line 3000000
line 4000000
line 5000000
line 6000000
line 7000000
-0.4547553405083997


In [7]:
avg_exploratory_J = GetAverageReturn(histories)

Average Baseline Return : -0.9422074904303682


***Set Target***

In [8]:
target_performance = GetTargetPerformance(USE_GRIDWORLD, avg_exploratory_J, percent_increase)

Target Performance : -0.8479867413873314


***Split Data***

In [9]:
train, test = SplitData(histories)

80000
20000


***Get Exploration Policy***

In [10]:
exploration_policy = GetPolicy(train, num_states, num_actions, 1000)
print(exploration_policy)

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


***Pick Importance Sampling Function***

In [14]:
ISFunc = ImportanceSampling
if(USE_PDIS):
    ISFunc = PDImportanceSampling

***Evaluate Current Policy On Candidate/Safety Data***

In [15]:
ConfirmBounds(True, avg_exploratory_J, train, test, exploration_policy, gamma, exploration_policy, ISFunc, delta)

Value: -0.9422074904303682
Predicted Baseline: -1.182750951767137
Safety Baseline: 0.4322671551878296
---distance of return from lower bounds ---
Looseness Of Prediction : 0.24054346133676885
Looseness Of Safety : -1.3744746456181978


Exception: Lower Bound Greater Than Average Return!

***Helper Functions***

In [11]:
def policy_softmax(policy):
    numerators = np.exp(policy)
    return (numerators.T / np.sum(numerators, axis=1)).T

In [12]:
#This sucks, never use it lol
def random_explore():
    best_policy = exploration_policy.copy()
    max_lower_bound = 0

    for i in range(100):
        random_step = np.random.normal(0, 1, best_policy.shape)
        new_policy = policy_softmax(best_policy + random_step)

        J_predicted_lower_bound = Safety_Prediction(train, exploration_policy, gamma, new_policy, ISFunc, delta, len(test))
        print("Predicted Lower Bound: ", J_predicted_lower_bound)
        if(J_predicted_lower_bound > max_lower_bound):
            print("Policy Updated")
            best_policy = new_policy
            max_lower_bound = J_predicted_lower_bound
        print("---------------")

    print(best_policy)
    return best_policy

In [13]:
# This also sucks, use the constrained variant
def unconstrained_explore():
    def objective(s):
        new_policy = policy_softmax(s.reshape(num_states, num_actions))
        avgIS = CalcAvgIS(train, exploration_policy, gamma, new_policy, ISFunc)
        print(avgIS)
        return - avgIS #minimizing
    
    es = cma.CMAEvolutionStrategy(num_states * num_actions * [0], 0.5)
    while not es.stop():
        solutions = es.ask()
        display.clear_output(True)
        print(policy_softmax(solutions[0].reshape(num_states, num_actions)))
        es.tell(solutions, [objective(s) for s in solutions])
        
    return policy_softmax(es.ask()[0].reshape(num_states, num_actions))

In [14]:
def inv_barrier_constrained_explore(lower_bound_goal, max_updates=10):
    #Helper Functions
    #This constraint makes sure our results are passing the safety prediction test
    def constraint(new_policy, avgIS):
        EPSILON = 0.001 #determines penalty for failing lower bound test
        J_predicted_lower_bound = Safety_Prediction(train, exploration_policy, gamma, new_policy, ISFunc, delta, len(test), avgIS)
        return 1 / (max(J_predicted_lower_bound - lower_bound_goal, EPSILON)) #TODO validate constraint
    
    #This objective results in maximizing the average importance sampling
    def objective(new_policy, avgIS):
        return - avgIS #minimizing
    
    def optimizing_function(s):
        #softmax generated policy
        new_policy = policy_softmax(s.reshape(num_states, num_actions))
        
        #caches the averageIS so we don't have to recompute
        avgIS = CalcAvgIS(train, exploration_policy, gamma, new_policy, ISFunc)
        
        #computes score from the objective and constraint
        objective_score = objective(new_policy, avgIS)
        constraint_score = constraint(new_policy, avgIS)
        score = objective_score + constraint_score
        print("score : " + str(score) + "\n---constraint_score : " + str(constraint_score) + "\n---objective_score : " + str(objective_score))
        return score
    
    i = 0
    es = cma.CMAEvolutionStrategy(num_states * num_actions * [0], 0.5)
    while (not es.stop() and i != max_updates):
        solutions = es.ask()
        display.clear_output(True)
        print("Update : " + str(i))
        print(policy_softmax(solutions[0].reshape(num_states, num_actions)))
        es.tell(solutions, [optimizing_function(s) for s in solutions])
        i += 1
        
    return es

***Explore Policies***

In [15]:
trained_es = inv_barrier_constrained_explore(target_performance, num_train_intervals)
new_policy = policy_softmax(trained_es.ask()[0].reshape(num_states, num_actions))

Update : 39
[[0.1411552  0.32763659 0.14267479 0.38853341]
 [0.29040004 0.19519507 0.37528888 0.13911602]
 [0.18865509 0.13448424 0.47577341 0.20108727]
 [0.09621078 0.13973157 0.72538069 0.03867697]
 [0.10482291 0.02555334 0.29842402 0.57119974]
 [0.06412993 0.55789676 0.23723216 0.14074116]
 [0.005124   0.93733947 0.02520555 0.03233098]
 [0.08643774 0.43526381 0.382331   0.09596746]
 [0.04675187 0.05109971 0.77962925 0.12251917]
 [0.15201966 0.22511192 0.32056988 0.30229854]
 [0.74563201 0.04768682 0.07138929 0.13529189]
 [0.0703789  0.26940693 0.05041009 0.60980408]
 [0.24291501 0.25638222 0.23439284 0.26630993]
 [0.20042386 0.08997213 0.64603063 0.06357338]
 [0.27018119 0.32188015 0.02036831 0.38757035]
 [0.93287011 0.05237133 0.00325309 0.01150548]
 [0.09475624 0.15403891 0.22790837 0.52329648]
 [0.00665665 0.41395282 0.46927094 0.11011958]
 [0.04969172 0.10690393 0.8090994  0.03430495]
 [0.57109061 0.00496354 0.06286685 0.36107901]
 [0.06113076 0.02622561 0.7271175  0.18552614]
 

In [16]:
print("ES_Convergence : " + str(sum(trained_es.mean**2)))

ES_Convergence : 150.313497890791


***Final Results***

In [17]:
ConfirmBounds(False, target_performance, train, test, exploration_policy, gamma, new_policy, ISFunc, delta)

Value: -0.8479867413873314
Predicted Baseline: -0.6104487121582425
Safety Baseline: 0.4380582971385134
---distance of return from lower bounds ---
Looseness Of Prediction : 0.23753802922908895
Looseness Of Safety : 1.2860450385258448
-------------------------------------------


In [18]:
J_safety_lower_bound = Safety_Test(test, exploration_policy, gamma, new_policy, ISFunc, delta)
SavePolicy(new_policy, J_safety_lower_bound, delta, USE_GRIDWORLD)

In [19]:
greed_policy = np.argmax(new_policy,axis=1)

if(USE_GRIDWORLD):
    print(np.insert(greed_policy, [12,16],[-1,-1]).reshape((5,5)))
else:
    print(greed_policy.reshape(6,-1))

[[ 1  0  2  2  3]
 [ 1  1  1  3  3]
 [ 0  1 -1  2  2]
 [ 1  0 -1  3  2]
 [ 2  0  2  1  0]]


In [20]:
pickle_name = "pickles\\saved-cma-" + ("gw" if USE_GRIDWORLD else "van") + "-" + ("pdis" if USE_PDIS else "is") + "-" + str(num_train_intervals) + ".pkl"
pickle.dump(trained_es, open(pickle_name, "wb"))
es = pickle.load(open(pickle_name, "rb"))
print("ES_Convergence : " + str(sum(es.mean**2)))

ES_Convergence : 150.313497890791
