In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import torch
from scipy import stats
from lib.DataManager import *
from lib.PolicyStats import *
import os
import cma
from cma.constraints_handler import AugmentedLagrangian, PopulationEvaluator
from IPython import display
import pickle

# Constants

In [4]:
#---SET PARAMS---
# Experiment switches.
USE_GRIDWORLD = False  # True selects the gridworld data file and its state count
USE_PDIS = True  # True selects PDImportanceSampling instead of ImportanceSampling

# Search settings.
num_train_intervals = 50  # passed as max_updates to inv_barrier_constrained_explore
percent_increase = 0.1  # forwarded to GetTargetPerformance

# Problem dimensions: 23 states for gridworld, 18 otherwise.
num_states = 23 if USE_GRIDWORLD else 18
num_actions = 4

gamma = 0.95  # forwarded to GetHistories and the IS/safety helpers; presumably the discount factor
delta = 0.01  # the confidence level is 1 - delta

In [5]:
# Build the CSV location with os.path.join so it resolves on every OS. The
# previous backslash literals only worked on Windows and relied on the invalid
# escape sequences "\d" and "\g" being passed through unchanged.
path = os.path.join("data", "gridworld_data.csv" if USE_GRIDWORLD else "data.csv")

# GetHistories reads the file at `path`; gamma is forwarded to it as well.
# The recorded run logged progress up to line 62000000, so this cell is slow.
histories = GetHistories(path, gamma)

line 0
line 1000000
line 2000000
line 3000000
line 4000000
line 5000000
line 6000000
line 7000000
line 8000000
line 9000000
line 10000000
line 11000000
line 12000000
line 13000000
line 14000000
line 15000000
line 16000000
line 17000000
line 18000000
line 19000000
line 20000000
line 21000000
line 22000000
line 23000000
line 24000000
line 25000000
line 26000000
line 27000000
line 28000000
line 29000000
line 30000000
line 31000000
line 32000000
line 33000000
line 34000000
line 35000000
line 36000000
line 37000000
line 38000000
line 39000000
line 40000000
line 41000000
line 42000000
line 43000000
line 44000000
line 45000000
line 46000000
line 47000000
line 48000000
line 49000000
line 50000000
line 51000000
line 52000000
line 53000000
line 54000000
line 55000000
line 56000000
line 57000000
line 58000000
line 59000000
line 60000000
line 61000000
line 62000000


In [6]:
# Average return of the logged histories; used below as the baseline value.
# The helper also appears to print it ("Average Baseline Return : ...").
avg_exploratory_J = GetAverageReturn(histories)

Average Baseline Return : 1.172600237817154


***Set Target***

In [7]:
# Target value derived by GetTargetPerformance from the dataset flag, the baseline
# return and percent_increase. It is later passed as lower_bound_goal to the
# constrained search. NOTE(review): the exact formula lives in lib — confirm there.
target_performance = GetTargetPerformance(USE_GRIDWORLD, avg_exploratory_J, percent_increase)

Target Performance : 1.556907


***Split Data***

In [8]:
# Split the histories into `train` (used to search for / predict a candidate policy)
# and `test` (used for the final safety test). The recorded run printed 800000 and
# 200000, presumably the sizes of the two splits.
train, test = SplitData(histories)

800000
200000


***Get Exploration Policy***

In [9]:
# Estimate the exploration policy from the training split as a
# num_states x num_actions table of action probabilities.
# NOTE(review): the meaning of the trailing 1000 is not visible here — check GetPolicy in lib.
exploration_policy = GetPolicy(train, num_states, num_actions, 1000)
# NOTE(review): in the recorded output one row is all zeros, i.e. not a valid
# distribution — verify that the importance-sampling helpers tolerate it.
print(exploration_policy)

[[0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.    0.    0.    0.   ]
 [0.25  0.25  0.25  0.25 ]
 [0.375 0.375 0.125 0.125]]


In [10]:
# Persist the estimated exploration policy to exploration_policy.npy in the working directory.
np.save("exploration_policy.npy", exploration_policy)

***Pick Importance Sampling Function***

In [9]:
# Estimator handed to every IS/safety helper below: PDImportanceSampling when
# USE_PDIS is set, ImportanceSampling otherwise.
ISFunc = PDImportanceSampling if USE_PDIS else ImportanceSampling

***Evaluate Current Policy On Candidate/Safety Data***

In [10]:
# Sanity check: exploration_policy is passed both as the data-generating policy and as
# the policy being evaluated, with the baseline return as the reference value.
# The helper prints the predicted and safety lower bounds and how far each lies below
# the reference. NOTE(review): the meaning of the leading True flag is defined in lib.
ConfirmBounds(True, avg_exploratory_J, train, test, exploration_policy, gamma, exploration_policy, ISFunc, delta)

Value: 1.172600237817154
Predicted Baseline: 1.1556262108490003
Safety Baseline: 1.1682832178181233
---distance of return from lower bounds ---
Looseness Of Prediction : 0.016974026968153755
Looseness Of Safety : 0.00431701999903078
-------------------------------------------


***Helper Functions***

In [11]:
def policy_softmax(policy):
    """Convert a 2-D array of per-state action scores into probabilities.

    The softmax is applied independently to every row, so each row of the
    result is non-negative and sums to 1.

    Parameters
    ----------
    policy : array-like, shape (num_states, num_actions)
        Unnormalised scores (logits), one row per state.

    Returns
    -------
    numpy.ndarray
        Array of the same shape containing the row-wise softmax.
    """
    logits = np.asarray(policy, dtype=float)
    # Subtracting each row's maximum does not change the softmax mathematically,
    # but it keeps np.exp from overflowing to inf (which would turn the row into
    # nan) when the optimiser proposes large logits.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    numerators = np.exp(shifted)
    return numerators / np.sum(numerators, axis=1, keepdims=True)

In [12]:
# Naive random-search baseline. It performs poorly; do not use it.
def random_explore():
    """Try 100 random perturbations of the incumbent policy and keep improvements.

    Every trial adds standard-normal noise to the incumbent policy table, maps the
    sum through ``policy_softmax`` and scores the candidate with
    ``Safety_Prediction`` on the training data. A candidate replaces the incumbent
    only when its predicted lower bound is strictly greater than the best bound
    seen so far (which starts at 0).

    Returns
    -------
    The incumbent policy after the last trial (a copy of ``exploration_policy``
    when no candidate was accepted).
    """
    incumbent = exploration_policy.copy()
    best_bound = 0
    trial_count = 100

    for _ in range(trial_count):
        noise = np.random.normal(0, 1, incumbent.shape)
        candidate = policy_softmax(incumbent + noise)

        candidate_bound = Safety_Prediction(
            train, exploration_policy, gamma, candidate, ISFunc, delta, len(test)
        )
        print("Predicted Lower Bound: ", candidate_bound)
        if candidate_bound > best_bound:
            print("Policy Updated")
            incumbent, best_bound = candidate, candidate_bound
        print("---------------")

    print(incumbent)
    return incumbent

In [13]:
# Unconstrained CMA-ES search. It performs poorly; use the constrained variant.
def unconstrained_explore():
    """Maximise the average importance-sampling estimate with CMA-ES.

    The search runs over a flat vector of ``num_states * num_actions`` logits,
    started at all zeros with an initial step size of 0.5, and continues until
    CMA-ES reports a stop condition. No safety constraint is applied.

    Returns
    -------
    numpy.ndarray
        The ``(num_states, num_actions)`` softmax policy located at the final
        mean of the search distribution.
    """
    def objective(s):
        # Score one flat logit vector; CMA-ES minimises, hence the negation.
        new_policy = policy_softmax(s.reshape(num_states, num_actions))
        avgIS = CalcAvgIS(train, exploration_policy, gamma, new_policy, ISFunc)
        print(avgIS)
        return - avgIS #minimizing

    es = cma.CMAEvolutionStrategy(num_states * num_actions * [0], 0.5)
    while not es.stop():
        solutions = es.ask()
        # Replace the previous generation's output with the first candidate of this one.
        display.clear_output(True)
        print(policy_softmax(solutions[0].reshape(num_states, num_actions)))
        es.tell(solutions, [objective(s) for s in solutions])

    # Return the distribution mean, the optimiser's current estimate of the
    # optimum. The previous es.ask()[0] drew a *new random sample* around the
    # mean, so the returned policy was never evaluated and changed run to run.
    return policy_softmax(np.asarray(es.mean).reshape(num_states, num_actions))

In [14]:
def inv_barrier_constrained_explore(lower_bound_goal, max_updates=10):
    """Run a CMA-ES policy search with an inverse-barrier safety penalty.

    Each candidate is a flat vector of ``num_states * num_actions`` logits that is
    reshaped and passed through ``policy_softmax``. Its score (to be minimised) is
    the negated average importance-sampling estimate plus a penalty equal to the
    reciprocal of the margin between the predicted lower bound and
    ``lower_bound_goal``.

    Parameters
    ----------
    lower_bound_goal
        Value the predicted lower bound should exceed.
    max_updates : int
        Number of ask/tell updates after which the loop ends, unless CMA-ES
        reports a stop condition first.

    Returns
    -------
    The ``cma.CMAEvolutionStrategy`` instance as left by the last update.
    """
    def constraint(new_policy, avgIS):
        # The margin is clamped at EPSILON, so a candidate that misses the goal
        # (or barely meets it) receives the maximum penalty of 1 / EPSILON.
        EPSILON = 0.001
        predicted_bound = Safety_Prediction(
            train, exploration_policy, gamma, new_policy, ISFunc, delta, len(test), avgIS
        )
        margin = max(predicted_bound - lower_bound_goal, EPSILON)
        return 1 / margin  #TODO validate constraint

    def objective(new_policy, avgIS):
        # Negated because CMA-ES minimises while we want a large average IS value.
        return - avgIS

    def optimizing_function(s):
        candidate = policy_softmax(s.reshape(num_states, num_actions))

        # Computed once and shared by the objective and the constraint.
        avgIS = CalcAvgIS(train, exploration_policy, gamma, candidate, ISFunc)

        objective_score = objective(candidate, avgIS)
        constraint_score = constraint(candidate, avgIS)
        score = objective_score + constraint_score
        report = "\n".join([
            "score : " + str(score),
            "---constraint_score : " + str(constraint_score),
            "---objective_score : " + str(objective_score),
        ])
        print(report)
        return score

    es = cma.CMAEvolutionStrategy([0] * (num_states * num_actions), 0.5)
    update = 0
    while (not es.stop()) and update != max_updates:
        population = es.ask()
        # Keep only the current update's output visible in the notebook.
        display.clear_output(True)
        print("Update : " + str(update))
        print(policy_softmax(population[0].reshape(num_states, num_actions)))
        fitness = [optimizing_function(member) for member in population]
        es.tell(population, fitness)
        update += 1

    return es

***Explore Policies***

In [None]:
# Run the constrained CMA-ES search for at most num_train_intervals updates.
trained_es = inv_barrier_constrained_explore(target_performance, num_train_intervals)
# Take the distribution mean as the learned solution. The previous
# trained_es.ask()[0] drew a fresh random candidate around the mean; because the
# search is capped by update count, the step size can still be large, making the
# extracted policy noisy and different on every run.
new_policy = policy_softmax(np.asarray(trained_es.mean).reshape(num_states, num_actions))

Update : 0
[[0.11873524 0.30342526 0.2169124  0.3609271 ]
 [0.31807093 0.30133512 0.2793935  0.10120045]
 [0.15917271 0.4099914  0.31147731 0.11935858]
 [0.1661041  0.09644574 0.12767748 0.60977268]
 [0.18131523 0.24420533 0.46135757 0.11312187]
 [0.33011637 0.30042842 0.15815074 0.21130447]
 [0.23128722 0.11813172 0.32221375 0.32836731]
 [0.29721434 0.14594431 0.40024104 0.15660031]
 [0.14084254 0.14172576 0.31364237 0.40378932]
 [0.24262718 0.27439315 0.36441419 0.11856547]
 [0.12472247 0.39374767 0.12332225 0.35820762]
 [0.58766564 0.10989728 0.1191396  0.18329749]
 [0.17372856 0.29294101 0.35486571 0.17846472]
 [0.26003114 0.15172827 0.44539932 0.14284127]
 [0.19692554 0.28900267 0.29847441 0.21559738]
 [0.21403272 0.40145988 0.25299075 0.13151665]
 [0.18362686 0.25810009 0.37643507 0.18183798]
 [0.33668036 0.14095224 0.3328288  0.1895386 ]]
score : 999.0325050619156
---constraint_score : 1000.0
---objective_score : -0.9674949380844687


In [None]:
# Sum of squared components of the CMA-ES distribution mean. The search starts from
# an all-zero mean, so this shows how far the mean has moved from its starting point.
print("ES_Convergence : " + str(sum(trained_es.mean**2)))

***Final Results***

In [None]:
# Same check as for the exploration policy, now with new_policy as the evaluated policy
# and target_performance as the reference value.
# NOTE(review): the meaning of the leading False flag is defined in lib.
ConfirmBounds(False, target_performance, train, test, exploration_policy, gamma, new_policy, ISFunc, delta)

In [None]:
# Lower bound for new_policy computed by Safety_Test on the held-out `test` split.
J_safety_lower_bound = Safety_Test(test, exploration_policy, gamma, new_policy, ISFunc, delta)
# Hand the policy, its bound, delta and the dataset flag to SavePolicy.
SavePolicy(new_policy, J_safety_lower_bound, delta, USE_GRIDWORLD)

In [None]:
# Greedy action per state: the index of the most probable action in each row.
greed_policy = new_policy.argmax(axis=1)

# Show the greedy actions laid out as a grid.
if not USE_GRIDWORLD:
    print(greed_policy.reshape(6, -1))
else:
    # Insert -1 placeholders before original indices 12 and 16 so the 23 states
    # fill a 5x5 grid.
    print(np.insert(greed_policy, [12, 16], [-1, -1]).reshape((5, 5)))

In [None]:
# Checkpoint name encodes the dataset ("gw"/"van"), the estimator ("pdis"/"is") and
# the update budget. os.path.join keeps the path valid on every OS.
pickle_name = os.path.join(
    "pickles",
    "saved-cma-" + ("gw" if USE_GRIDWORLD else "van") + "-" + ("pdis" if USE_PDIS else "is") + "-" + str(num_train_intervals) + ".pkl",
)
# Make sure the target directory exists so the dump cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(pickle_name), exist_ok=True)

# Context managers guarantee the file is flushed and closed before it is read back.
with open(pickle_name, "wb") as pickle_file:
    pickle.dump(trained_es, pickle_file)

# Round-trip check: reload the checkpoint and print the same convergence figure.
# NOTE: pickle.load can execute arbitrary code — only load checkpoints you created.
with open(pickle_name, "rb") as pickle_file:
    es = pickle.load(pickle_file)
print("ES_Convergence : " + str(sum(es.mean**2)))