In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import torch
from scipy import stats
from lib.DataManager import *
from lib.PolicyStats import *
import os
import cma
from cma.constraints_handler import AugmentedLagrangian, PopulationEvaluator
from IPython import display
import pickle
import random

# Constants
### Refer to the README for how to set `seed`, `USE_PDIS`, and `num_train_intervals`.
### All other variables should be left unchanged.

In [3]:
#---SET PARAMS---
# Run configuration read by the cells below.
USE_GRIDWORLD = False  # True selects the gridworld dataset and its 23-state layout
USE_PDIS = False  # True selects PDImportanceSampling instead of ImportanceSampling
num_train_intervals = 5  # passed as max_updates to inv_barrier_constrained_explore
percent_increase = 0.1  # passed to GetTargetPerformance together with the exploratory average return
gamma = 0.95  # forwarded to GetHistories and the importance-sampling helpers; presumably the discount factor - confirm in lib
delta = 0.01 # confidence level is 1 - delta
seed = 768362 # seed handed to the CMA-ES optimizer; see the README for which value to use

***Loads world and establishes world states***

In [4]:
# The selected environment fixes the number of states; the action count is
# the same in both cases.
num_states = 23 if USE_GRIDWORLD else 18
num_actions = 4

In [None]:
# Build the dataset location with os.path.join so the path separator matches
# the host OS (a hard-coded backslash only resolves on Windows and relies on
# invalid string escape sequences).
path = os.path.join("data", "data.csv")
if USE_GRIDWORLD:
    path = os.path.join("data", "gridworld_data.csv")

# Load the recorded histories from the selected file; gamma is forwarded to the loader.
histories = GetHistories(path, gamma)

line 0
line 1000000
line 2000000
line 3000000
line 4000000
line 5000000
line 6000000
line 7000000
line 8000000
line 9000000
line 10000000
line 11000000
line 12000000
line 13000000
line 14000000
line 15000000
line 16000000
line 17000000
line 18000000
line 19000000
line 20000000
line 21000000
line 22000000
line 23000000
line 24000000
line 25000000


***Data Analysis on Average Return of Exploratory Policy***

In [None]:
# Average return over the loaded histories (computed by GetAverageReturn);
# used below to derive the target performance and as the reference value
# in the first ConfirmBounds call.
avg_exploratory_J = GetAverageReturn(histories)

***Set Target***

In [None]:
# Performance goal for the policy search, derived by GetTargetPerformance from
# the environment flag, the exploratory average return and percent_increase.
# NOTE(review): the exact rule lives in the imported helper - confirm there.
target_performance = GetTargetPerformance(USE_GRIDWORLD, avg_exploratory_J, percent_increase)

***Split Data***

In [None]:
# Split the histories in two: `train` drives the policy search and safety
# prediction, `test` is reserved for the final Safety_Test.
# NOTE(review): split ratio / shuffling are decided inside SplitData.
train, test = SplitData(histories)

***Get Exploration Policy***

In [None]:
# Obtain the exploration policy from the training split via GetPolicy.
# NOTE(review): the meaning of the literal 1000 is not visible here - confirm
# against GetPolicy's signature.
exploration_policy = GetPolicy(train, num_states, num_actions, 1000)
print(exploration_policy)

In [None]:
# Persist the exploration policy as exploration_policy.npy (path is relative
# to the current working directory).
np.save("exploration_policy.npy", exploration_policy)

***Pick Importance Sampling Function***

In [None]:
# Estimator used everywhere below: PDImportanceSampling when USE_PDIS is set,
# ImportanceSampling otherwise.
ISFunc = PDImportanceSampling if USE_PDIS else ImportanceSampling

***Evaluate Current Policy On Candidate/Safety Data***

In [None]:
# Baseline check: exploration_policy is supplied for both policy arguments, so
# the exploration policy is evaluated against itself, with avg_exploratory_J
# as the reference value.
# NOTE(review): the meaning of the leading True flag is defined in ConfirmBounds.
ConfirmBounds(True, avg_exploratory_J, train, test, exploration_policy, gamma, exploration_policy, ISFunc, delta)

***Helper Functions***

In [None]:
def policy_softmax(policy):
    """Turn a 2-D array of action scores into a row-stochastic policy.

    A softmax is applied independently to every row, so each row of the
    result is non-negative and sums to 1.

    Parameters
    ----------
    policy : array-like, shape (n_rows, n_cols)
        Unnormalized scores (logits); callers in this notebook pass one row
        per state and one column per action.

    Returns
    -------
    numpy.ndarray
        Array of the same shape holding the row-wise softmax.
    """
    logits = np.asarray(policy)
    # Shifting each row by its maximum does not change the softmax value but
    # keeps np.exp from overflowing to inf, which would produce NaN rows.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    numerators = np.exp(shifted)
    return numerators / np.sum(numerators, axis=1, keepdims=True)

In [None]:
# Naive random-search baseline. It performs poorly and is kept for reference only.
def random_explore():
    """Hill-climb over policies using 100 random Gaussian perturbations.

    Starts from a copy of ``exploration_policy``. On every iteration a
    standard-normal step is added to the current best policy, the sum is
    passed through ``policy_softmax``, and the candidate replaces the best
    policy when its predicted safety lower bound exceeds the best bound seen
    so far (initially 0). Progress is printed on every iteration.

    Returns
    -------
    numpy.ndarray
        The best policy found (also printed before returning).
    """
    incumbent = exploration_policy.copy()
    incumbent_bound = 0

    for _ in range(100):
        perturbation = np.random.normal(0, 1, incumbent.shape)
        candidate = policy_softmax(incumbent + perturbation)

        candidate_bound = Safety_Prediction(
            train, exploration_policy, gamma, candidate, ISFunc, delta, len(test)
        )
        print("Predicted Lower Bound: ", candidate_bound)

        improved = candidate_bound > incumbent_bound
        if improved:
            print("Policy Updated")
            incumbent, incumbent_bound = candidate, candidate_bound
        print("---------------")

    print(incumbent)
    return incumbent

In [None]:
# Unconstrained CMA-ES baseline. Prefer the constrained variant.
def unconstrained_explore():
    """Maximize the average importance-sampling estimate with CMA-ES.

    The search vector has ``num_states * num_actions`` entries, starts at all
    zeros with an initial step size of 0.5, and is mapped to a policy by
    reshaping and applying ``policy_softmax``. No safety constraint is applied.

    Returns
    -------
    numpy.ndarray
        Policy built from the first candidate of one further ``ask()`` call
        made after the optimizer reports a stop condition.
    """
    def to_policy(flat_params):
        # Flat search vector -> (num_states, num_actions) stochastic policy.
        return policy_softmax(flat_params.reshape(num_states, num_actions))

    def negated_avg_is(flat_params):
        avg_is = CalcAvgIS(train, exploration_policy, gamma, to_policy(flat_params), ISFunc)
        print(avg_is)
        # CMA-ES minimizes, so negate the quantity we want to maximize.
        return -avg_is

    optimizer = cma.CMAEvolutionStrategy([0] * (num_states * num_actions), 0.5)
    while True:
        if optimizer.stop():
            break
        population = optimizer.ask()
        display.clear_output(True)
        print(to_policy(population[0]))

        fitnesses = []
        for candidate in population:
            fitnesses.append(negated_avg_is(candidate))
        optimizer.tell(population, fitnesses)

    return to_policy(optimizer.ask()[0])

In [None]:
def inv_barrier_constrained_explore(lower_bound_goal, max_updates=10):
    """Run CMA-ES on the average IS estimate with an inverse-barrier safety penalty.

    Each candidate vector is reshaped to ``(num_states, num_actions)`` and
    turned into a policy with ``policy_softmax``. Its score (to be minimized)
    is ``-avgIS + 1 / max(predicted_lower_bound - lower_bound_goal, EPSILON)``,
    so candidates whose predicted lower bound does not clear the goal by more
    than EPSILON receive the maximum penalty ``1 / EPSILON``.

    Parameters
    ----------
    lower_bound_goal :
        Value the predicted safety lower bound is compared against.
    max_updates : int, default 10
        Number of ask/tell updates after which the loop stops, unless the
        optimizer reports a stop condition first.

    Returns
    -------
    cma.CMAEvolutionStrategy
        The optimizer in its state after the last update. It starts from the
        zero vector with step size 0.8 and uses the module-level ``seed``.
    """
    EPSILON = 0.001  # floor on the safety margin; caps the barrier penalty at 1 / EPSILON

    def barrier_penalty(candidate_policy, avg_is):
        # Inverse barrier on how far the predicted lower bound sits above the goal.
        predicted_bound = Safety_Prediction(
            train, exploration_policy, gamma, candidate_policy, ISFunc, delta, len(test), avg_is
        )
        margin = predicted_bound - lower_bound_goal
        return 1 / (max(margin, EPSILON))  # TODO validate constraint

    def optimizing_function(s):
        candidate_policy = policy_softmax(s.reshape(num_states, num_actions))

        # Computed once and shared by the objective and the safety prediction.
        avg_is = CalcAvgIS(train, exploration_policy, gamma, candidate_policy, ISFunc)

        # Maximizing the average IS estimate == minimizing its negation.
        objective_score = -avg_is
        constraint_score = barrier_penalty(candidate_policy, avg_is)
        score = objective_score + constraint_score
        print(
            "score : " + str(score)
            + "\n---constraint_score : " + str(constraint_score)
            + "\n---objective_score : " + str(objective_score)
        )
        return score

    optimizer = cma.CMAEvolutionStrategy([0] * (num_states * num_actions), 0.8, {'seed':seed})
    update_count = 0
    while not (optimizer.stop() or update_count == max_updates):
        population = optimizer.ask()
        display.clear_output(True)
        print("Update : " + str(update_count))
        print(policy_softmax(population[0].reshape(num_states, num_actions)))

        scores = []
        for candidate in population:
            scores.append(optimizing_function(candidate))
        optimizer.tell(population, scores)
        update_count += 1

    return optimizer

***Explore Policies***

In [None]:
# Run the barrier-penalized CMA-ES search with target_performance as the
# lower-bound goal, for at most num_train_intervals updates.
trained_es = inv_barrier_constrained_explore(target_performance, num_train_intervals)

In [None]:
# Checkpoint the trained optimizer under pickles/ and read it straight back to
# confirm the pickle round-trips. The file name encodes environment, estimator
# and number of training updates.
pickle_name = os.path.join(
    "pickles",
    "is_1-cma-" + ("gw" if USE_GRIDWORLD else "van") + "-" + ("pdis" if USE_PDIS else "is") + "-" + str(num_train_intervals) + ".pkl",
)
# Create the output folder up front so a finished training run is not lost to
# a missing directory.
os.makedirs(os.path.dirname(pickle_name), exist_ok=True)
# Context managers close (and flush) the handle before the file is re-read.
with open(pickle_name, "wb") as pickle_file:
    pickle.dump(trained_es, pickle_file)
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(pickle_name, "rb") as pickle_file:
    es = pickle.load(pickle_file)
print("ES_Convergence : " + str(sum(es.mean**2)))

In [None]:
# Build the candidate policy from the trained optimizer: take the first vector
# returned by ask(), reshape it to (num_states, num_actions) and apply softmax.
# NOTE(review): ask() returns freshly sampled candidates, so this is one sample
# from the search distribution rather than its mean or the best solution seen -
# confirm this is intended.
new_policy = policy_softmax(trained_es.ask()[0].reshape(num_states, num_actions))

In [None]:
# Report the sum of squared entries of the optimizer's mean vector
# (i.e. its squared Euclidean norm).
print("ES_Convergence : " + str(sum(trained_es.mean**2)))

***Final Results***

In [None]:
# Check the learned policy (new_policy) against target_performance, with
# exploration_policy supplied as the policy that generated the data.
# NOTE(review): the meaning of the leading False flag is defined in ConfirmBounds.
ConfirmBounds(False, target_performance, train, test, exploration_policy, gamma, new_policy, ISFunc, delta)

In [None]:
# Run the safety test for new_policy on the held-out test split, then hand the
# policy, the resulting lower bound, delta and the environment flag to
# SaveNumpyPolicy. NOTE(review): output location/format is decided by that helper.
J_safety_lower_bound = Safety_Test(test, exploration_policy, gamma, new_policy, ISFunc, delta)
SaveNumpyPolicy(new_policy, J_safety_lower_bound, delta, USE_GRIDWORLD)

In [None]:
# Greedy action per state: column index of the largest probability in each row.
greed_policy = new_policy.argmax(axis=1)

if not USE_GRIDWORLD:
    # Drop the last two states so the remaining 16 entries form a 4x4 grid.
    grid_view = greed_policy[:-2].reshape(4, 4)
else:
    # Insert -1 placeholders before original indices 12 and 16 so the
    # 23 states pad out to a 5x5 grid.
    grid_view = np.insert(greed_policy, [12, 16], [-1, -1]).reshape((5, 5))
print(grid_view)