In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the lib.* helpers) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from scipy import stats
from lib.DataManager import *
from lib.PolicyStats import *
from lib.Gridworld import *
from lib.Mockworld import *
import os
import cma
from cma.constraints_handler import AugmentedLagrangian, PopulationEvaluator
from IPython import display
import pickle
import random

***Set Params***

In [3]:
#---SET PARAMS---
# Experiment configuration; every cell below reads these module-level names.

# Selects the gridworld variant: data file, state count and return evaluator.
USE_GRIDWORLD = False
# True makes PDImportanceSampling the primary estimator; False makes it ImportanceSampling.
USE_PDIS = True
USE_BOTH_PDIS_AND_IS = False # If True, we take policies that pass both tests
# Passed to GetTargetPerformance together with the baseline return.
percent_increase = 0.1
# Number of accepted policies the sampling loop collects before stopping.
num_policies = 20
# Pickled CMA-ES object that candidate policies are sampled from.
es_path = "pickles/pdis_0.pkl"
# Passed to the data loader, the bound checks and the return evaluators
# (presumably the discount factor -- confirm in lib).
gamma = 0.95
delta = 0.01 #1 - delta, confidence
random_seed = 877496444 #NOTE, this seed is for numpy operations. Since we load in CMA-ES from file its seed and state are embedded in the save and constant
# Sub-folder of text_policies that accepted policies are written to.
# NOTE(review): the trailing backslash separator is Windows-specific.
sub_dir = "pdis_0\\"
# Added to the loop index to number the saved policy files.
title_offset = 61 # 1, 11, 21, so on

# seeds 508 (is_1.pkl), 1337 (is_0.pkl), 88885555 (pdis_1.pkl), 877496444 (pdis_0.pkl), 3125832 (pdis_2.pkl)
np.random.seed(random_seed) #this is to recreate the same policies

***Setup Environment***

In [4]:
# Problem dimensions: 23 states when the gridworld is selected, 18 otherwise;
# both variants use 4 actions.
if USE_GRIDWORLD:
    print("Using Gridworld")
num_states = 23 if USE_GRIDWORLD else 18
num_actions = 4

In [5]:
# Pick the recorded-episode CSV for the selected environment and load it.
# The path is built with os.path.join instead of a hard-coded backslash: the
# old literals contained the invalid escape sequences "\d" and "\g" (a warning
# on current Python versions) and only resolved on Windows. On Windows the
# joined result is the same string as before.
data_file = "gridworld_data.csv" if USE_GRIDWORLD else "data.csv"
path = os.path.join("data", data_file)

# gamma is forwarded to the loader; what it is used for there is not visible here.
histories = GetHistories(path, gamma)
# random.shuffle(histories)

line 0
line 1000000
line 2000000
line 3000000
line 4000000
line 5000000
line 6000000
line 7000000
line 8000000
line 9000000
line 10000000
line 11000000
line 12000000
line 13000000
line 14000000
line 15000000
line 16000000
line 17000000
line 18000000
line 19000000
line 20000000
line 21000000
line 22000000
line 23000000
line 24000000
line 25000000
line 26000000
line 27000000
line 28000000
line 29000000
line 30000000
line 31000000
line 32000000
line 33000000
line 34000000
line 35000000
line 36000000
line 37000000
line 38000000
line 39000000
line 40000000
line 41000000
line 42000000
line 43000000
line 44000000
line 45000000
line 46000000
line 47000000
line 48000000
line 49000000
line 50000000
line 51000000
line 52000000
line 53000000
line 54000000
line 55000000
line 56000000
line 57000000
line 58000000
line 59000000
line 60000000
line 61000000
line 62000000


In [6]:
# Partition the loaded histories into a train part and a test part; both are
# handed to ConfirmBounds later. (The recorded run printed 800000 and 200000.)
train, test = SplitData(histories)

800000
200000


In [7]:
# Average return over all loaded histories (train and test together); used
# below as the baseline input to GetTargetPerformance.
avg_exploratory_J = GetAverageReturn(histories)

Average Baseline Return : 1.172600237817154


In [8]:
# Performance threshold that candidate policies are checked against.
# NOTE(review): how the environment flag, baseline and percent_increase are
# combined is defined in GetTargetPerformance and is not visible here; the
# recorded value (1.556907) is not simply baseline * (1 + percent_increase).
target_performance = GetTargetPerformance(USE_GRIDWORLD, avg_exploratory_J, percent_increase)

Target Performance : 1.556907


In [9]:
# Build the exploration policy table from the training histories. The printed
# result has one row per state and one column per action.
# NOTE(review): the meaning of the trailing 1000 argument is defined in
# GetPolicy and is not visible here -- confirm before changing it.
exploration_policy = GetPolicy(train, num_states, num_actions, 1000)
print(exploration_policy)

[[0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.    0.    0.    0.   ]
 [0.25  0.25  0.25  0.25 ]
 [0.375 0.375 0.125 0.125]]


In [10]:
# Choose the primary estimator (ISFunc) and the secondary one (OtherFunc).
# OtherFunc is only consulted when USE_BOTH_PDIS_AND_IS is enabled.
if USE_PDIS:
    ISFunc, OtherFunc = PDImportanceSampling, ImportanceSampling
else:
    ISFunc, OtherFunc = ImportanceSampling, PDImportanceSampling

In [11]:
# Restore the saved CMA-ES object that candidate policies are sampled from.
# The file is opened in a `with` block so the handle is closed right after
# loading (the previous bare open() call was never closed).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted run of this project.
with open(es_path, "rb") as es_file:
    es = pickle.load(es_file)

# Squared norm of the current distribution mean, then the seed stored in the
# optimiser's options.
print("ES_Convergence : " + str(sum(es.mean**2)))
print(es.opts["seed"])

ES_Convergence : 195.62034246757986
595346


In [12]:
def policy_softmax(policy):
    """Convert a 2-D table of action preferences into row-wise probabilities.

    Each row of ``policy`` is passed through a softmax, so every row of the
    result is non-negative and sums to 1.

    Parameters
    ----------
    policy : array-like, shape (rows, columns)
        Unnormalised preferences, one row per state.

    Returns
    -------
    numpy.ndarray
        Array of the same shape holding the softmax of each row.
    """
    logits = np.asarray(policy, dtype=float)
    # Shift each row by its maximum before exponentiating. The softmax is
    # unchanged by a per-row constant shift, but without it large values
    # overflow np.exp to inf and the row becomes inf / inf = nan.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    numerators = np.exp(shifted)
    return numerators / np.sum(numerators, axis=1, keepdims=True)

In [None]:
# Draw candidate policies from the loaded CMA-ES object until `num_policies`
# of them are accepted. A candidate is accepted when every step inside the
# `try` completes: the bound check(s), the simulated return, and the
# write/read-back of the solution file. Any exception rejects the candidate
# and a new one is drawn for the same index.
avg_returns = 0
new_policy = None
theoretical_fails = 0
actual_fails = 0
i = 0
while i < num_policies:
    # Records which stage a rejection happened in, for the counters below.
    passed_theoretical = False
    try:
        print("~x~~x~~~x~~~~~x~~~~~x~~~x~~~~~x~~~~x~~")
        print(i)
        solution = es.ask(1)[0].reshape(num_states, num_actions)
        # Give state 15 identical preferences so its softmax row is uniform.
        # NOTE(review): 15 is a hard-coded state index -- confirm it is the
        # intended state for both environments.
        solution[15,:] = 1
        new_policy = policy_softmax(solution)
        # Presumably raises when the candidate does not meet the bound; the
        # except branch below relies on that -- confirm in lib.
        ConfirmBounds(False, target_performance, train, test, exploration_policy, gamma, new_policy, ISFunc, delta)
        if (USE_BOTH_PDIS_AND_IS):
            ConfirmBounds(False, target_performance, train, test, exploration_policy, gamma, new_policy, OtherFunc, delta)
        passed_theoretical = True
        #policy passes theoretic check

        # Evaluate the candidate in the matching simulator.
        if(USE_GRIDWORLD):
            actual_J = GetGridworldReturn(new_policy, gamma, 100000)
        else:
            actual_J = GetMockworldReturn(new_policy, gamma, 100000)

        print("TEST RUN " + str(actual_J))
        print(actual_J > target_performance)
        # The check against the simulated return is currently disabled:
#         ConfirmBounds(True, actual_J, train, test, exploration_policy, gamma, new_policy, ISFunc, delta)
#         if (USE_BOTH_PDIS_AND_IS):
#             ConfirmBounds(True, actual_J, train, test, exploration_policy, gamma, new_policy, OtherFunc, delta)
        #policy passes actual check
        path = "text_policies\\" + sub_dir + "policy" + str(i + title_offset) + ".txt"
        WriteSolutionToTxt(path, solution)
        loaded_solution = LoadSolutionFromTxt(path, num_states, num_actions)
        # Compare element-wise: a sum of differences can be zero even when
        # individual entries differ, because errors of opposite sign cancel.
        assert np.array_equal(loaded_solution, solution), "saved policy does not round-trip"

        avg_returns += actual_J
        i += 1
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop this retry loop instead of counting as a fail.
        print("___F A I L___")
        if(passed_theoretical):
            actual_fails += 1
        else:
            theoretical_fails += 1

avg_returns /= num_policies
print("Number of Actual Fails : " + str(actual_fails))
print("Number of Theoretical Fails : " + str(theoretical_fails))
print("Average Return : " + str(avg_returns))
print("Target : " + str(target_performance))
print("Confidence : " + str(delta))

~x~~x~~~x~~~~~x~~~~~x~~~x~~~~~x~~~~x~~
0
Value: 1.556907
Predicted Baseline: 0.15721052273159497
Safety Baseline: 4.9207815007565365
---distance of return from lower bounds ---
Looseness Of Prediction : -1.399696477268405
Looseness Of Safety : 3.3638745007565367
___F A I L___
~x~~x~~~x~~~~~x~~~~~x~~~x~~~~~x~~~~x~~
0
Value: 1.556907
Predicted Baseline: 3.3713282720046225
Safety Baseline: 6.848367078845024
---distance of return from lower bounds ---
Looseness Of Prediction : 1.8144212720046224
Looseness Of Safety : 5.291460078845025
-------------------------------------------
TEST RUN 6.9997477829890675
True
~x~~x~~~x~~~~~x~~~~~x~~~x~~~~~x~~~~x~~
1
Value: 1.556907
Predicted Baseline: 2.192140550656097
Safety Baseline: 5.640655663412589
---distance of return from lower bounds ---
Looseness Of Prediction : 0.6352335506560969
Looseness Of Safety : 4.083748663412589
-------------------------------------------
TEST RUN 7.216595202675599
True
~x~~x~~~x~~~~~x~~~~~x~~~x~~~~~x~~~~x~~
2
Value: 1.5

In [None]:
# Greedy action per state: the column index of the largest probability in
# each row of the most recently sampled policy.
greed_policy = np.argmax(new_policy, axis=1)

if not USE_GRIDWORLD:
    # Show the last two states on their own, then the rest as a 4x4 grid.
    trailing_states = greed_policy[-2:]
    print(trailing_states)
    grid_states = greed_policy[:-2]
    print(grid_states.reshape(4, 4))
else:
    # Pad with -1 before positions 12 and 16 so the 23 states fill a 5x5 grid.
    padded = np.insert(greed_policy, [12, 16], [-1, -1])
    print(padded.reshape((5, 5)))