In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import torch
from scipy import stats
from lib.DataManager import *
from lib.PolicyStats import *
from lib.Gridworld import *
import os
import cma
from cma.constraints_handler import AugmentedLagrangian, PopulationEvaluator
from IPython import display
import pickle

In [3]:
#---SET PARAMS---
# Experiment switches: USE_GRIDWORLD selects the gridworld dataset and state
# count; USE_PDIS selects PDImportanceSampling over ImportanceSampling.
USE_GRIDWORLD = True
USE_PDIS = True
percent_increase = 0.1  # forwarded to GetTargetPerformance together with the baseline return
num_policies = 100      # number of accepted policies the evaluation loop collects

# 23 states when the gridworld is used, 18 otherwise.
num_states = 23 if USE_GRIDWORLD else 18
num_actions = 4
gamma = 0.95  # presumably the return discount factor — confirm against lib.*
delta = 0.01  # confidence level is 1 - delta

In [4]:
# Pick the CSV that matches the configured environment.
# The path is assembled with os.path.join rather than a "data\..." literal:
# "\d" and "\g" are not valid string escapes (Python warns on them), and a
# hard-coded backslash separator only resolves on Windows.
path = os.path.join("data", "data.csv")
if USE_GRIDWORLD:
    path = os.path.join("data", "gridworld_data.csv")

# Load the recorded histories; gamma is forwarded to the loader.
histories = GetHistories(path, gamma)

line 0
line 1000000
line 2000000
line 3000000
line 4000000
line 5000000
line 6000000
line 7000000


In [5]:
# Partition the loaded histories into two subsets bound to `train` and `test`.
# NOTE(review): the split proportions and any shuffling are decided inside
# SplitData (lib.*); the recorded output (80000 / 20000) presumably shows the
# resulting sizes — confirm.
train, test = SplitData(histories)

80000
20000


In [6]:
# Baseline value computed over *all* loaded histories (not just the train
# split); it is the reference that GetTargetPerformance receives below.
# Presumably the mean return of the data-generating policy — confirm in lib.*.
avg_exploratory_J = GetAverageReturn(histories)

Average Baseline Return : -0.9422074904303682


In [7]:
# Threshold that candidate policies are checked against in the evaluation
# loop, derived from the baseline return and percent_increase.
# NOTE(review): the exact formula lives in GetTargetPerformance — confirm there.
target_performance = GetTargetPerformance(USE_GRIDWORLD, avg_exploratory_J, percent_increase)

Target Performance : -0.8479867413873314


In [8]:
# Build the behaviour ("exploration") policy from the training histories.
# Presumably a (num_states x num_actions) table of action probabilities —
# the recorded output is a 23 x 4 array of 0.25 — confirm.
# NOTE(review): the meaning of the literal 1000 is defined by GetPolicy — confirm.
exploration_policy = GetPolicy(train, num_states, num_actions, 1000)
print(exploration_policy)

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


In [9]:
# Choose the importance-sampling estimator handed to ConfirmBounds:
# PDImportanceSampling when USE_PDIS is set, plain ImportanceSampling otherwise.
ISFunc = PDImportanceSampling if USE_PDIS else ImportanceSampling

In [10]:
# Restore the saved CMA-ES state. The file is opened in a `with` block so the
# handle is closed deterministically instead of being left to the garbage
# collector.
# SECURITY: pickle.load executes arbitrary code embedded in the file — only
# load pickles produced by this project.
# NOTE(review): the file name looks tied to the gridworld / PDIS / 10% setup —
# confirm it still matches the parameter cell when those flags change.
with open("pickles//saved-cma-gw-pdis-10.pkl", "rb") as pickle_file:
    es = pickle.load(pickle_file)

# Squared L2 norm of the current distribution mean.
print("ES_Convergence : " + str(sum(es.mean**2)))

ES_Convergence : 51.789221010893534


In [11]:
def policy_softmax(policy):
    """Convert a 2-D table of unnormalised preferences into row-wise probabilities.

    Each row of `policy` is passed through a softmax, so every row of the
    result is non-negative and sums to 1.

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged (the factor cancels in the ratio) but
    keeps np.exp from overflowing to inf — and the division from producing
    NaN — when the parameters are large.

    Parameters
    ----------
    policy : array-like, shape (rows, columns)
        Unnormalised preferences, one row per state.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with softmax applied along axis 1.
    """
    logits = np.asarray(policy)
    # Shift each row so its largest entry is 0; exp() then stays within (0, 1].
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    numerators = np.exp(shifted)
    return numerators / np.sum(numerators, axis=1, keepdims=True)

In [None]:
# Draw candidate policies from the restored CMA-ES state until `num_policies`
# of them pass both ConfirmBounds checks, counting failures by the stage at
# which they occurred.
avg_returns = 0          # running sum of accepted returns; divided after the loop
new_policy = None        # most recently sampled policy (read by the display cell)
theoretical_fails = 0    # failures before the first ConfirmBounds call completed
actual_fails = 0         # failures after the first check had passed
i = 0                    # number of accepted policies so far
while i < num_policies:
    passed_theoretical = False
    try:
        print("~x~~x~~~x~~~~~x~~~~~x~~~x~~~~~x~~~~x~~")
        print(i)
        # One sample from the search distribution, viewed as a state x action table.
        solution = es.ask(1)[0].reshape(num_states, num_actions)
        new_policy = policy_softmax(solution)

        # First check: bounds against the target performance.
        ConfirmBounds(False, target_performance, train, test, exploration_policy, gamma, new_policy, ISFunc, delta)
        passed_theoretical = True

        # Second check: bounds against the return obtained from GetGridworldReturn.
        # NOTE(review): the literal 100000 is presumably an episode count — confirm.
        actual_J = GetGridworldReturn(new_policy, gamma, 100000)
        ConfirmBounds(True, actual_J, train, test, exploration_policy, gamma, new_policy, ISFunc, delta)

        avg_returns += actual_J
        i += 1
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit still stop the loop. The error is printed so a genuine
        # bound failure can be told apart from a programming error.
        print("___F A I L___")
        print(type(err).__name__ + ": " + str(err))
        if passed_theoretical:
            actual_fails += 1
        else:
            theoretical_fails += 1

avg_returns /= num_policies
print("Number of Actual Fails : " + str(actual_fails))
print("Number of Theoretical Fails : " + str(theoretical_fails))
print("Average Return : " + str(avg_returns))
print("Target : " + str(target_performance))
# The confidence level is 1 - delta (see the parameter cell), not delta itself.
print("Confidence : " + str(1 - delta))

~x~~x~~~x~~~~~x~~~~~x~~~x~~~~~x~~~~x~~
0
Value: -0.8479867413873314
Predicted Baseline: -0.07994074900435577
Safety Baseline: 0.28177145971321677
---distance of return from lower bounds ---
Looseness Of Prediction : 0.7680459923829757
Looseness Of Safety : 1.1297582011005483
-------------------------------------------
Value: 0.720503259319134
Predicted Baseline: -0.07994074900435577
Safety Baseline: 0.28177145971321677
---distance of return from lower bounds ---
Looseness Of Prediction : 0.8004440083234897
Looseness Of Safety : 0.4387317996059172
-------------------------------------------
~x~~x~~~x~~~~~x~~~~~x~~~x~~~~~x~~~~x~~
1
Value: -0.8479867413873314
Predicted Baseline: -0.6931744427649585
Safety Baseline: 0.07241729651554352
---distance of return from lower bounds ---
Looseness Of Prediction : 0.1548122986223729
Looseness Of Safety : 0.920404037902875
-------------------------------------------
Value: 0.678129281388419
Predicted Baseline: -0.6931744427649585
Safety Baseline: 0.0

In [None]:
# Index of the highest-probability action in each state (row) of new_policy.
greed_policy = np.argmax(new_policy, axis=1)

# Gridworld: two -1 placeholders are inserted (before original indices 12 and
# 16) so the 23 entries fill a 5 x 5 grid — presumably marking cells that are
# not states; confirm against lib.Gridworld. Otherwise show six rows.
print(
    np.insert(greed_policy, [12, 16], [-1, -1]).reshape((5, 5))
    if USE_GRIDWORLD
    else greed_policy.reshape(6, -1)
)