In [1]:
from optparse import OptionParser

import random
import pandas
import numpy

In [2]:
import matplotlib.pyplot as plt

In [3]:
import gym
import gym_hw

In [4]:
# Build the gridworld environment (the 'gridworld_hw-v0' id is presumably
# registered by the `gym_hw` import above -- TODO confirm).
env = gym.make('gridworld_hw-v0')
env.verbose = False # no graphics
# Default number of environment steps for q_learning's `lifetime` argument.
LIFETIME = 10**7
# Q-table dimensions: 6 states (the 3x2 grid indexed by fromcolrow) x 4 actions.
num_states = 6
num_actions = 4

In [5]:
def observation_to_state(obs):
    """Map an environment observation to a state index.

    The observation is already used as the state, so it is handed back
    unchanged.
    """
    state = obs
    return state
def fromcolrow(r, c):
    """Return the state index of grid cell (r, c), or None if outside the grid.

    Despite the name, the first argument is the row and the second the
    column. The 3x2 grid is numbered row by row: (0, 0) -> 0, (0, 1) -> 1,
    (1, 0) -> 2, (1, 1) -> 3, (2, 0) -> 4, (2, 1) -> 5.
    """
    # Cells listed in state order; the position in the tuple is the state index.
    cells = ((0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1))
    for index, (row, col) in enumerate(cells):
        if r == row and c == col:
            return index
    return None
def convert(a):
    """Translate an action index for display; currently the identity mapping."""
    converted = a
    return converted

# Review: equivalent to a safe controller's action choice

In [27]:
def review(state, a):
    """Return True when taking action `a` in `state` is allowed.

    Exactly one pair is rejected: action 1 in state 2. Every other
    combination is approved.
    """
    vetoed = state == 2 and a == 1
    return not vetoed

In [6]:
def epsilon_greedy(S, Q, epsilon):
    """Pick an action for state `S` with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action index is drawn;
    otherwise the index of the largest value in `Q[S]` is returned (the
    first one on ties, as `numpy.argmax` does).

    Parameters:
        S: state index used to select a row of `Q`.
        Q: table indexed as `Q[S]`, yielding one value per action.
        epsilon: exploration probability.

    Returns:
        The chosen action index.
    """
    action_values = Q[S]
    if numpy.random.random() < epsilon:
        # Size the random draw from the Q row itself so exploration always
        # covers exactly the actions the greedy branch can choose from,
        # instead of relying on a module-level action count.
        return numpy.random.choice(len(action_values))
    return numpy.argmax(action_values)

In [39]:
def q_learning(env, obs_to_state, alpha, gamma, epsilon, initial_q, lifetime=LIFETIME):
    """Run tabular Q-learning on `env` for `lifetime` environment steps.

    Parameters:
        env: environment exposing `reset()` and `step(action)`; element 0 of
            the step result is read as the observation and element 1 as
            the reward.
        obs_to_state: callable turning an observation into a row index of Q.
        alpha: learning rate.
        gamma: discount factor.
        epsilon: exploration probability passed to `epsilon_greedy`.
        initial_q: starting value for every entry of the Q table.
        lifetime: number of environment steps to take.

    Returns:
        A `(total_reward, Q)` tuple: the sum of all rewards received and the
        learned `(num_states, num_actions)` table.
    """
    # Q(s, a), every entry starting at initial_q.
    Q = numpy.full((num_states, num_actions), float(initial_q))
    steps_taken = 0
    first_observation = env.reset()
    # NOTE(review): G is created here but nothing in this function adds to
    # it, so believesBad only ever sees what initializeKb() returned.
    G = initializeKb()
    total_reward = 0
    state = obs_to_state(first_observation)
    while steps_taken < lifetime:
        a = epsilon_greedy(state, Q, epsilon)
        # A hard-coded alternative to the belief check is review(state, a).
        if believesBad(G, state, a):
            # Rejected action: draw again. The step counter is deliberately
            # not advanced, so a rejected draw costs no lifetime.
            continue
        step_result = env.step(a)
        new_state = obs_to_state(step_result[0])
        reward = step_result[1]
        total_reward += reward
        # max over a' of Q(S', a')
        best_next = -float("inf")
        for next_action in range(num_actions):
            candidate = Q[new_state, next_action]
            if candidate > best_next:
                best_next = candidate
        # Standard Q-learning update: move Q(S, A) towards the TD target.
        td_error = reward + gamma*best_next - Q[state, a]
        Q[state, a] = Q[state, a] + alpha * td_error
        state = new_state
        steps_taken += 1
    return total_reward, Q

In [29]:
def plot_policy(Q, algo, pi=None):
    """Draw the greedy policy of `Q` on the 3x2 grid and save it to policy.png.

    Parameters:
        Q: table indexed as `Q[state, action]`.
        algo: algorithm name, used as the prefix of the plot title.
        pi: unused; kept so existing callers keep working.
    """
    rows = 3
    cols = 2
    pi_arr = numpy.zeros((rows, cols), dtype=int)
    for r in range(rows):
        for c in range(cols):
            state = fromcolrow(r, c)
            # numpy.argmax always yields a valid index (first one on ties),
            # so a cell can never be left without an action.
            pi_arr[r, c] = convert(numpy.argmax(Q[state, :num_actions]))
    fig, ax = plt.subplots()
    # plt.get_cmap is used because matplotlib.cm.get_cmap no longer exists
    # in recent Matplotlib releases.
    mesh = ax.pcolormesh(pi_arr, cmap=plt.get_cmap('viridis', num_actions))
    fig.colorbar(mesh, ax=ax, ticks=range(num_actions), label="action")
    ax.set_xticks(range(cols))
    ax.set_yticks(range(rows))
    ax.set_title(algo + " Policy")
    fig.savefig("policy.png")

In [40]:
def plot_cumulative_reward(env, obs_to_state):
    """Plot total reward against lifetime and save it to cumulative+belief.png.

    For each experiment, `q_learning` is run from scratch once per lifetime
    in 1, 2, 4, ..., 2**18, and the total reward of each run is plotted
    against that lifetime.

    Parameters:
        env: environment forwarded to `q_learning`.
        obs_to_state: observation-to-state callable forwarded to `q_learning`.
    """
    # One tuple per experiment: (label, alpha, gamma, epsilon, initial_q).
    exps = [("Q-Learning",0.1,0.5,0.2,0)]
    fig = plt.figure()
    plt.axes()
    plt.title("Cumulative Reward given Lifetime")
    plt.xlabel("Lifetime")
    plt.ylabel("Cumulative Reward")
    for name, alpha, gamma, epsilon, initial_q in exps:
        lifetimes = [2**power for power in range(19)]
        rewards = []
        for life in lifetimes:
            # Only the total reward is plotted; the learned table is dropped.
            total, _ = q_learning(env, obs_to_state, alpha, gamma, epsilon, initial_q, life)
            rewards.append(total)
        curve_label = name + "- alpha: " + str(alpha) + " gamma: " + str(gamma) + " epsilon: "+ str(epsilon)
        plt.plot(lifetimes, rewards, label=curve_label)
    plt.legend(loc="lower right")
    fig.savefig("cumulative+belief.png", facecolor="white")

In [41]:
# Run the lifetime sweep on the environment built above, using observations
# directly as states, and write the resulting figure to disk.
plot_cumulative_reward(env, observation_to_state)

Note that when a review step is added, the algorithm converges faster, as is evident from comparing the cumulative reward of the two implementations.

# Knowledge Base as History

In [38]:
def initializeKb():
    """Create a fresh, empty knowledge base (a set)."""
    knowledge = set()
    return knowledge
def kb(G, s0, a, r):
    """Record the experience (s0, a, r) in knowledge base `G` and return `G`.

    `G` is modified in place; the same object is returned for convenience.
    """
    experience = (s0, a, r)
    G.add(experience)
    return G
def believesBad(G, s, a):
    """Return True when the rewards recorded for (s, a) in `G` sum below zero.

    Parameters:
        G: iterable of (state, action, reward) triples, as stored by `kb`.
        s: state to look up.
        a: action to look up.

    Returns:
        True if the recorded rewards for the pair sum to a negative number;
        False otherwise, including when nothing is recorded for the pair.
    """
    # Index 2 of each triple is the reward slot; summing an empty selection
    # gives 0, which is not negative.
    recorded_total = sum(p[2] for p in G if p[0] == s and p[1] == a)
    return bool(recorded_total < 0)