# Tabular Q-Learning in Python
Code for the 4x4 configuration, with incrementality.

In [315]:
import numpy as np
import random

In [316]:
num_episodes = 30000
eta = 0.8
gamma = 0.95

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.0005

In [317]:
vocab_size = 4
num_attributes = 4
num_actions = 2
num_guesses = 2

# Turn these off to freeze policies:
a_updates = True
q_updates = False

In [318]:
# A-State: [att * att, vocab * vocab] (+1 for empty vocab)
# Q-State: [vocab * vocab, att * att] (+1 for empty attribute)
a_table = np.zeros((num_attributes, num_attributes, vocab_size+1, vocab_size))
q_table = np.zeros((vocab_size, vocab_size, num_attributes+1, num_attributes+1))

a_visited = np.zeros((num_attributes, num_attributes, vocab_size+1), dtype=bool)
q_visited = np.zeros((vocab_size, vocab_size), dtype=bool)

num_correct = 0

## Accuracy Measurements

In [319]:
final_a_states = []
num_exploits = 0

In [320]:
def groundedness(a_states):
    cooccurences = np.zeros((num_attributes, vocab_size))
    for a_state in a_states:
        cooccurences[a_state[0]][a_state[2]] += 1
        cooccurences[a_state[0]][a_state[3]] += 1
        cooccurences[a_state[1]][a_state[2]] += 1
        cooccurences[a_state[1]][a_state[3]] += 1
        
    sum = 0
    for att in range(num_attributes):
        sum += np.max(cooccurences[att, :])

    sum /= (len(a_states)*num_attributes)
    print("Groundedness: ", sum)

## Q-Learning

In [321]:
def get_reward(a_state, q_state):
    if a_state[0:2] == q_state[2:4]:
        return 1
    else:
        return -10

In [322]:
total_rewards = 0
for episode in range(num_episodes):
#     println(episode)
    # Generate random attributes:
    tradeoff = random.random() # exploration-exploitation
    explore = (tradeoff < epsilon)
    
    a_state = [random.randint(0,3), random.randint(0,3), vocab_size, vocab_size]
    # A-Bot:
    for action in range(num_actions):
        if explore or ~a_visited[a_state[0], a_state[1], a_state[2]]:
            # Explore:
#             print("Exploring...")
            word = random.randint(0, vocab_size-1)
            a_state[action+2] = word
        else:
            # Exploit:
#             print("Exploiting...")
            options = a_table[a_state[0], a_state[1], a_state[2], :]
            word = np.argmax(options)
            a_state[action+2] = word
    
    
    # Q-Bot:
    q_state = [a_state[2], a_state[3], num_attributes, num_attributes]
    if q_visited[q_state[0], q_state[1]]:
        best_guess = (-999, -999)
        best_guess_score = -999999
        for i in range(num_attributes):
            for j in range(num_attributes):
                score = q_table[q_state[0], q_state[1], i, j]
                if score > best_guess_score:
                    best_guess = (i,j)
                    best_guess_score = score
        q_state[2] = best_guess[0]
        q_state[3] = best_guess[1]
    else:
        q_state[2] = random.randint(0,3)
        q_state[3] = random.randint(0,3)
    
    
#     print(a_state)
#     print(q_state)
    reward = get_reward(a_state, q_state)
#     print(reward)
    
    if a_updates:
        a_visited[a_state[0], a_state[1], vocab_size] = True 
        a_table[a_state[0],a_state[1], vocab_size, a_state[2]] += reward

        a_visited[a_state[0], a_state[1], a_state[2]] = True 
        a_table[a_state[0],a_state[1],a_state[2],a_state[3]] += reward
    
    if q_updates:
        q_visited[q_state[0], q_state[1]] = True
        q_table[q_state[0], q_state[1], q_state[2], q_state[3]] += reward
    
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    if (episode % 200 == 0):
        q_updates = ~q_updates
        a_updates = ~a_updates
    if (not explore) and (episode > 15000):
        final_a_states.append(a_state)
        total_rewards += reward
        num_exploits += 1
        
groundedness(final_a_states)
print("Accuracy: ", total_rewards/num_exploits)

Groundedness:  0.2566994344196068
Accuracy:  -7.087799622946404
