# Tabular Q-Learning
Attribute-value reference game environment (similar to [Kottur, et al.](https://arxiv.org/abs/1706.08502)) with tabular Q-learning.

### Parameters

In [1]:
num_episodes = 15000
learning_rate = 0.8
gamma = 0.95

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005;

In [2]:
vocab_size = 8
num_attributes = 4
num_actions = 2
num_guesses = 2;

In [3]:
a_table = zeros(num_attributes, num_attributes, vocab_size+1, vocab_size+1, vocab_size)
q_table = zeros(vocab_size+1, vocab_size+1, num_attributes+1, num_attributes+1, num_attributes)

a_visited = falses(num_attributes, num_attributes, vocab_size+1, vocab_size+1)
q_visited = falses(vocab_size+1, vocab_size+1, num_attributes+1, num_attributes+1)

num_correct = 0;

### Q-Learning
Iterate over episodes and update the Q-tables accordingly:

 * **A-state** - contains randomly selected attributes and previously uttered vocabulary items
 * **Q-state** - contains heard vocabulary items

The goal is for A-bot to communicate its observed attributes to Q-bot. Loosely based on [FreeCodeCamp tutorial](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb).

In [9]:
function get_reward(a_state, q_state)
    if a_state[1:2] == q_state[3:end]
        return 1
    else
        return 0
    end
end

get_reward (generic function with 1 method)

In [20]:
total_rewards = 0
for episode in 1:num_episodes
    # Generate random attributes:
    a_state = [rand(1:4), rand(1:4), 1, 1]
    
    # A-Bot:
    for action in 1:num_actions
        tradeoff = rand() # exploration-exploitation
        if (tradeoff > epsilon) | ~a_visited[a_state...]
            # Explore:
            word = rand(1:vocab_size) + 1
            a_state[action+2] = word
        else
            # Exploit:
            options = a_table[a_state[1], a_state[2], a_state[3], a_state[4], :]
            word = argmax(options) + 1
            a_state[action+2] = word
        end
    end
        
    # Q-Bot:
    q_state = a_state[3:end]
    push!(q_state, 1)
    push!(q_state, 1)
    atts = []
    for guess in 1:num_guesses
        tradeoff = rand() # exploration-exploitation
        if (tradeoff > epsilon) | ~q_visited[q_state...]
            # Explore:
            att = rand(1:num_attributes)
            q_state[guess+2] = att
        else
            # Exploit:
            options = q_table[q_state[1], q_state[2], q_state[3], q_state[4], :]
            att = argmax(options) + 1
            q_state[guess+2] = att
        end
    end
    reward = get_reward(a_state, q_state)
    total_rewards += reward
end
print(total_rewards)

917