# Tabular Q-Learning
Attribute-value reference game environment (similar to [Kottur, et al.](https://arxiv.org/abs/1706.08502)) with tabular Q-learning.

### Parameters

In [214]:
num_episodes = 30000
eta = 0.8
gamma = 0.95

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005;

In [215]:
vocab_size = 8
num_attributes = 4
num_actions = 2
num_guesses = 2;

In [216]:
# A-State: [att * att, vocab * vocab] (+1 for empty vocab)
# Q-State: [vocab * vocab, att * att] (+1 for empty attribute)
a_table = zeros(num_attributes, num_attributes, vocab_size+1, vocab_size+1, vocab_size)
q_table = zeros(vocab_size, vocab_size, num_attributes+1, num_attributes+1, num_attributes)

a_visited = falses(num_attributes, num_attributes, vocab_size+1, vocab_size+1)
q_visited = falses(vocab_size, vocab_size, num_attributes+1, num_attributes+1)

num_correct = 0;

### Q-Learning
Iterate over episodes and update the Q-tables accordingly:

 * **A-state** - contains randomly selected attributes and previously uttered vocabulary items
 * **Q-state** - contains heard vocabulary items

The goal is for A-bot to communicate its observed attributes to Q-bot. Loosely based on [FreeCodeCamp tutorial](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb).

In [217]:
function get_reward(a_state, q_state)
    if a_state[1:2] == q_state[3:end]
        return 1
#     elseif a_state[1] == q_state[3]
#         return 0.5
#     elseif a_state[2] == q_state[4]
#         return 0.5 
    else
        return -1
    end
end

get_reward (generic function with 1 method)

In [218]:
total_rewards = 0
for episode in 1:num_episodes
#     println(episode)
    # Generate random attributes:
    
    a_state = [rand(1:4), rand(1:4), vocab_size+1, vocab_size+1]
    a_states = [copy(a_state)]
    # A-Bot:
    for action in 1:num_actions
        tradeoff = rand() # exploration-exploitation
        if (tradeoff < epsilon) | ~a_visited[a_state...]
            # Explore:
            word = rand(1:vocab_size)
            a_state[action+2] = word
        else
            # Exploit:
#             println("Exploiting A-bot...")
            options = a_table[a_state[1], a_state[2], a_state[3], a_state[4], :]
#             print(options)
            word = argmax(options)
            a_state[action+2] = word
        end
        push!(a_states, copy(a_state))
    end
        
    # Q-Bot:
    q_state = [a_state[3], a_state[4], num_attributes+1, num_attributes+1]
    q_states = [copy(q_state)]
    for guess in 1:num_guesses
        tradeoff = rand() # exploration-exploitation
        if (tradeoff < epsilon) | ~q_visited[q_state...]
            # Explore:
            att = rand(1:num_attributes)
            q_state[guess+2] = att
        else
            # Exploit:
#             println("Exploiting Q-bot...")
            options = q_table[q_state[1], q_state[2], q_state[3], q_state[4], :]
            att = argmax(options)
            q_state[guess+2] = att
        end
        push!(q_states, copy(q_state))
    end
    # Rewards and table updates:
#     println(a_state,q_state)
    reward = get_reward(a_state, q_state)
    for idx in 1:3
        state = a_states[idx]
        a_visited[state...] = true
        if idx < 3
            a_table[state[1],state[2],state[3],state[4], a_state[idx+2]] += reward
        end
    end
    for idx in 1:3
        state = q_states[idx]
        q_visited[state...] = true 
        if idx < 3
            q_table[state[1],state[2],state[3],state[4], q_state[idx+2]] += reward
        end
    end
    if episode > 15000
        total_rewards += reward
    end
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*exp(-decay_rate*episode) 
end
println("Accuracy: ", total_rewards/num_episodes)

Accuracy: 0.46686666666666665
