# Implementing reinforcement learning

### Imports

In [1]:
import numpy as np


### Initialize Reward matrix R, learning matrix Q and gamma

In [2]:
# Reward table: a positive entry R[s, a] marks action a as available in
# state s (see possible_actions); the single large value sits at R[2, 2].
R = np.matrix([
    [0, 0,   0, 0, 1, 0],
    [0, 0,   0, 1, 0, 1],
    [0, 0, 100, 1, 0, 0],
    [0, 1,   1, 0, 1, 0],
    [1, 0,   0, 1, 0, 0],
    [0, 1,   0, 0, 0, 0],
])

# Learning table: same shape as R, starts out all zeros.
Q = np.matrix(np.zeros(R.shape))

# Discount factor applied to the best value of the next state.
gamma = 0.8

### 1. Act

In [42]:
# Example state used to demonstrate a single act/reward step.
agent_s_state = 1

def possible_actions(state):
    """Return the indices of the positive entries in row ``state`` of ``R``.

    Reads the module-level reward matrix ``R``. Because ``R`` is an
    ``np.matrix``, selecting a row gives a 1 x N result, so the second index
    array returned by ``np.where`` holds the column (action) indices.

    Example: with the ``R`` defined in this notebook, ``possible_actions(1)``
    gives ``[3 5]`` because row 1 is ``[[0 0 0 1 0 1]]``.
    """
    positive_mask = R[state, :] > 0
    _, action_indices = np.where(positive_mask)
    return action_indices

# Actions available from the example state (columns of R with a positive entry).
PossibleAction = possible_actions(agent_s_state)
# Example: print(PossibleAction) shows [3 5] for state 1.

def ActionChoice(available_actions_range):
    """Pick one action uniformly at random from the given candidates.

    Parameters
    ----------
    available_actions_range : array-like of int
        Candidate action indices, e.g. the result of ``possible_actions``.

    Returns
    -------
    int
        One element of ``available_actions_range``.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when the candidate list is empty.
    """
    # Sample from the argument, not from a module-level variable, so the
    # result always matches what the caller passed in. Drawing without
    # ``size`` yields a scalar, so no 1-element array is converted by int().
    next_action = int(np.random.choice(available_actions_range))
    return next_action

# Draw one of the available actions at random.
action = ActionChoice(PossibleAction)
# Example: print(action) shows 3 or 5, the two possible actions in state 1.

### 2. Reward

In [43]:
def reward(current_state, action, gamma):
    """Apply one Q-learning update to the module-level ``Q`` table in place.

    Sets ``Q[current_state, action]`` to
    ``R[current_state, action] + gamma * max(Q[action, :])``.
    The chosen action index is also used as the row of ``Q`` that is
    searched for the best follow-up value.

    Parameters
    ----------
    current_state : int
        Row index of the state the agent is in.
    action : int
        Column index of the action taken from ``current_state``.
    gamma : float
        Discount factor applied to the best follow-up value.

    Returns
    -------
    None
        The result is written into ``Q``.
    """
    next_row = Q[action,]
    # Column indices holding the row maximum. While Q is still all zeros
    # every column ties, e.g. [0 1 2 3 4 5].
    best_columns = np.where(next_row == np.max(next_row))[1]

    if best_columns.shape[0] > 1:
        # Break ties at random. Drawing without ``size`` returns a scalar,
        # which avoids int() on a 1-element array (deprecated in NumPy 1.25).
        best_column = int(np.random.choice(best_columns))
    else:
        best_column = int(best_columns[0])

    # 0.0 on the very first update, because Q starts out filled with zeros.
    max_value = Q[action, best_column]

    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Apply a single update for the example state/action pair.
reward(agent_s_state, action, gamma)

### 3. Repeat

In [49]:
# Number of independent single-step updates to run.
N_TRAINING_STEPS = 50000

# The number of states does not change during training, so read it once.
n_states = Q.shape[0]

for i in range(N_TRAINING_STEPS):
    # Start at a random location
    current_state = np.random.randint(0, n_states)

    # Get the possible actions
    PossibleAction = possible_actions(current_state)

    # Choose an action
    action = ActionChoice(PossibleAction)

    # Reward
    reward(current_state, action, gamma)

    # Example: Q after the first 2 actions and rewards of one run
    # (insert `print(Q)` and `if i == 1: break` here to reproduce).
    #
    # [[0.  0.  0.  0.  0.  0. ]
    #  [0.  0.  0.  1.  0.  1. ]
    #  [0.  0.  0.  1.  0.  0. ]
    #  [0.  1.8 0.  0.  0.  0. ]
    #  [1.  0.  0.  0.  0.  0. ]
    #  [0.  0.  0.  0.  0.  0. ]]
    #
    # [[0.   0.   0.   0.   0.   0.  ]
    #  [0.   0.   0.   1.   0.   1.  ]
    #  [0.   0.   0.   2.44 0.   0.  ]
    #  [0.   1.8  0.   0.   0.   0.  ]
    #  [1.   0.   0.   0.   0.   0.  ]
    #  [0.   0.   0.   0.   0.   0.  ]]
    

In [59]:
# Reduce both tables to 0/1 masks: 1 wherever the entry is positive.
q_mask = np.where(Q > 0, 1, 0)
r_mask = np.where(R > 0, 1, 0)

# Each report section is a heading, the table, and a blank line.
report_sections = (
    ("Latest Q:", Q),
    ("Normed Q:", Q/np.max(Q)*100),
    ("Binary Q (value 100 is also 1):", q_mask),
    ("What it should look like (R):", R),
)
for heading, table in report_sections:
    print(heading)
    print(table)
    print()

# Training is judged by whether Q is positive exactly where R is.
masks_match = np.array_equal(q_mask, r_mask)
print("Binary Q and Binary R are equal: {}".format(masks_match))

Latest Q:
[[  0.      0.      0.      0.    258.44    0.   ]
 [  0.      0.      0.    321.8     0.    207.752]
 [  0.      0.    500.    321.8     0.      0.   ]
 [  0.    258.44  401.      0.    258.44    0.   ]
 [207.752   0.      0.    321.8     0.      0.   ]
 [  0.    258.44    0.      0.      0.      0.   ]]

Normed Q:
[[  0.       0.       0.       0.      51.688    0.    ]
 [  0.       0.       0.      64.36     0.      41.5504]
 [  0.       0.     100.      64.36     0.       0.    ]
 [  0.      51.688   80.2      0.      51.688    0.    ]
 [ 41.5504   0.       0.      64.36     0.       0.    ]
 [  0.      51.688    0.       0.       0.       0.    ]]

Binary Q (value 100 is also 1):
[[0 0 0 0 1 0]
 [0 0 0 1 0 1]
 [0 0 1 1 0 0]
 [0 1 1 0 1 0]
 [1 0 0 1 0 0]
 [0 1 0 0 0 0]]

What it should look like (R):
[[  0   0   0   0   1   0]
 [  0   0   0   1   0   1]
 [  0   0 100   1   0   0]
 [  0   1   1   0   1   0]
 [  1   0   0   1   0   0]
 [  0   1   0   0   0   0]]

Binary Q and Binary R are equal: True

### Success

Cool, the binary Q and R are identical: Q has learned a positive value for exactly the state/action pairs that R allows. Training finished successfully!