In [5]:
import  gym
import numpy as np
import time
from __future__ import print_function
import tensorflow as tf

In [7]:
def epsilon_greedy(Q, s, na):
    epsilon = 0.3
    p = np.random.uniform(low=0, high=1)
    
    if p > epsilon:
        return np.argmax(Q[s,:])
    else:
        return env.action_space.sample()

Some notes about FrozenLake
https://gym.openai.com/envs/FrozenLake-v0/

Environment is described as a grid of characters e.g

SFFF
FHFH
FFFH
HFFG

S: starting point
F: Frozen lake, safe to navigate but there is a chance that the agent will slide and go in the wrong direction
H: hole - agent dies if it falls here
G: goal - where the agent wants to end up to win

In [19]:
#Setup environment
env = gym.make('FrozenLake-v0')
s = env.reset()
print("initial state: " , s)
print()

env.render()

# Other environments can have different shapes for the observation space e.g (the screen or RAM)
# which has very interesting implications in AI research
# However, if you're making AI for a video game and you can control the action and observation space yourself
# this technique will work just fine
print("Number of actions: ", env.action_space.n )
print("Number of states: ", env.observation_space.n)


[2018-10-13 20:16:35,085] Making new env: FrozenLake-v0


('initial state: ', 0)
()
[41mS[0mFFF
FHFH
FFFH
HFFG

('Number of actions: ', 4)
('Number of states: ', 16)


In [20]:
# Q learning

Q = np.zeros([env.observation_space.n, env.action_space.n])

lr = 0.5 # how quickly you learn
y = 0.9 # discount factor
eps = 10000 # total number of iterations

for i in range(eps):
    s = env.reset()
    t = False
    while True:
        a = epsilon_greedy(Q, s, env.action_space.n)
        # s = state the agent is in
        # r = the reward the agent received
        # t = whether the game is over or not
        s_, r, t, _ = env.step(a)
        if(r == 0):
            if t == True:
                r = -5
                Q[s_] = np.ones(env.action_space.n) * r
            else:
                r = -1 #punish long routes
        if(r==1):
            r = 100 #found the goal
            Q[s_] = np.ones(env.action_space.n) * r
        
        # Weighted average between the old estimate and new estimate of a state action pair
        Q[s,a] = Q[s,a] + lr * (r + y * np.max(Q[s_,a]) - Q[s,a])
        s = s_
        if(t == True):
            break
            

In [21]:
print(Q)

[[  -9.852764     -9.68111881   -9.67810808   -9.99999995]
 [  -9.72523117   -9.68251325   -9.65632413   -9.99999995]
 [  -9.61961312   -9.64961979   -9.640255     -9.99999994]
 [  -9.6338988    -9.52499786   -9.61882611   -9.99999992]
 [  -9.79882065   -9.53937083   -9.66187623   -9.94932264]
 [  -5.           -5.           -5.           -5.        ]
 [  -9.62758867   -9.2106401    -9.33185493   -9.71877684]
 [  -5.           -5.           -5.           -5.        ]
 [  -9.67057912   -9.49487724   -9.56714894   -9.79917382]
 [  -9.60187359   -8.0280717    -8.5707085    -9.60310979]
 [  -9.71266298   47.72740638   -5.56933513   -9.63672357]
 [  -5.           -5.           -5.           -5.        ]
 [  -5.           -5.           -5.           -5.        ]
 [  -9.60897513   -8.55934243   -4.60149797   -8.04984241]
 [  -9.71820133   51.05729801    1.14013016   -6.47730933]
 [ 100.          100.          100.          100.        ]]


In [22]:
s = env.reset()
env.render()
while(True):
    a = np.argmax(Q[s])
    s_,r,t,_ = env.step(a)
    print("============")
    env.render()
    s = s_
    if(t==True):
        break

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Down)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFF[41mH[0m
HFFG
  (Down)


Deep Q learning
Your game probably has a much larger action and observation space than FrozenLake
Main idea of deep Q learning is to approximate the Q matrix using a neural network which is why we'll be looking at it
in the next tutorial