# Play FrozenLake manually

In [None]:
from gym.utils import play
import gym
# Create the environment and hand it to gym's interactive play helper.
env = gym.make("FrozenLake-v0")
try:
    play.play(env)
finally:
    # Release the environment even if play() raises or the cell is interrupted.
    env.close()

# Q-Network Learning

In [None]:
import importlib
import qlearning
importlib.reload(qlearning)
import gym
import numpy as np

In [None]:
# Create the FrozenLake environment and wrap it in the project's GamePlayer,
# whose train() and q_trained_action are used in later cells.
env = gym.make("FrozenLake-v0")
game = qlearning.GamePlayer(env)

In [None]:
# Start a fresh episode; as the last expression of the cell, the returned
# initial state is shown as the cell output.
env.reset()

In [None]:
# Take a single step with action 1, draw the board, and show what the
# environment returned (new state, reward, episode-finished flag).
state, reward, done, _ = env.step(1)
env.render()
print(state, reward, done)

In [None]:
# Redefine rewards
import types

# Save the unpatched step only once. Re-running this cell without the guard
# would store the already-patched method as the backup, and the patched
# step() would then call itself until it hits RecursionError.
if not hasattr(env, "step_backup"):
    env.step_backup = env.step
    
def step(self, action):
    """Step through the saved original ``step_backup`` and remap the reward.

    Reward mapping applied to the result of ``self.step_backup(action)``:

    * state 15 (presumably the goal tile of the 4x4 map)  -> 100
    * episode still running                               -> -1
    * episode finished with a reward of 0                 -> -20
    * anything else                                       -> reward unchanged

    Returns the same ``(state, reward, done, info)`` tuple shape as the
    wrapped call, with only ``reward`` possibly replaced.
    """
    state, reward, done, info = self.step_backup(action)

    if state == 15:
        return state, 100, done, info
    if not done:
        # Every non-terminal move costs one point.
        return state, -1, done, info
    if reward == 0:
        # The episode ended without any reward.
        return state, -20, done, info
    return state, reward, done, info

# Bind the reward-remapping function to this env instance so that env.step(...)
# now goes through it (and, via step_backup, through the original step).
env.step = types.MethodType(step, env)

In [None]:
# Settings handed to game.train(...) in the next cell. The meanings below are
# inferred from the names -- confirm against qlearning.GamePlayer.train.
total_episodes = 6000                # number of training episodes
alpha, gamma = 0.8, 0.1              # presumably learning rate and discount factor
epsilon, decay_rate = 0.9, 0.0005    # presumably exploration rate and its decay
N = 50                               # not referenced by any cell visible here

In [None]:
# Run the training with the settings defined above, then report the mean of
# the values it returned.
rewards = game.train(
    total_episodes,
    alpha,
    gamma,
    epsilon,
    decay_rate,
    logEvery=600,
)
print("Total reward average:", np.mean(rewards))

In [None]:
# Watch the trained agent play, using game.q_trained_action to choose actions.
# NOTE(review): the first argument is presumably the number of episodes to show -- confirm.
qlearning.visualize_computer_playing(1, env, game.q_trained_action)

### Load the environment

In [None]:
# Notebook inspired from: https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0
import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Build a fresh FrozenLake environment for the network experiment. This rebinds
# `env`, so the reward-remapping step() patch installed on the earlier instance
# does not apply to this one.
env = gym.make('FrozenLake-v0')
# Last expression of the cell: the notebook displays the observation space.
env.observation_space

## The Q-Network Approach

### Implementing the network itself

In [None]:
# Start from an empty default graph so that re-running the network cells
# below does not add duplicate ops to an existing graph.
tf.reset_default_graph()

In [None]:
# Show the qtable attribute of the GamePlayer trained earlier
# (presumably the learned Q-values -- confirm in qlearning.GamePlayer).
print(game.qtable)

In [None]:
#These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1,16],dtype=tf.float32)  # using a network of size env.observation_space is confusing
W = tf.Variable(tf.random_uniform([16,4],0,0.01))
Qout = tf.matmul(inputs1,W) # matmulf: matrix multiplication
predict = tf.argmax(Qout,1)

#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1,4],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

### Training the network

In [None]:
# Op that initialises every variable in the graph. This replaces the
# deprecated tf.initialize_all_variables().
init = tf.global_variables_initializer()

# Set learning parameters
# NOTE: despite its name, alpha is used below as the discount factor applied
# to the best next-state Q-value; the optimizer's learning rate is set where
# `trainer` is created.
alpha = .99
epsilon = 0.1  # probability of replacing the greedy action with a random one
num_episodes = 2000
# Per-episode statistics: last step index reached and total reward collected.
nStepBeforeEndList = []
rewardList = []
# One-hot encodings of the 16 states, built once instead of on every step.
one_hot_states = np.identity(16)
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        # Reset environment and get first new observation
        state = env.reset()
        totalReward = 0
        done = False
        # The Q-Network: at most 100 steps per episode
        for step in range(100):
            state_row = one_hot_states[state:state + 1]
            # Choose an action greedily (with epsilon chance of a random action) from the Q-network
            action, allQ = sess.run([predict, Qout], feed_dict={inputs1: state_row})
            if np.random.rand(1) < epsilon:
                action[0] = env.action_space.sample()
            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action[0])
            # Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout, feed_dict={inputs1: one_hot_states[next_state:next_state + 1]})
            # Obtain maxQ' and set our target value for the chosen action:
            # target(s, a) = reward + alpha * max_a' Q(s', a')
            # NOTE: this bootstraps from next_state even when the episode has
            # just ended. Rows of W for states that end an episode are never the
            # input of an update, so that term stays at its small initial value.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, action[0]] = reward + alpha * maxQ1
            # Train our network using target and predicted Q values
            sess.run(updateModel, feed_dict={inputs1: state_row, nextQ: targetQ})
            totalReward += reward
            state = next_state
            if done:
                # Reduce chance of random action as we train the model.
                epsilon = 1. / ((i / 50) + 10)
                break
        nStepBeforeEndList.append(step)
        rewardList.append(totalReward)
# Scale the per-episode average by 100 so the printed number matches the "%"
# label. Assumes each episode's total reward is 1 on success and 0 otherwise
# (the environment's default rewards) -- TODO confirm.
print ("Percent of succesful episodes: " + str(100 * sum(rewardList)/num_episodes) + "%")

In [None]:
def tf_play(state):
    """Return the action the Q-network's ``predict`` op picks for ``state``.

    The state index is one-hot encoded over 16 entries and fed to ``inputs1``.

    NOTE(review): this opens a brand-new session and runs the variable
    initializer in it, so ``W`` is re-drawn from its random initializer. The
    weights learned in the training cell lived in that cell's session, which
    was closed when its ``with`` block exited. As written, the returned action
    therefore comes from an untrained network. Each call also adds another
    initializer op to the default graph.
    """
    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        action = sess.run([predict],feed_dict={inputs1:np.identity(16)[state:state+1]})[0][0]
    return action
        
# Watch the network-driven player, using tf_play to choose each action.
qlearning.visualize_computer_playing(1, env, tf_play)

### Some statistics on network performance

We can see that the network begins to consistently reach the goal around the 750-episode mark.

In [None]:
# Total reward collected in each training episode, in episode order.
plt.plot(rewardList)

It also begins to progress through the environment for longer than chance around the 750-episode mark.

In [None]:
# Index of the last step taken in each training episode (the 0-based loop
# counter recorded by the training cell), in episode order.
plt.plot(nStepBeforeEndList)