In [1]:
import gym
# Print the installed gym version for reference: the cells below unpack
# `env.step(...)` into four values and use the return value of `env.reset()`
# directly as the state, so results are tied to the gym API in use.
print(gym.__version__)

0.15.3


In [4]:
import numpy as np
import time
import random
from IPython.display import clear_output
from tqdm import tqdm_notebook as tqdm

In [52]:
# Build the FrozenLake environment shared by the training and playback cells.
env = gym.make("FrozenLake-v0")

In [53]:
# Size the Q-table from the environment itself: one row per discrete state,
# one column per discrete action, every entry starting at zero.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros(shape=(state_space_size, action_space_size))

# Display the table dimensions as the cell output.
q_table.shape

(16, 4)

In [54]:
# --- Training budget ---
num_episodes = 1000000         # number of episodes the training loop runs
max_steps_per_episode = 100    # cap on the steps taken within one episode

# --- Q-learning update ---
learning_rate = 0.01           # weight given to the new estimate in each update
discount_rate = 0.99           # weight given to the best next-state value

# --- Epsilon-greedy exploration schedule ---
# The training loop decays `exploration_rate` exponentially per episode from
# `max_exploration_rate` towards `min_exploration_rate`.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = 1           # starting value; overwritten after every episode

In [58]:
# Q-learning implementation:
# Trains `q_table` in place with an epsilon-greedy policy and records the total
# reward of every episode in `rewards_all_episode`.
# NOTE: this cell does not reset `q_table` or `exploration_rate`; re-running it
# continues from whatever values they currently hold.
rewards_all_episode = []

for episode in tqdm(range(num_episodes)):
    # initialize a new episode
    state = env.reset()
    done = False
    reward_current_episode = 0

    for step in range(max_steps_per_episode):
        # epsilon-greedy choice: draw a number in [0, 1] and compare it with
        # the current exploration rate
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            # exploitation: take the action with the highest Q-value so far
            action = np.argmax(q_table[state, :])
        else:
            # exploration: take a random action
            action = env.action_space.sample()

        # take the action
        new_state, reward, done, info = env.step(action)

        # blend the old estimate with the new target
        # (reward + discounted best value of the next state)
        q_table[state, action] = (
            ((1 - learning_rate) * q_table[state, action])
            + (learning_rate * (reward + discount_rate * np.max(q_table[new_state, :])))
        )

        # transition to the next state
        state = new_state
        reward_current_episode += reward

        # exit the episode once the environment reports it is over
        if done:
            break

    # exponential decay of the exploration rate towards its minimum
    exploration_rate = min_exploration_rate + (
        (max_exploration_rate - min_exploration_rate)
        * np.exp(-exploration_decay_rate * episode)
    )

    rewards_all_episode.append(reward_current_episode)

        
        
                

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))




In [59]:
# Calculate and print the average reward per thousand episodes.
# The chunks are built from the rewards that were actually recorded rather than
# from `num_episodes`, so the report also works after an interrupted training
# run or when the episode count is not a multiple of the chunk size (the last
# chunk is then shorter and is averaged over its own length).
episodes_per_chunk = 1000
recorded_rewards = np.asarray(rewards_all_episode, dtype=float)
rewards_per_thosand_episodes = [
    recorded_rewards[start:start + episodes_per_chunk]
    for start in range(0, len(recorded_rewards), episodes_per_chunk)
]

print("********Average reward per thousand episodes********\n")
count = 0  # number of episodes covered up to and including the current chunk
for r in rewards_per_thosand_episodes:
    count += len(r)
    print(count, ": %.3f" % r.mean())


********Average reward per thousand episodes********

1000 : 0.025
2000 : 0.065
3000 : 0.210
4000 : 0.267
5000 : 0.282
6000 : 0.266
7000 : 0.288
8000 : 0.279
9000 : 0.280
10000 : 0.291
11000 : 0.288
12000 : 0.320
13000 : 0.318
14000 : 0.301
15000 : 0.302
16000 : 0.560
17000 : 0.538
18000 : 0.545
19000 : 0.544
20000 : 0.537
21000 : 0.511
22000 : 0.529
23000 : 0.524
24000 : 0.555
25000 : 0.540
26000 : 0.531
27000 : 0.557
28000 : 0.540
29000 : 0.538
30000 : 0.551
31000 : 0.536
32000 : 0.526
33000 : 0.549
34000 : 0.567
35000 : 0.545
36000 : 0.546
37000 : 0.548
38000 : 0.556
39000 : 0.535
40000 : 0.529
41000 : 0.527
42000 : 0.558
43000 : 0.551
44000 : 0.516
45000 : 0.549
46000 : 0.551
47000 : 0.550
48000 : 0.552
49000 : 0.552
50000 : 0.539
51000 : 0.540
52000 : 0.579
53000 : 0.566
54000 : 0.590
55000 : 0.602
56000 : 0.595
57000 : 0.576
58000 : 0.573
59000 : 0.601
60000 : 0.599
61000 : 0.628
62000 : 0.598
63000 : 0.591
64000 : 0.620
65000 : 0.610
66000 : 0.674
67000 : 0.674
68000 : 0.690
690

550000 : 0.683
551000 : 0.701
552000 : 0.711
553000 : 0.655
554000 : 0.694
555000 : 0.661
556000 : 0.665
557000 : 0.696
558000 : 0.678
559000 : 0.694
560000 : 0.677
561000 : 0.684
562000 : 0.657
563000 : 0.688
564000 : 0.681
565000 : 0.650
566000 : 0.667
567000 : 0.689
568000 : 0.702
569000 : 0.649
570000 : 0.707
571000 : 0.688
572000 : 0.669
573000 : 0.690
574000 : 0.712
575000 : 0.680
576000 : 0.707
577000 : 0.693
578000 : 0.690
579000 : 0.673
580000 : 0.672
581000 : 0.679
582000 : 0.694
583000 : 0.660
584000 : 0.677
585000 : 0.673
586000 : 0.704
587000 : 0.664
588000 : 0.685
589000 : 0.699
590000 : 0.659
591000 : 0.672
592000 : 0.689
593000 : 0.701
594000 : 0.680
595000 : 0.693
596000 : 0.691
597000 : 0.664
598000 : 0.682
599000 : 0.685
600000 : 0.685
601000 : 0.684
602000 : 0.706
603000 : 0.665
604000 : 0.673
605000 : 0.705
606000 : 0.673
607000 : 0.697
608000 : 0.687
609000 : 0.683
610000 : 0.669
611000 : 0.680
612000 : 0.692
613000 : 0.691
614000 : 0.695
615000 : 0.679
616000 : 0

In [60]:
# Interpreting the results
# Each row is a state and each column an action: the table was allocated as
# (state_space_size, action_space_size) and is indexed as q_table[state, action].
# Rows that are still all zeros were never given a non-zero update; presumably
# these are the terminal states (holes / goal) — TODO confirm against the map.
q_table

array([[0.52960222, 0.50696265, 0.50504294, 0.49757901],
       [0.31466148, 0.3183473 , 0.30041897, 0.44279878],
       [0.37440354, 0.3143898 , 0.28653805, 0.33001418],
       [0.17500452, 0.02335419, 0.01221098, 0.02994933],
       [0.54795294, 0.36492849, 0.37542263, 0.35341444],
       [0.        , 0.        , 0.        , 0.        ],
       [0.27228914, 0.188166  , 0.33530807, 0.11939158],
       [0.        , 0.        , 0.        , 0.        ],
       [0.35140246, 0.40766645, 0.36918665, 0.57685318],
       [0.44996111, 0.62552095, 0.45145697, 0.36918563],
       [0.59748698, 0.48457239, 0.40913522, 0.32257495],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.43515616, 0.50202965, 0.73987762, 0.47832563],
       [0.72828018, 0.8552339 , 0.80261344, 0.77744184],
       [0.        , 0.        , 0.        , 0.        ]])

In [65]:
# watch our agent play the frozen lake by playing the best action
# Runs three episodes with the greedy policy read from `q_table` (no learning
# happens here) and reports how each episode ended.
for episode in range(3):
    # initialize the new episode
    done = False
    state = env.reset()
    for step in range(max_steps_per_episode):
        # show the current board, then choose and apply the greedy action
        clear_output(wait=True)
        env.render()
        time.sleep(0.1)
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # `step` is zero-based, so the number of actions taken is step + 1
            steps_taken = step + 1
            if reward == 1:
                print('Congratulations, the agent has reach the Goal in %d steps' % (steps_taken))
            elif info.get('TimeLimit.truncated', False):
                # The episode was cut off by the environment's own step limit,
                # not by falling into a hole (key set by gym's TimeLimit
                # wrapper when present; falls through to the branch below otherwise).
                print('The Agent is dumb and haven\'t figured out in %d steps' % (steps_taken))
            else:
                print('Oh, the agent fell into the lake in %d steps' % (steps_taken))
            # keep the outcome on screen before the next episode redraws the board
            time.sleep(5)
            break
        # set new state
        state = new_state
    else:
        # the step budget ran out without the environment ending the episode
        print('The Agent is dumb and haven\'t figured out in %d steps' % (max_steps_per_episode))
        time.sleep(5)
    clear_output(wait=True)
env.close()
    

  (Right)
SFFF
FHFH
FFFH
HFFG
Congratulations, the agent has reach the Goal in 51 steps
