In [1]:
import gym
import random
import numpy as np

### Enviroments :

> CartPole-v0

> Acrobot-v1

> MountainCar-v0

> FrozenLake-v0
>> S:Start  |  F:Frozen  |   H:Hole  |G:Goal

# FrozenLake

In [2]:
env = gym.make('FrozenLake-v1')

##### posible action : 
- left , right , Top , bottom

In [3]:
env.action_space.n

4

##### Enviroment space : 
- 4*4 gride (16 tiles)

In [4]:
env.observation_space

Discrete(16)

##### Initialize Learning Rate :

In [5]:
alpha = 0.4 

##### Initialize Discount Factor :

In [6]:
gamma = 0.999

##### Initialize Q_Table :

In [7]:
### key is current state

q_table = dict([x,[1,1,1,1]] for x in range(16))
q_table

{0: [1, 1, 1, 1],
 1: [1, 1, 1, 1],
 2: [1, 1, 1, 1],
 3: [1, 1, 1, 1],
 4: [1, 1, 1, 1],
 5: [1, 1, 1, 1],
 6: [1, 1, 1, 1],
 7: [1, 1, 1, 1],
 8: [1, 1, 1, 1],
 9: [1, 1, 1, 1],
 10: [1, 1, 1, 1],
 11: [1, 1, 1, 1],
 12: [1, 1, 1, 1],
 13: [1, 1, 1, 1],
 14: [1, 1, 1, 1],
 15: [1, 1, 1, 1]}

### helper func. to choose the next action based on the current observation

In [8]:
def choose_action(observ):
    return np.argmax(q_table[observ]) ## action with highest Q-Value

## we know nothing about enviroment , lets explore :

### Each reset of enviroment can be considered to be one episode
- an episode is start of a new task , and this task is seen through until the end
- we consider a maximum of 10,000 time intervals for each episode
- at start of each episode we do not have any value of prev_observ & prev_action
- run 2500 time steps in each episode
- an episode is considered complete
- r=1.0 implies that we reach the goal tile
- r=0.0 implies we fail
> if we reach the goal (retrieve frisbee) or if we fall in hole and plummet to our death or if we run continuously for 2500 time instance without reaching the goal and whithout dying

> in SARSA method we update Q_Value for the previous state-action combination after executing the action and collecting the actual reward

In [21]:
for i in range(10000):
    observ        = env.reset()
    action        = choose_action(observ)
    prev_observ   = None
    prev_action   = None
    
    t             = 0
    
    for t in range(2500):
        env.render()
        
        ### observation is next state because of our action
        ### reward is for that particular action
        observ , reward , done , info = env.step(action)
        action    = choose_action(observ)
        
        ### update Q_Values in Q_Table provided we had a previous state
        if not prev_observ is None :
            
            q_old = q_table[prev_observ][prev_action]
            q_new = q_old
            
            ### if the episode is complete we calc new Q_value for previous state-action comb.
            ### this is simplified formula that only contain reward for current action 
            ### if the episode end there is no future reward to discount
            if done :
                q_new += alpha * (reward - q_old)
            
            ### if the episode isn't done yet we use SARSA 
            ### Remember SARSA uses Actual reward obtained to calc. Q_Value for prev state
            else :
                q_new += alpha * (reward + gamma * q_table[observ][action]-q_old)
                
            
            new_table              = q_table[prev_observ]
            new_table[prev_action] = q_new
            q_table[prev_observ]   = new_table
            #___________________ or___________________________!
            #q_table[prev_observ][prev_action] = q_new
        
        prev_observ = observ
        prev_action = action
        
        if done :
            print("Episode {} finished after {} timesteps with r={}.".format(i,t,reward))
            break

[![dis.png](https://i.postimg.cc/hG0FmYpf/dis.png)](https://postimg.cc/KK4sS0X2)

<h4><font color=red>also the action says left but with small probability some other action might be taken (for exploration) so to goes down</font></h4>

### The last update that we made to particular Q_Value

In [14]:
new_table

[0.29651635792390385,
 0.32891410132554755,
 0.9941406608565958,
 0.3211242719374764]

### Fully populated Q_Table :
- Q_Values computed using SARSA method for every state-action combination

In [19]:
q_table

{0: [0.616709988569774,
  0.15437045295331192,
  0.1647438033245746,
  0.16674122440969114],
 1: [0.09741929108316053,
  0.09723723932205154,
  0.09096514983655313,
  0.6861603434743044],
 2: [0.11728984582734778,
  0.12773695601909563,
  0.0994933777194287,
  0.7323632235970641],
 3: [0.09651028021773994,
  0.12005512575970613,
  0.12944454219970558,
  0.7258811585911],
 4: [0.6453960767075267,
  0.12954815999999997,
  0.1032919938882575,
  0.09600963782156191],
 5: [1, 1, 1, 1],
 6: [0.007016734361078387,
  0.005992242425440991,
  0.48449702394137384,
  0.006903589158444359],
 7: [1, 1, 1, 1],
 8: [0.09623518823465951,
  0.11015373425147809,
  0.12941835706015267,
  0.6749081330801756],
 9: [0.10662202834920921,
  0.7990166702474116,
  0.10392309758111536,
  0.11297635549382402],
 10: [0.6876114871375615,
  0.040782636827680935,
  0.05798380499977701,
  0.06102741687492574],
 11: [1, 1, 1, 1],
 12: [1, 1, 1, 1],
 13: [0.19813718308674946,
  0.21591359999999996,
  0.9503445002033969,


In [16]:
env.close()

# Attachment :

[Project Malmo](https://www.microsoft.com/en-us/research/project/project-malmo/)

[Deepmind Lab](https://deepmind.com/research/publications/2019/deepmind-lab)

[VIZDOOM](http://vizdoom.cs.put.edu.pl/)

[gym](https://gym.openai.com/)