# SARSA Learning

## Imports

In [3]:
import numpy as np 
import gym 

SARSA Function iterative step:   
  
$ \large s_t $ : state at time t   
$ \large a_t $ : action at time t    
$ \large r_t $ : reward at time t  
$ \large \alpha $ : learning rate. Determines to what extent newly acquired information overrides old information      
$ \large \gamma $ : discount factor. Determines the importance of future rewards (0 is short-sighted, 1 long-sighted)   
  
$ \large Q(s_t, a_t) = Q(s_t, a_t) + \alpha \; [\; r_{t+1} + \gamma \; Q(s_{t+1}, a_{t+1}) - Q(s_{t}, a_{t}) ]$

## Build the environment

In [7]:
# Environment the agent is trained on.
env = gym.make('FrozenLake-v0')

# Learning hyperparameters (read as globals by the helper functions).
epsilon = 0.9   # probability that choose_action samples a random action
alpha = 0.85    # learning rate applied to the TD error in updateQvalue
gamma = 0.95    # discount factor applied to the next state-action value

# Training budget.
total_episodes = 10000
max_steps = 100   # upper bound on steps taken within a single episode

# Q-table: one row per state, one column per action, initialised to zero.
Q = np.zeros(shape=(env.observation_space.n, env.action_space.n))

## Helpers

In [11]:
def choose_action(s):
    """Select an action for state ``s`` using an epsilon-greedy rule.

    A uniform random number in [0, 1) is drawn; if it is below the global
    ``epsilon`` a random action is sampled from ``env.action_space``,
    otherwise the index of the largest entry in row ``s`` of the global
    ``Q`` table is returned.

    Parameters
    ----------
    s : index of the current state (row of ``Q``).

    Returns
    -------
    The chosen action index.
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    # Greedy branch: best known action for this state.
    return np.argmax(Q[s, :])
  
def updateQvalue(s1, a1, r, s2, a2):
    """Apply one SARSA update to the global ``Q`` table, in place.

    Implements ``Q[s1, a1] += alpha * (r + gamma * Q[s2, a2] - Q[s1, a1])``
    using the global ``alpha`` and ``gamma``.

    Parameters
    ----------
    s1, a1 : state and action whose value is being updated.
    r      : reward received after taking ``a1`` in ``s1``.
    s2, a2 : next state and the action chosen in it.

    Returns
    -------
    None; ``Q`` is modified as a side effect.
    """
    td_target = r + gamma * Q[s2, a2]
    td_error = td_target - Q[s1, a1]
    Q[s1, a1] += alpha * td_error


## Training

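The training loop: at each step it renders the environment, takes the current action to observe the next state and reward, chooses the next action, and then updates the Q-value and the cumulative reward.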

In [None]:
# Cumulative reward collected over all episodes. The validation cell divides
# this by total_episodes to report the average reward per episode.
reward = 0

# Rendering prints the grid on every step, which produces a very large amount
# of output over the full run. Left enabled to keep the original behaviour;
# set to False to train silently.
render_training = True

for episode in range(total_episodes):
    t = 0
    s1 = env.reset()
    a1 = choose_action(s1)

    while t < max_steps:
        if render_training:
            env.render()

        # Take the current action, then pick the next one *before* updating:
        # SARSA is on-policy, so the update uses the action actually chosen.
        s2, r, done, info = env.step(a1)
        a2 = choose_action(s2)
        updateQvalue(s1, a1, r, s2, a2)

        s1 = s2
        a1 = a2

        t += 1
        # Accumulate the step reward into the running total. The original
        # code did `r += 1`, which only altered the per-step local and left
        # `reward` at 0, so the reported performance was always 0.
        reward += r

        if done:
            break


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Down)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFF[41mF[0m
FHFH
FFFH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF

## Validation

In [None]:
# Average reward per episode, computed from the total accumulated in the
# training cell, followed by the learned Q-table as the cell's output.
performance = reward / total_episodes
print("Performace : ", performance, '\n\n')
Q

## Credits & Links

https://www.geeksforgeeks.org/sarsa-reinforcement-learning/