# RL Simple Example

In [1]:
# See that, we are using the correct environment
import sys
sys.executable

'/Users/sandeep/miniconda3/envs/rlenv/bin/python'

## Exploring the NChain-v0 state machine of gym

- Some Imports

In [2]:
import numpy as np
import gym
import random

# Source code for this environment
- https://github.com/openai/gym/blob/master/gym/envs/toy_text/nchain.py

# Details about the environment

This env presents moves along a linear chain of states.

## *Two actions*: Forward (0) and Backward(1)
### 1. **Forward**, which moves along the chain but returns no reward
### 2. **Backward**, which returns to the beginning and has a small reward

## *Total States*: *5 States* (0 to 4)

## Reward
### 1. Backward action gives reward of 1.
### 2. Forward gives no reward until the final stage. Reward of staying in state 4 is 100.

In [3]:
# Create the environment

from nchain import *
env = NChainEnv(slip=0, small=1, large=100)

# Reset is used to initialize the env. Often it sets the environment to the begining state

In [4]:
Starting_State = env.reset()
print("Starting State is: ",Starting_State)

Starting State is:  0


# Goal: Move throught the chain. Print states and the reward

## env.step(action), returns a tuple
- Tuple is: (new state, reward, done, info)
- Environment gives done after 1000 steps

# Execute 10 forward actions
- After reaching the state 5, agent stays there.

In [5]:
Done = False
step = 0

cur_state = env.reset()

for i in range(10):
    
    #execute forward action
    action = 0
    
    next_state,r,Done,_ = env.step(action)
    
    step = step + 1
    print("Step: ",step, "cur_state:",cur_state, " Action:",action," next_state:",next_state," Reward: ",r," Done: ",Done)
    cur_state = next_state

Step:  1 cur_state: 0  Action: 0  next_state: 1  Reward:  0  Done:  False
Step:  2 cur_state: 1  Action: 0  next_state: 2  Reward:  0  Done:  False
Step:  3 cur_state: 2  Action: 0  next_state: 3  Reward:  0  Done:  False
Step:  4 cur_state: 3  Action: 0  next_state: 4  Reward:  0  Done:  False
Step:  5 cur_state: 4  Action: 0  next_state: 4  Reward:  100  Done:  False
Step:  6 cur_state: 4  Action: 0  next_state: 4  Reward:  100  Done:  False
Step:  7 cur_state: 4  Action: 0  next_state: 4  Reward:  100  Done:  False
Step:  8 cur_state: 4  Action: 0  next_state: 4  Reward:  100  Done:  False
Step:  9 cur_state: 4  Action: 0  next_state: 4  Reward:  100  Done:  False
Step:  10 cur_state: 4  Action: 0  next_state: 4  Reward:  100  Done:  False


# Execute 10 backward actions
- Agent stays in state 0

In [6]:
Done = False
step = 0

cur_state = env.reset()

for i in range(10):
    
    #execute backward action
    action = 1
    
    next_state,r,Done,_ = env.step(action)
    
    step = step + 1
    print("Step: ",step, "cur_state:",cur_state, " Action:",action," next_state:",next_state," Reward: ",r," Done: ",Done)
    cur_state = next_state

Step:  1 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  2 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  3 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  4 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  5 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  6 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  7 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  8 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  9 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  10 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False


# Execute 10 random actions

In [14]:
Done = False
step = 0

cur_state = env.reset()

for i in range(10):
    
    #execute random actions
    action = random.randint(0,1)
    
    next_state,r,Done,_ = env.step(action)
    
    step = step + 1
    print("Step: ",step, "cur_state:",cur_state, " Action:",action," next_state:",next_state," Reward: ",r," Done: ",Done)
    cur_state = next_state

Step:  1 cur_state: 0  Action: 0  next_state: 1  Reward:  0  Done:  False
Step:  2 cur_state: 1  Action: 0  next_state: 2  Reward:  0  Done:  False
Step:  3 cur_state: 2  Action: 0  next_state: 3  Reward:  0  Done:  False
Step:  4 cur_state: 3  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  5 cur_state: 0  Action: 0  next_state: 1  Reward:  0  Done:  False
Step:  6 cur_state: 1  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  7 cur_state: 0  Action: 1  next_state: 0  Reward:  1  Done:  False
Step:  8 cur_state: 0  Action: 0  next_state: 1  Reward:  0  Done:  False
Step:  9 cur_state: 1  Action: 0  next_state: 2  Reward:  0  Done:  False
Step:  10 cur_state: 2  Action: 1  next_state: 0  Reward:  1  Done:  False


# Value function evaluation
## 1. Fix the policy
## 2. Execute the policy for some steps and find the value of each state

# Value function for state: 0 using forward policy

In [15]:
env.set_state(0)

num_steps = 1000

reward = 0
discount = 1.0
factor = 0.99

for i in range(num_steps):
    #execute forward action: 0
    action = 0
    next_state,r,Done,_ = env.step(action)
    
    #discounted sum of rewards
    reward = reward + discount*r
    discount = discount * factor
    
print("Value Function is:",reward)
print("Discount is:",discount)

Value Function is: 9605.528387525903
Discount is: 4.317124741065784e-05


# Value function for state: 0 using random policy

In [21]:
num_episodes = 500
Total_reward = 0
start_state = 0

for j in range(num_episodes):

    env.set_state(start_state)
    num_steps = 1000

    reward = 0
    discount = 1.0
    factor = 1.0

    for i in range(num_steps):
        #execute random actions
        action = random.randint(0,1)

        next_state,r,Done,_ = env.step(action)
        
        #discounted sum of rewards
        reward = reward + discount*r
        discount = discount * factor
    
    Total_reward = Total_reward+reward
    
print("Value Function is:",Total_reward/num_episodes)

Value Function is: 3625.376


# Value function for state: 0 using backward policy

In [24]:
env.set_state(start_state)

num_steps = 1000

reward = 0
discount = 1.0
factor = 1.0

for i in range(num_steps):
    #execute backward action: 1
    action = 1
    
    next_state,r,Done,_ = env.step(action)
    reward = reward + discount*r
    discount = discount * factor
    
print("Value Function is:",reward)
print("Discount is:",discount)

Value Function is: 1000.0
Discount is: 1.0


# Value function for state: 4 using random policy

In [27]:
num_episodes = 500
Total_reward = 0
start_state = 4

env.reset()

for j in range(num_episodes):

    env.set_state(start_state)
    num_steps = 1000

    reward = 0
    discount = 1.0
    factor = 1.0

    for i in range(num_steps):
        #execute random actions
        action = random.randint(0,1)

        next_state,r,Done,_ = env.step(action)
        reward = reward + discount*r
        discount = discount * factor
    
    Total_reward = Total_reward+reward
    
print("Value Function is:",Total_reward/num_episodes)

Value Function is: 3670.924


# Discuss the observations

# Iterative value function calculation
# Policy:  Both actions with equal probability

In [28]:
action_0 = 0.5
action_1 = 1-action_0

discount = 1.0

def Calculate_Value_Function(env, itr=1000):
    
    #initialize the value function
    v_table = np.zeros((5))
    
    for i in range(itr):
        
        #5 states
        for j in range(5):
            
            env.set_state(j)
            #next state with action 0
            next_state_0,r_0,_,_ = env.step(0)
            
            
            env.set_state(j)
            #next state with action 1
            next_state_1,r_1,_,_ = env.step(1)
            
            v_table[j] = (action_0*r_0+action_1*r_1) + discount*(action_0*v_table[next_state_0]+ action_1*(v_table[next_state_1]))
        #print(i,v_table)
    return v_table

In [29]:
v_table = Calculate_Value_Function(env,itr=1000)
print(v_table)

[4815.05555556 4823.72222222 4836.22222222 4861.22222222 4911.22222222]


# Policy Improvement

## Random Poliocy
## [4815.05555556 4823.72222222 4836.22222222 4861.22222222 4911.22222222]

## Policy Improvement follow: S0 ---> S1----> S2 ----> S3 ----> S4(Self-Loop) 

# Q Learning

## Steps:
## 1. Start with initial Q values.
## 2. Select the action using the current Q values.
## 3. Take the action, and update the Q values using reward.
## 4. Repeat step 2 and step 3

# Q Learning with Table

In [30]:
def q_learning_with_table(env, num_episodes=500):
    
    # Stores the Q values: 5 states and 2 actions
    q_table = np.zeros((5, 2))
    
    
    gamma = 0.99  # Discount
    alpha = 0.8  # Learning rate
    
    
    for i in range(num_episodes):
        s = env.reset()
        
        #print("episode is: ",i)
        
        steps = 1000
        for i in range(steps):
            
            if np.sum(q_table[s,:]) == 0: 
                
                # we don't which is the better value
                # Make a random selection of actions
                a = np.random.randint(0, 2)
            
            else:
                # select the action with largest q value in state s
                a = np.argmax(q_table[s, :])
            
            # Take the action
            new_s, r, done, _ = env.step(a)
            
            # update the Q value
            q_table[s, a] += r + alpha*(gamma*np.max(q_table[new_s, :]) - q_table[s, a])
            s = new_s
            
            #print("Steps: ",i, a, new_s, r, done)
           
            
    return q_table

In [31]:
q_learning_with_table(env)

array([[  0., 125.],
       [  0.,   0.],
       [  0.,   0.],
       [  0.,   0.],
       [  0.,   0.]])

### Something is wrong
### There isn’t enough exploration in the training method.

## We are adding exploring now
### Adaptively explore the state space

In [32]:
def explorative_q_learning_with_table(env, num_episodes=500):
    
    q_table = np.zeros((5, 2))
    
    gamma = 0.99  # Discount
    
    eps = 0.99 # Exploration factor
    
    alpha = 0.8  # Learning rate
    
    eps_factor = 0.999 # Exploration decay
    
    for i in range(num_episodes):
        s = env.reset()
        
        eps = eps*eps_factor
        
        steps = 1000
        
        for i in range(steps):
            
            # select the action with highest cummulative reward
            if np.random.random() < eps or np.sum(q_table[s, :]) == 0:
                a = np.random.randint(0, 2)
            else:
                a = np.argmax(q_table[s, :])
           
            new_s, r, done, _ = env.step(a)
            q_table[s, a] += r + alpha * (gamma * np.max(q_table[new_s, :]) - q_table[s, a])
            s = new_s
        
    return q_table

In [33]:
explorative_q_learning_with_table(env)

array([[12007.450125  , 11888.62562375],
       [12128.7375    , 11888.62562375],
       [12251.25      , 11888.62562375],
       [12375.        , 11888.62562375],
       [12500.        , 11888.62562375]])

# Observation: At every state the reward is more for the action '0': Forward

# Policy Decision

# At each state, select the optimal action given the best Q-value.