### CartPole

In [16]:
import sys

sys.path.append('/home/ubuntu/udacity/gym/')

import gym

import numpy as np

In [17]:
# Create the CartPole environment used by the exploration cells below.
env = gym.make('CartPole-v0')
# reset() is the last expression of the cell, so its return value (the
# initial observation array) is what gets displayed as the cell output.
env.reset()

array([-0.00227331, -0.02390831,  0.03799507, -0.00359405])

##### Explore Gym

In [18]:
# Inspect the set of actions the agent can choose from.
env.action_space

Discrete(2)

In [19]:
# Inspect the space that the environment's observations belong to.
env.observation_space

Box(4,)

In [20]:
# Draw a single random action from the action space.
env.action_space.sample()


0

In [21]:
# Display the environment's repr, which shows the wrappers applied around it.
env

<TimeLimit<CartPoleEnv<CartPole-v0>>>

##### Run a few episodes -- resetting the game after each

In [22]:
episodes = 10

# Play `episodes` episodes using uniformly random actions and report the
# length and total reward of each one.
for ep in range(episodes):
    # Reset at the *start* of every episode so this cell does not rely on a
    # reset performed by an earlier cell and can safely be re-run on its own.
    env.reset()
    episode_rewards = 0
    time_steps = 0
    done = False
    while not done:
        action = env.action_space.sample()
        state,reward,done,info = env.step(action)
        time_steps += 1
        episode_rewards += reward
    print("Episode {} took {} steps and got {} rewards".format(ep,time_steps,episode_rewards))

Episode 0 took 20 steps and got 20.0 rewards
Episode 1 took 32 steps and got 32.0 rewards
Episode 2 took 16 steps and got 16.0 rewards
Episode 3 took 58 steps and got 58.0 rewards
Episode 4 took 15 steps and got 15.0 rewards
Episode 5 took 12 steps and got 12.0 rewards
Episode 6 took 16 steps and got 16.0 rewards
Episode 7 took 23 steps and got 23.0 rewards
Episode 8 took 17 steps and got 17.0 rewards
Episode 9 took 21 steps and got 21.0 rewards


#### Policy Evaluation

In [23]:
# Release the previous environment before replacing it. Resetting it here
# would be wasted work because `env` is rebound on the very next line.
env.close()
env = gym.make('FrozenLake-v0')
# Strip the wrappers so the raw environment's attributes (its transition
# table `P`, and the sizes `nS` / `nA`) can be accessed directly.
env = env.unwrapped

In [24]:
# Transitions for state 1 under action 0. Each entry is a
# (transition probability, next state, reward, done) tuple.
env.P[1][0]

[(0.3333333333333333, 1, 0.0, False),
 (0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 5, 0.0, True)]

In [25]:
def policy_evaluation(states, policy, env_model, discount, theta=0.0001):
    """Iteratively evaluate `policy` on a tabular MDP using in-place sweeps.

    Args:
        states: 1-D array of state-value estimates indexed by state. It is
            updated in place and is also the return value.
        policy: 2-D array where ``policy[s, a]`` is the probability of taking
            action ``a`` in state ``s``.
        env_model: Transition table where ``env_model[s][a]`` is an iterable
            of ``(transition_prob, next_state, reward, done)`` tuples. States
            and actions are expected to be indexed ``0..n-1``.
        discount: Discount factor applied to the value of the next state.
        theta: Convergence threshold. Sweeps stop once the largest change of
            any state value within a sweep is no greater than ``theta``.

    Returns:
        The same ``states`` array, holding the evaluated state values.
    """
    # Size the sweep from the model that was passed in rather than from the
    # notebook-global `env`, so the function evaluates whatever model the
    # caller supplies even if `env` has since been rebound to a different
    # environment (or is not defined at all).
    state_len = len(env_model)

    # Start above the threshold so that the first sweep runs (for theta > 0).
    delta = theta*2
    while delta > theta:
        delta = 0
        for s in range(state_len):
            new_s = 0.
            for a in range(len(env_model[s])):
                for transition_prob, next_state, reward, done in env_model[s][a]:
                    if done:
                        # Terminal transition: no future value is bootstrapped.
                        new_s += policy[s,a]*transition_prob*reward
                    else:
                        new_s += policy[s,a]*transition_prob*(reward + discount*states[next_state])
            delta = max(delta, np.abs(new_s - states[s]))
            # In-place update: states visited later in this sweep already
            # see the refreshed value.
            states[s] = new_s

    return states
            

In [26]:
# Evaluate the uniform-random policy on the current environment.
gamma = 0.99
state_size, action_size = env.nS, env.nA

# Uniform policy: every action is equally likely in every state.
policy_array = np.full((state_size, action_size), 1.0 / action_size)

# Start from all-zero value estimates; policy_evaluation refines and returns them.
state_value_array = policy_evaluation(np.zeros(state_size), policy_array, env.P, gamma)

print("Reshaped State Value Estimates with gamma of {}:".format(gamma))
print(state_value_array.reshape(4,4).round(3))
print("")

Reshaped State Value Estimates with gamma of 0.99:
[[0.012 0.01  0.019 0.009]
 [0.015 0.    0.039 0.   ]
 [0.033 0.084 0.138 0.   ]
 [0.    0.17  0.434 0.   ]]



#### Using MDP Framework with Policy Evaluation

##### Policy Improvement: FrozenLake

In [27]:
def policy_improvement(states, policy, env_model, discount):
    """Make `policy` greedy with respect to the state values in `states`.

    Args:
        states: 1-D array of state-value estimates indexed by state.
        policy: 2-D array where ``policy[s, a]`` is the probability of taking
            action ``a`` in state ``s``. It is overwritten in place with a
            one-hot (deterministic) greedy policy.
        env_model: Transition table where ``env_model[s][a]`` is an iterable
            of ``(transition_prob, next_state, reward, done)`` tuples. States
            and actions are expected to be indexed ``0..n-1``.
        discount: Discount factor applied to the value of the next state.

    Returns:
        A ``(policy_stable, states, policy)`` tuple. ``policy_stable`` is True
        when no state's highest-probability action changed; ``states`` and
        ``policy`` are the same objects that were passed in.
    """
    policy_stable = True
    # Size the loops from the model that was passed in rather than from the
    # notebook-global `env`, so the function works on whatever model the
    # caller supplies even if `env` has been rebound or is not defined.
    state_len = len(env_model)

    for s in range(state_len):
        old_action = np.argmax(policy[s])
        action_len = len(env_model[s])

        # Expected return of each action from state s (one-step lookahead).
        temp_array = np.zeros(action_len)
        for a in range(action_len):
            for transitions_prob, next_state, reward, done in env_model[s][a]:
                if done:
                    # Terminal transition: no future value is bootstrapped.
                    temp_array[a] += transitions_prob*reward
                else:
                    temp_array[a] += transitions_prob*(reward + discount*states[next_state])

        # Replace the row with a one-hot distribution on the best action
        # (argmax picks the lowest index on ties).
        best_action = np.argmax(temp_array)
        policy[s] = 0.
        policy[s, best_action] = 1.

        if old_action != best_action:
            policy_stable = False

    return policy_stable, states, policy


def policy_iteration(env_model, discount, theta=0.0001):
    """Run policy iteration on a tabular MDP, starting from a uniform policy.

    Alternates policy evaluation and greedy policy improvement until an
    improvement step leaves the policy unchanged.

    Args:
        env_model: Transition table where ``env_model[s][a]`` is an iterable
            of ``(transition_prob, next_state, reward, done)`` tuples. States
            and actions are expected to be indexed ``0..n-1``.
        discount: Discount factor applied to the value of the next state.
        theta: Convergence threshold forwarded to ``policy_evaluation``.

    Returns:
        A ``(states, policy)`` tuple: the 1-D array of state values and the
        2-D one-hot policy array of shape (number of states, number of actions).
    """
    # Derive the table sizes from the model that was passed in rather than
    # from the notebook-global `env`, so the arrays always match `env_model`.
    state_len = len(env_model)
    action_len = len(env_model[0])

    # Start from the uniform-random policy and all-zero value estimates.
    policy = np.ones((state_len, action_len))/action_len
    states = np.zeros(state_len)

    policy_stable = False

    while not policy_stable:

        states = policy_evaluation(states, policy, env_model, discount, theta)

        policy_stable, states, policy = policy_improvement(states, policy, env_model, discount)

    return states, policy

In [28]:
# FrozenLake with slipping disabled via is_slippery=False.
env = gym.make('FrozenLake-v0', is_slippery=False)

gamma = 0.99

# Read the transition table from the unwrapped environment (as done when the
# slippery FrozenLake was created) instead of relying on the wrapper to
# forward the attribute.
state_value_array, policy_array = policy_iteration(env.unwrapped.P, gamma)


print("Reshaped State Value Estimates with gamma of {}:".format(gamma))
print(np.round(state_value_array.reshape(4,4),3))
print("")

# The header previously contained a "{}" placeholder that was never
# formatted, so the braces were printed literally.
print("Reshaped Policy Array:")
print(np.round(policy_array.reshape(16,4),3))
print("")

Reshaped State Value Estimates with gamma of 0.99:
[[0.951 0.961 0.97  0.961]
 [0.961 0.    0.98  0.   ]
 [0.97  0.98  0.99  0.   ]
 [0.    0.99  1.    0.   ]]

Reshaped Policy Array {}:
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]



#### Policy Improvement Taxi

In [29]:
# Take five random actions in Taxi-v3, rendering the board before each step.
env = gym.make('Taxi-v3')
env.reset()

for i in range(5):
    env.render()
    state, reward, done, info = env.step(env.action_space.sample())
    if not done:
        continue
    # The episode ended on this step: start a fresh one before continuing.
    print("Episode is done, resetting the environment")
    env.reset()
env.close()

+---------+
|[35mR[0m: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|[35mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
