## Семинар 13: "Обучение с подкреплением 1"

ФИО: Иванов Максим Юрьевич


###  FrozenLake


<img src="http://vignette2.wikia.nocookie.net/riseoftheguardians/images/4/4c/Jack's_little_sister_on_the_ice.jpg/revision/latest?cb=20141218030206" alt="a random image to attract attention" style="width: 400px;"/>



In [None]:
import gym
import numpy as np 

# Create a single FrozenLake game instance; it is bound to the module-level
# name `env`, which the following cells use.
env = gym.make("FrozenLake-v0")

# Start a new game. The trailing semicolon keeps the notebook from echoing
# the observation returned by reset().
env.reset();

In [None]:
# Display the current game state as a text grid; the agent's cell is highlighted.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


### legend

![img](https://cdn-images-1.medium.com/max/800/1*MCjDzR-wfMMkS0rPqXSmKw.png)

### Gym interface

The three main methods of an environment are
* __reset()__ - reset environment to initial state, _return first observation_
* __render()__ - show current environment state (a more colorful version :) )
* __step(a)__ - commit action __a__ and return (new observation, reward, is done, info)
 * _new observation_ - an observation right after committing the action __a__
 * _reward_ - a number representing your reward for committing action __a__
 * _is done_ - True if the MDP has just finished, False if still in progress
 * _info_ - some auxiliary stuff about what just happened. Ignore it for now

In [None]:
# Reset the game and print the integer code of the initial observation.
print("initial observation code:", env.reset())
print('printing observation:')
env.render()
# Both spaces are discrete; `.n` is the number of distinct observations / actions.
print("observations:", env.observation_space, 'n=', env.observation_space.n)
print("actions:", env.action_space, 'n=', env.action_space.n)

('initial observation code:', 0)
printing observation:

[41mS[0mFFF
FHFH
FFFH
HFFG
('observations:', Discrete(16), 'n=', 16)
('actions:', Discrete(4), 'n=', 4)


In [None]:
# Take one step with action code 2 and unpack the 4-tuple returned by step();
# the trailing info element is discarded.
# NOTE: transitions are stochastic (see env.env.P), so the new state is not
# guaranteed to be the cell to the right.
print("taking action 2 (right)")
new_obs, reward, is_done, _ = env.step(2)
print("new observation code:", new_obs)
print("reward:", reward)
print("is game over?:", is_done)
print("printing new state:")
# Show the board after the move.
env.render()

taking action 2 (right)
('new observation code:', 4)
('reward:', 0.0)
('is game over?:', False)
printing new state:
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG


In [None]:
# Map human-readable move names to the integer action codes passed to
# env.step(); a name's position in the tuple is its code.
action_to_i = {
    name: code for code, name in enumerate(('left', 'down', 'right', 'up'))
}

### Попробуйте походить по замерзшему озеру, не упав в дырку. 
* Каждый шаг вы с вероятностью __1/3__ будете двигаться в выбранном направлении и с вероятностью __1/3__ — в каждом из двух перпендикулярных ему направлений.
* Если упадете, используйте __env.reset()__ чтобы перезапустить __env__

In [None]:
# Restart the episode; reset() returns the initial observation, which the
# notebook echoes below.
env.reset()

0

In [None]:
# Attempt a move to the right (action code looked up by name), then show the
# resulting board.
env.step(action_to_i['right'])
env.render()

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


## Задание: 
Следуя шаблонам функций, реализуйте алгоритм Policy iteration.
Протестируйте его с помощью функции __evaluate_policy__.
Попробуйте разные значения gamma и сравните результаты.

In [None]:
def run_episode(env, policy, gamma=1.0, render=False):
    """Play one episode following ``policy`` and return its discounted return.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``render()``; ``step`` must return a 4-tuple
            ``(observation, reward, done, info)``.
        policy: Indexable by observation; ``policy[obs]`` is the action to
            take. It is converted with ``int`` before being passed to
            ``env.step``.
        gamma: Discount factor; the reward of step ``t`` (0-based) is
            weighted by ``gamma ** t``.
        render: If true, ``env.render()`` is called before every step.

    Returns:
        The sum of discounted rewards collected until ``done`` is true.
    """
    state = env.reset()
    discounted_return = 0
    steps_taken = 0
    finished = False
    while not finished:
        if render:
            env.render()
        action = int(policy[state])
        state, step_reward, finished, _ = env.step(action)
        discounted_return += gamma ** steps_taken * step_reward
        steps_taken += 1
    return discounted_return

In [None]:
def evaluate_policy(env, policy, gamma=1.0, n=100):
    """Estimate the quality of ``policy`` as its mean return over ``n`` episodes.

    Args:
        env: Environment passed through to ``run_episode``.
        policy: Policy passed through to ``run_episode``.
        gamma: Discount factor applied when accumulating each episode's return.
        n: Number of independent episodes to play.

    Returns:
        The arithmetic mean of the ``n`` episode returns.
    """
    episode_returns = np.array(
        [run_episode(env, policy, gamma=gamma, render=False) for _ in range(n)]
    )
    return episode_returns.mean()

In [None]:
# Inspect the transition model for state 14: for each action code it holds a
# list of (probability, next state, reward, terminal flag) tuples.
env.env.P[14]

{0: [(0.3333333333333333, 10, 0.0, False),
  (0.3333333333333333, 13, 0.0, False),
  (0.3333333333333333, 14, 0.0, False)],
 1: [(0.3333333333333333, 13, 0.0, False),
  (0.3333333333333333, 14, 0.0, False),
  (0.3333333333333333, 15, 1.0, True)],
 2: [(0.3333333333333333, 14, 0.0, False),
  (0.3333333333333333, 15, 1.0, True),
  (0.3333333333333333, 10, 0.0, False)],
 3: [(0.3333333333333333, 15, 1.0, True),
  (0.3333333333333333, 10, 0.0, False),
  (0.3333333333333333, 13, 0.0, False)]}

In [None]:
# Given a fixed value function, derive the greedy policy.

def extract_policy(v, gamma=1.0):
    """Return the policy that is greedy with respect to the value function ``v``.

    NOTE: the transition model is read from the module-level ``env``
    (``env.env.P``, ``env.env.nS``, ``env.action_space.n``), not from a
    parameter.

    Args:
        v: Indexable by state; ``v[s]`` is the current value estimate of ``s``.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        A float array of length ``env.env.nS`` holding, for every state, the
        action with the largest one-step look-ahead value (``np.argmax`` picks
        the lowest action code on ties).
    """
    model = env.env
    greedy_actions = np.zeros(model.nS)
    action_values = np.zeros((model.nS, env.action_space.n))
    for s, per_action in model.P.items():
        for a, outcomes in per_action.items():
            # Expected immediate reward plus discounted value of the successor.
            for p, s_next, r, _ in outcomes:
                action_values[s, a] += p * r + p * gamma * v[s_next]
        greedy_actions[s] = np.argmax(action_values[s, :])
    return greedy_actions

In [None]:
# Given a fixed policy, estimate its value function to a tolerance of 1e-10.

def compute_policy_v(env, policy, gamma=1.0):
    """Evaluate ``policy`` by repeated sweeps over the known transition model.

    Every sweep recomputes each state's value from the previous sweep's
    values, using the transitions ``env.env.P[state][policy[state]]``.
    Iteration stops once the Euclidean norm of the change between two
    consecutive sweeps is no larger than ``1e-10``.

    Args:
        env: Object whose ``env`` attribute exposes ``nS`` (number of states)
            and ``P`` (``P[state][action]`` is an iterable of
            ``(prob, next_state, reward, _)`` tuples).
        policy: Indexable by state; ``policy[s]`` is the action taken in ``s``.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        A float array of length ``env.env.nS`` with the estimated state values.
    """
    model = env.env
    values = np.zeros(model.nS)
    # Start from a different vector so that the first sweep is always run.
    previous = np.ones(model.nS)
    while np.linalg.norm(values - previous) > 1e-10:
        previous = values.copy()
        for s, per_action in model.P.items():
            outcomes = per_action[policy[s]]
            values[s] = np.sum(
                [p * (r + gamma * previous[s_next]) for p, s_next, r, _ in outcomes]
            )
    return values

In [None]:
def policy_iteration(env, gamma=1.0, max_iterations=100000):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Starts from a uniformly random policy, evaluates it with
    ``compute_policy_v`` and replaces it with the greedy policy from
    ``extract_policy``. It stops as soon as an improvement step leaves the
    policy unchanged, or after ``max_iterations`` rounds.

    NOTE: ``extract_policy`` reads the module-level ``env`` rather than the
    argument given here, so ``env`` should be the same object that is bound
    to that global name.

    Args:
        env: Object whose ``env`` attribute exposes ``nA`` and ``nS`` (numbers
            of actions and states) plus whatever ``compute_policy_v`` requires.
        gamma: Discount factor used for both evaluation and improvement.
        max_iterations: Upper bound on evaluation/improvement rounds.

    Returns:
        An array of length ``env.env.nS`` with one action per state.
    """
    policy = np.random.choice(np.arange(env.env.nA), env.env.nS)
    for _ in range(max_iterations):
        policy_v = compute_policy_v(env, policy, gamma)
        new_policy = extract_policy(policy_v, gamma)
        # A fixed point of greedy improvement: nothing left to improve.
        if np.all(policy == new_policy):
            break
        policy = new_policy
    return policy

In [None]:
# Sweep ten evenly spaced discount factors in [0.01, 1]: run policy iteration
# for each and report the mean return of the resulting policy over 100 episodes.
# NOTE: the score is discounted with the same gamma that was used for planning,
# so the printed numbers are not on a common scale across gamma values.
for gamma in np.linspace(0.01, 1, 10):
    # Rebinds the module-level `env`, which extract_policy reads directly.
    env = gym.make("FrozenLake-v0")
    optimal_policy = policy_iteration(env, gamma=gamma)
    policy_score = evaluate_policy(env, optimal_policy, gamma, n=100)

    print(gamma, policy_score)

(0.01, 2.000001020102021e-18)
(0.12, 3.082947052900832e-08)
(0.23, 3.767846151938489e-07)
(0.34, 6.23631901707355e-05)
(0.45, 0.0001263003025203008)
(0.56, 0.0005809434610856999)
(0.67, 0.001966552462138066)
(0.78, 0.006675637703661324)
(0.89, 0.06313866775959939)
(1.0, 0.76)
