In [1]:
!pip install gym



In [2]:
!pip install spyder



In [199]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

In [200]:
# Register a deterministic (is_slippery=False) 4x4 FrozenLake variant.
# Re-running this cell registers the same id a second time; gym versions that
# reject duplicate ids signal that with gym.error.Error. Only that error is
# ignored so that any other failure (bad kwargs, interrupt, ...) still surfaces.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
except gym.error.Error:
    pass

# Environment to train on. The commented-out ids are alternatives kept for
# quick switching; exactly one assignment should be active.
# Note: QAgent reads env.observation_space.n, so it needs a discrete
# observation space.
#env_name = "CartPole-v1"
#env_name = "MountainCar-v0"
#env_name = "MountainCarContinuous-v0"
#env_name = "Acrobot-v1"
#env_name = "Pendulum-v0"
#env_name = "FrozenLake-v0"
env_name = "FrozenLakeNoSlip-v0"

env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Discrete(16)
Action space: Discrete(4)


In [201]:
class Agent():
    """Baseline agent that ignores the state and acts uniformly at random.

    Works with both discrete action spaces (an integer action is drawn) and
    continuous ones exposing ``low``, ``high`` and ``shape`` (an array is drawn).
    """

    def __init__(self, env):
        """Inspect ``env.action_space`` and cache what is needed to sample from it.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        # isinstance (instead of an exact type comparison) also recognises
        # subclasses of Discrete as discrete spaces.
        self.is_discrete = isinstance(env.action_space, gym.spaces.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns:
            An int in ``[0, action_size)`` for discrete spaces, otherwise an
            array of shape ``action_shape`` drawn uniformly between
            ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [202]:
class QAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy exploration.

    The Q-table is indexed as ``q_table[state, action]``, so the environment
    must expose ``observation_space.n`` and a discrete action space (the table
    width comes from ``action_size``, which the base class only sets for
    discrete action spaces).
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01,
                 eps = 1.0, eps_decay = 0.99):
        """Create the agent and its Q-table.

        Args:
            env: environment exposing ``action_space`` and ``observation_space.n``.
            discount_rate: factor applied to the best next-state value in the target.
            learning_rate: step size of each Q-table update.
            eps: initial probability of taking a random action.
            eps_decay: factor ``eps`` is multiplied by each time ``train`` sees
                an experience with ``done`` set.
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State Size: ", self.state_size)

        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.eps = eps
        self.eps_decay = eps_decay
        self.build_model()

    def build_model(self):
        """(Re)initialise the Q-table with small random values in [0, 1e-4)."""
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Pick an action epsilon-greedily for ``state``.

        With probability ``eps`` the random action from the base class is
        returned, otherwise the action with the largest Q-value for ``state``.
        """
        action_greedy = np.argmax(self.q_table[state])
        # The random action is drawn on every call, before the exploration
        # coin flip, so the order of RNG draws is the same on every call.
        action_random = super().get_action(state)
        if random.random() < self.eps:
            return action_random
        return action_greedy

    def train(self, experience):
        """Apply one Q-learning update and decay ``eps`` at episode end.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # When done is set there is no bootstrapping from next_state.
        best_next = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate * best_next

        # Move the stored value a learning_rate-sized step towards the target.
        td_error = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

        if done:
            self.eps = self.eps * self.eps_decay
              
# Build the Q-learning agent for `env` using QAgent's default hyperparameters.
# The constructor prints the action and state sizes it detected.
agent = QAgent(env)

Action size: 4
State Size:  16


In [210]:
# Train `agent` on `env`, redrawing the board and Q-table after every step.
#
# `agent` is created in another cell, so re-running this cell continues
# training the same Q-table and keeps decaying agent.eps; re-create the agent
# first for a fresh run.
# The loop relies on env.reset() returning the observation itself and on
# env.step() returning four values.
NUM_EPISODES = 100   # episodes played per run of this cell
STEP_DELAY = 0.05    # seconds to pause after each step so the output is readable

total_reward = 0
for ep in range(NUM_EPISODES):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward

        # Live view: latest transition, running totals, board, Q-table.
        print("state: ", state, "action: ", action)
        print("episode:{}, total reward:{}, eps:{}".format(ep, total_reward, agent.eps))
        env.render()
        print(agent.q_table)
        time.sleep(STEP_DELAY)
        # wait=True defers clearing until the next output arrives (no flicker).
        clear_output(wait=True)

state:  15 action:  2
episode:99, total reward:100.0, eps:0.00032222236288023367
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[7.17136971e-05 3.48844837e-01 3.98840013e-05 2.19641934e-04]
 [2.81043388e-05 3.62371239e-05 5.41210065e-05 2.06779888e-05]
 [3.44146719e-05 2.04599566e-04 3.23112357e-05 6.94984123e-05]
 [4.73332497e-05 3.27353223e-05 4.39672660e-05 8.81381328e-05]
 [5.03165640e-05 5.15443674e-01 1.72967607e-06 5.03120753e-05]
 [6.60546523e-05 8.81917999e-05 8.24786758e-05 1.81502932e-05]
 [7.05672414e-06 2.48846193e-03 8.88428941e-05 4.52745170e-05]
 [6.93050729e-06 9.80075207e-05 9.46698634e-05 2.51809054e-05]
 [1.76544670e-04 4.28894920e-05 6.87701158e-01 5.05497923e-05]
 [6.58589393e-05 7.46085443e-05 8.34225056e-01 8.62988469e-05]
 [8.94274223e-05 9.35200961e-01 4.37068298e-05 4.56033355e-05]
 [2.99385895e-05 7.31629616e-05 3.54613408e-05 1.67414350e-05]
 [4.08362121e-05 7.93796597e-05 1.43597027e-05 2.99861779e-05]
 [4.41864844e-05 4.53833512e-05 4.17030033e-05 9.03726058e-05