In [1]:
import gym
from gym import envs

In [2]:
import numpy as np
import time
from IPython.display import clear_output

In [3]:
from typing import Dict
class QTable:
    """Tabular Q-learning agent for discrete observation and action spaces.

    The table has one row per observation and one column per action. The
    environment passed to the methods below is expected to follow the classic
    Gym protocol: ``reset()`` returns the initial observation and
    ``step(action)`` returns ``(observation, reward, done, info)``.
    """

    # Default cap on the number of steps played in a single episode.
    MAX_TURNS = 200

    def __init__(self, env):
        """Create an all-zero table sized from ``env``'s spaces.

        Args:
            env: Environment exposing ``observation_space.n`` and
                ``action_space.n``.
        """
        self.n_observations = env.observation_space.n
        self.n_actions = env.action_space.n
        self.table = np.zeros((self.n_observations, self.n_actions))

    def get_action(self, state, epsilon=0.0):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Row index into the table.
            epsilon: Probability of returning a uniformly random action
                instead of the greedy one.

        Returns:
            The chosen action index. Ties between equally valued actions are
            resolved by ``np.argmax`` (lowest index wins).
        """
        # The random draw happens even when epsilon is 0 so that the global
        # NumPy random stream advances identically for every call.
        if np.random.rand() < epsilon:
            return np.random.randint(0, self.n_actions)
        return np.argmax(self.table[state])

    def update_table(self, prev_state, next_state, action, reward, alpha=0.1, gamma=0.5):
        """Apply one Q-learning update for an observed transition.

        Implements
        ``Q[s, a] += alpha * (reward + gamma * max(Q[s']) - Q[s, a])``.
        The target always bootstraps from ``next_state``, including on the
        last step of an episode.

        Args:
            prev_state: State the action was taken in.
            next_state: State reached after the action.
            action: Action index that was taken.
            reward: Reward received for the transition.
            alpha: Learning rate.
            gamma: Discount factor.
        """
        q = self.table
        td_target = reward + gamma * np.max(q[next_state])
        q[prev_state, action] += alpha * (td_target - q[prev_state, action])

    def evaluate(self, env, n, max_turns=MAX_TURNS):
        """Play ``n`` greedy episodes and return the average total reward.

        Args:
            env: Environment to play in.
            n: Number of episodes.
            max_turns: Maximum number of steps per episode.

        Returns:
            ``np.average`` of the per-episode total rewards.
        """
        scores = []
        for _ in range(n):
            # Track the state through the public reset()/step() return values
            # rather than reaching into the wrapped environment's attributes.
            state = env.reset()
            total_score = 0
            for _turn in range(max_turns):
                action = self.get_action(state, 0.0)
                state, reward, done, info = env.step(action)
                total_score += reward
                if done:
                    break
            scores.append(total_score)
        return np.average(scores)

    def animate(self, env, delay=0.5, max_turns=MAX_TURNS):
        """Render one greedy episode frame by frame.

        Args:
            env: Environment to play in; must support ``render()``.
            delay: Seconds to pause after each rendered frame.
            max_turns: Maximum number of steps to play.
        """
        state = env.reset()
        env.render()
        time.sleep(delay)
        for turn in range(max_turns):
            # wait=True defers clearing until the next frame is printed.
            clear_output(True)
            action = self.get_action(state, 0.0)
            state, reward, done, info = env.step(action)
            env.render()
            print(turn, action)
            if done:
                break
            time.sleep(delay)

    @classmethod
    def learn_from_env(cls, env, n, alpha, gamma, epsilon, max_turns=MAX_TURNS, log_every=1000):
        """Train a fresh table with epsilon-greedy Q-learning.

        Args:
            env: Environment to learn from.
            n: Number of training episodes.
            alpha: Learning rate passed to ``update_table``.
            gamma: Discount factor passed to ``update_table``.
            epsilon: Exploration probability passed to ``get_action``.
            max_turns: Maximum number of steps per episode.
            log_every: Print ``episode total_score`` every this many
                episodes; a falsy value disables the progress output.

        Returns:
            The trained table, an instance of ``cls``.
        """
        # Instantiate via cls so that subclasses get an instance of themselves.
        table = cls(env)
        for episode in range(n):
            state = env.reset()
            total_score = 0
            for _turn in range(max_turns):
                action = table.get_action(state, epsilon=epsilon)
                next_state, reward, done, info = env.step(action)
                total_score += reward
                table.update_table(state, next_state, action, reward, alpha=alpha, gamma=gamma)
                state = next_state
                if done:
                    break
            if log_every and episode % log_every == 0:
                print(episode, total_score)
        return table

In [4]:
# np.random.seed only seeds NumPy's global generator, which QTable.get_action
# draws from. The environment presumably keeps its own generator, so seed it
# too; otherwise re-running this cell is not reproducible.
# NOTE(review): Env.seed is the classic Gym API — confirm it exists in the installed version.
SEED = 333
np.random.seed(SEED)
env = gym.make('Taxi-v3')
env.seed(SEED)
table = QTable.learn_from_env(env, n=100000, alpha=0.1, gamma=0.5, epsilon=0.2)



0 -533
1000 0
2000 -6
3000 -16
4000 1
5000 7
6000 -31
7000 -6
8000 -24
9000 -7
10000 9
11000 -29
12000 3
13000 8
14000 -10
15000 -4
16000 5
17000 13
18000 4
19000 3
20000 -21
21000 -3
22000 -2
23000 -3
24000 -25
25000 -5
26000 -2
27000 -8
28000 3
29000 -5
30000 7
31000 5
32000 12
33000 -6
34000 1
35000 8
36000 -1
37000 -13
38000 -2
39000 0
40000 -17
41000 -4
42000 -12
43000 1
44000 -14
45000 7
46000 -19
47000 -13
48000 -3
49000 -24
50000 -37
51000 0
52000 2
53000 9
54000 -10
55000 -5
56000 -5
57000 -5
58000 0
59000 -12
60000 -2
61000 -7
62000 8
63000 -16
64000 5
65000 8
66000 3
67000 2
68000 4
69000 -5
70000 -11
71000 -16
72000 1
73000 4
74000 -8
75000 10
76000 8
77000 -15
78000 -22
79000 9
80000 -6
81000 -18
82000 -9
83000 9
84000 10
85000 -18
86000 -14
87000 11
88000 -11
89000 -12
90000 4
91000 8
92000 9
93000 6
94000 14
95000 -17
96000 -25
97000 -7
98000 4
99000 3


In [5]:
# Replay 200 episodes back to back, clearing the previous episode's output
# before each new one starts.
for _episode in range(200):
    clear_output(wait=True)
    table.animate(env)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)
14 5


In [12]:
# Average total reward of the learned table over 1000 evaluation episodes.
table.evaluate(env, n=1000)

7.875