In [1]:
import gym
import numpy as np

In [2]:
env = gym.make("Taxi-v2").env

env.render()

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [3]:
env.reset()
env.render()

+---------+
|[35mR[0m: | : :G|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [17]:
state = env.encode(1, 4, 2, 0)

In [18]:
env.s = state
env.render()

+---------+
|[35mR[0m: | : :G|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [20]:
env.P[state]

{0: [(1.0, 288, -1, False)],
 1: [(1.0, 88, -1, False)],
 2: [(1.0, 188, -1, False)],
 3: [(1.0, 168, -1, False)],
 4: [(1.0, 188, -10, False)],
 5: [(1.0, 188, -10, False)]}

In [86]:
import random
from IPython.display import clear_output

env.s = state

all_epochs = []
all_penalties = []
max_epochs = 2000

q_table = np.zeros([env.observation_space.n,env.action_space.n])

done = False

alpha = 0.2
gamma = 0.8
epsilon = 0.2

for i in range(1,max_epochs):
    state = env.reset()
    epochs, penalties, reward = 0,0,0
    done = False
    while not done:
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
        
        next_state,reward,done,info = env.step(action)
        
        old_value = q_table[state,action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1-alpha)*old_value + alpha*(reward + gamma*next_max)
        q_table[state,action] = new_value
        
        if reward == -10:
            penalties +=1

        state = next_state
        epochs += 1
        
        if i%100 == 0:
            clear_output(wait=True)
            print("Episode: " + str(i))
            print("We had a total of ", str(penalties) + " penalties")
            print("We used ", str(epochs) + " epochs")
        
        #print("Training finished.\n")


Episode: 1900
We had a total of  0 penalties
We used  11 epochs


In [87]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(str(frame['frame']))
        print("Timestep: " + str(i+1))
        print("State: " + str(frame['state']))
        print("Action: " + str(frame['action']))
        print("Reward: " + str(frame['reward']))
        sleep(.1)

In [90]:
total_epochs, total_penalties = 0, 0
episodes = 10

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    frames = []
    done = False
    
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        if reward == -10:
            penalties += 1

        epochs += 1
        
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
            }
        )
    print_frames(frames)
    total_penalties += penalties
    total_epochs += epochs

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 10
State: 85
Action: 5
Reward: 20


KeyboardInterrupt: 

In [77]:
total_penalties

0

In [114]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer,Dense
model = Sequential()
model.add(InputLayer(batch_input_shape=(1, 500)))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(6, activation='linear'))
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [122]:
env[312]

TypeError: 'TaxiEnv' object does not support indexing

In [115]:
y = 0.95 #gamma
eps = 0.5
decay_factor = 0.999
r_avg_list = []
num_episodes = 1000
for i in range(num_episodes):
    s = env.reset()
    eps *= decay_factor
    
    if i%100 == 0:
        print("Episode {} of {}".format(i + 1, num_episodes))
    done = False
    r_sum = 0
    while not done:
        if random.uniform(0,1)<eps:
            #a = random.randint(0,2) 
            a = env.action_space.sample()
        else:
            a = np.argmax(model.predict([s:s+1]))
        new_s, r, done , _ = env.step(a)
        target = r + y*np.max(model.predict(np.identity(500)[new_s:new_s + 1]))
        target_vec = model.predict(np.identity(500)[s:s + 1])[0]
        ipdb.set_trace()
        target_vec[a] = target
        model.fit(np.identity(500)[s:s + 1], target_vec.reshape(-1, 6), epochs=1, verbose=0)
        s = new_s
        r_sum += r
    r_avg_list.append(r_sum / 1000)

Episode 1 of 1000
> [0;32m<ipython-input-115-0d58484e4550>[0m(24)[0;36m<module>[0;34m()[0m
[0;32m     23 [0;31m        [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 24 [0;31m        [0mtarget_vec[0m[0;34m[[0m[0ma[0m[0;34m][0m [0;34m=[0m [0mtarget[0m[0;34m[0m[0m
[0m[0;32m     25 [0;31m        [0mmodel[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0midentity[0m[0;34m([0m[0;36m500[0m[0;34m)[0m[0;34m[[0m[0ms[0m[0;34m:[0m[0ms[0m [0;34m+[0m [0;36m1[0m[0;34m][0m[0;34m,[0m [0mtarget_vec[0m[0;34m.[0m[0mreshape[0m[0;34m([0m[0;34m-[0m[0;36m1[0m[0;34m,[0m [0;36m6[0m[0;34m)[0m[0;34m,[0m [0mepochs[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0m
[0m
ipdb> model.predict(np.identity(500)[s:s+1])
array([[-0.5831442 ,  0.38320902,  0.0811737 , -0.5032136 , -0.71839315,
        -0.24581699]], dtype=float32)
ipd

BdbQuit: 

In [105]:
s = env2.reset()
s

0

In [104]:
np.identity(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])