In [1]:
import gym

In [2]:
env = gym.make("Taxi-v3").env

In [3]:
env.render()
# Blue represents pessenger current location
# Purple is destination
# Yellow is Taxi(agent)

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[35mY[0m| : |B: |
+---------+



### Actions
0 : south,
1 : north,
2 : east,
3 : west,
4 : picking up,
5 : dropping off

### States

5 * 5 * 5 * 4

In [4]:
state = env.encode(4,1,2,3)

In [5]:
print("State: ", state)
env.s = state
env.render()

State:  431
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m|[43m [0m: |[35mB[0m: |
+---------+



In [6]:
env.P[431]

{0: [(1.0, 431, -1, False)],
 1: [(1.0, 331, -1, False)],
 2: [(1.0, 451, -1, False)],
 3: [(1.0, 431, -1, False)],
 4: [(1.0, 431, -10, False)],
 5: [(1.0, 431, -10, False)]}

In [7]:
# env.reset()
env.render()
print("State: ", env.s)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m|[43m [0m: |[35mB[0m: |
+---------+

State:  431


In [8]:
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

Action Space Discrete(6)
State Space Discrete(500)


In [9]:
epochs = 0
env.s = 114

done = False
penalties, reward = 0,0
frames = []

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    
    if reward == -10:
        penalties += 1
    
    frames.append({'frame':env.render(mode = 'ansi'),'state':state,'action': action,'reward':reward})
    
    epochs += 1
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 2092
Penalties incurred: 655


In [10]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait = True)
        print(frame['frame'])
        print(f"Timestep :  {i+1}")
        print(f"State : {frame['state']}")
        print(f"Action : {frame['action']}")
        print(f"Reward : {frame['reward']}")
        
#         sleep(.1)


In [11]:
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep :  2092
State : 410
Action : 5
Reward : 20


In [12]:
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)


In [13]:
env.s

410

In [14]:
action = env.action_space.sample()
#state, reward, done, info = env.step(action)

In [15]:
action

0

In [16]:
#action = env.action_space.sample()
state, reward, done, info = env.step(action)

In [17]:
state

410

In [18]:
env.s


410

# Reinforcement Learning

In [19]:
import random
import numpy as np

In [20]:
# from playsound import playsound
# import os
import time

In [21]:
q_table = np.zeros([env.observation_space.n,env.action_space.n])

In [26]:
%%time
t = time.time()
"""Training the model"""

from IPython.display import clear_output

# Hyperparam
alpha = .1
gamma = .6
epsilon = .1

for i in range(100000):
    state = env.reset()
    
    epochs, penalties, reward = 0,0,0
    done = False
    
    while not done:
#         env.render()
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
            
        next_state, reward, done, info = env.step(action)
        
#         if state!=next_state:
        old_qvalue = q_table[state][action]
        new_value = (1-alpha)*old_qvalue + alpha*(reward + gamma*np.max(q_table[next_state]))
        q_table[state][action]=new_value

        if reward == -10:
            penalties += 1

        state = next_state
#         else:
#             q_table[state][action] -=1000
        epochs += 1
#         clear_output(wait=True)
        
    if i%100 == 0:
#         clear_output(wait = True)
        print(f"Episode: {i}")
#         os.system("play ./herewego.wav")
        print(time.time()-t)
#         sleep(.5)
        

print("Training Finished")
# os.system("play ./herewego.wav")

Episode: 0
0.0011289119720458984
Episode: 100
0.09146595001220703
Episode: 200
0.13887572288513184
Episode: 300
0.19378662109375
Episode: 400
0.24336743354797363
Episode: 500
0.28818345069885254
Episode: 600
0.3398282527923584
Episode: 700
0.3803069591522217
Episode: 800
0.4387552738189697
Episode: 900
0.4747920036315918
Episode: 1000
0.5230007171630859
Episode: 1100
0.5812709331512451
Episode: 1200
0.6405322551727295
Episode: 1300
0.6957046985626221
Episode: 1400
0.7514665126800537
Episode: 1500
0.8068349361419678
Episode: 1600
0.854292631149292
Episode: 1700
0.9183928966522217
Episode: 1800
0.9751088619232178
Episode: 1900
1.0132529735565186
Episode: 2000
1.0677270889282227
Episode: 2100
1.1236190795898438
Episode: 2200
1.1745469570159912
Episode: 2300
1.2192227840423584
Episode: 2400
1.260934829711914
Episode: 2500
1.3147292137145996
Episode: 2600
1.3671824932098389
Episode: 2700
1.4181110858917236
Episode: 2800
1.4744312763214111
Episode: 2900
1.5315465927124023
Episode: 3000
1.589

Episode: 25700
13.068373918533325
Episode: 25800
13.147673845291138
Episode: 25900
13.221060037612915
Episode: 26000
13.281158447265625
Episode: 26100
13.336726903915405
Episode: 26200
13.383452892303467
Episode: 26300
13.433730363845825
Episode: 26400
13.485979557037354
Episode: 26500
13.552465438842773
Episode: 26600
13.622461080551147
Episode: 26700
13.676159381866455
Episode: 26800
13.725948572158813
Episode: 26900
13.782257795333862
Episode: 27000
13.831435680389404
Episode: 27100
13.884799003601074
Episode: 27200
13.939462661743164
Episode: 27300
13.991119861602783
Episode: 27400
14.043752193450928
Episode: 27500
14.126748085021973
Episode: 27600
14.209956169128418
Episode: 27700
14.274141073226929
Episode: 27800
14.32311224937439
Episode: 27900
14.372957944869995
Episode: 28000
14.431387901306152
Episode: 28100
14.48432731628418
Episode: 28200
14.541827917098999
Episode: 28300
14.598583698272705
Episode: 28400
14.646730422973633
Episode: 28500
14.69562029838562
Episode: 28600
14

Episode: 50700
25.66719341278076
Episode: 50800
25.717385292053223
Episode: 50900
25.77010178565979
Episode: 51000
25.827741146087646
Episode: 51100
25.870851755142212
Episode: 51200
25.920675039291382
Episode: 51300
25.95810842514038
Episode: 51400
26.000146627426147
Episode: 51500
26.057992935180664
Episode: 51600
26.119763612747192
Episode: 51700
26.17358708381653
Episode: 51800
26.218220472335815
Episode: 51900
26.25658130645752
Episode: 52000
26.305662155151367
Episode: 52100
26.356373071670532
Episode: 52200
26.40565013885498
Episode: 52300
26.456655263900757
Episode: 52400
26.506543397903442
Episode: 52500
26.561031579971313
Episode: 52600
26.618958711624146
Episode: 52700
26.66911482810974
Episode: 52800
26.729119300842285
Episode: 52900
26.778780698776245
Episode: 53000
26.828882694244385
Episode: 53100
26.88124179840088
Episode: 53200
26.92807388305664
Episode: 53300
26.981699228286743
Episode: 53400
27.03909158706665
Episode: 53500
27.090234518051147
Episode: 53600
27.140141

Episode: 75500
38.022852659225464
Episode: 75600
38.08956980705261
Episode: 75700
38.14361548423767
Episode: 75800
38.20061492919922
Episode: 75900
38.24247717857361
Episode: 76000
38.27790117263794
Episode: 76100
38.32525062561035
Episode: 76200
38.36595129966736
Episode: 76300
38.4133415222168
Episode: 76400
38.46111488342285
Episode: 76500
38.508180379867554
Episode: 76600
38.55704617500305
Episode: 76700
38.605732917785645
Episode: 76800
38.65469431877136
Episode: 76900
38.70319128036499
Episode: 77000
38.75103569030762
Episode: 77100
38.79952001571655
Episode: 77200
38.8645920753479
Episode: 77300
38.922876596450806
Episode: 77400
38.973984718322754
Episode: 77500
39.02973031997681
Episode: 77600
39.08144998550415
Episode: 77700
39.13091802597046
Episode: 77800
39.18396210670471
Episode: 77900
39.244166135787964
Episode: 78000
39.301575899124146
Episode: 78100
39.353060245513916
Episode: 78200
39.390546798706055
Episode: 78300
39.424880027770996
Episode: 78400
39.478068113327026
E

0

In [27]:
q_table[4]

array([   -2.4961915 , -1745.88959936,    -2.4961915 ,  -849.78604219,
        -120.67446919, -1027.97348676])

# Evaluation

In [61]:
epoch, penalties = 0.0,0
n = 1000

for i in range(n):
    env.reset()
    done = False
    state = env.s
    
    while not done:
        action = np.argmax(q_table[state])
        state,reward,done,info = env.step(action)
        
        epoch += 1
        if reward == -10:
            penalties += 1
        
print(f"Total Penalties: {penalties}")
print(f"Average Epoch: {epoch/n}")


Total Penalties: 0
Average Epoch: 12.953


In [62]:
state

410