In [1]:
import sys
import random
import numpy as np

import gym
from randomwalk import RandomWalkEnv

## Explore The Environment

In [2]:
env = RandomWalkEnv()

In [3]:
print('Environment Display:')
env.reset() # reset environment to a new, random state
env.render()

print("State Space {}".format(env.observation_space))
print("Action Space {}".format(env.action_space))

Environment Display:

[ABC[43mD[0mEFG]
State Space Discrete(9)
Action Space Discrete(2)


## Train The Agent

In [4]:
num_episodes = 100000
num_steps_per_episode = 200

epsilon = 0.3
alpha = 0.2
gamma = 0.6

In [5]:
Q_table = np.zeros([env.observation_space.n, env.action_space.n])

for i_episode in range(1, num_episodes+1):
    
    # monitor progress
    if i_episode % 100 == 0:
        print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
        sys.stdout.flush()
        
    # get initial state by restarting the environment
    state = env.reset()
    
    total_reward = 0
    done = False
    for j_step in range(1, num_steps_per_episode+1):
        
        # apply epsilon-greedy policy
        if random.uniform(0, 1) < epsilon:
            action = np.random.choice(env.action_space.n) # explore action space
        else:
            action = np.argmax(Q_table[state]) # exploit learned values
            
        next_state, reward, done, _ = env.step(action)
        
        # apply Q-learning update rule
        current_value = Q_table[state][action]
        Qsa_next = np.max(Q_table[next_state])
        
        new_value = (1 - alpha) * current_value + (alpha * (reward + gamma * Qsa_next))
        Q_table[state][action] = new_value
        
        state = next_state
        
        if done:
            break
print("\nTraining finished.")

Episode 100000/100000
Training finished.


## Evaluate The Agent

In [6]:
num_episodes = 100

In [7]:
collected_rewards = []

for i_episode in range(1, num_episodes+1):
    
    print('Starting new episode...')
    state = env.reset()
    env.render()
    
    total_reward = 0
    done = False
    for j_step in range(1, num_steps_per_episode+1):
        
        action = np.argmax(Q_table[state])
        next_state, reward, done, _ = env.step(action)
        
        env.render()
        total_reward += reward
        
        if done:
            break
    collected_rewards.append(total_reward)
    print(f'Episode {i_episode}, Total rewards: {total_reward}, After steps: {j_step}')

Starting new episode...

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDEFG[42m][0m
Episode 1, Total rewards: 1.0, After steps: 12
Starting new episode...

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABC

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG]

[41m[[0mABCDEFG]
Episode 38, Total rewards: 0.0, After steps: 26
Starting new episode...

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG]

[41m[[0mABCDEFG]
Episode 39, Total rewards: 0.0, After steps: 8
Starting new episode...

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDEFG[42m][0m
Episode 40, Total rewards: 1.0, After steps: 10
Starting new episode...

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD


[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG]

[41m[[0mABCDEFG]
Episode 58, Total rewards: 0.0, After steps: 12
Starting new episode...

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDEFG[42m][0m
Episode 59, Total rewards: 1.0, After steps: 18
Starting new episode...

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG]

[41m[[0mABCDEFG]
Episode 60, Tota

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG]

[41m[[0mABCDEFG]
Episode 81, Total rewards: 0.0, After steps: 16
Starting new episode...

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCDEF[43mG[0m]

[ABCDEFG[42m][0m
Episode 82, Total rewards: 1.0, After steps: 4
Starting new episode...

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABCDE[43mF[0mG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG]

[A[43mB[0mCDEFG]

[AB[43mC[0mDEFG]

[ABC[43mD[0mEFG]

[ABCD[43mE[0mFG]

[ABC[43mD[0mEFG]

[AB[43mC[0mDEFG]

[A[43mB[0mCDEFG]

[[43mA[0mBCDEFG

In [8]:
print(f'Wins {(np.array(collected_rewards) == 1.0).sum()} of {num_episodes} episodes (Average score: {sum(collected_rewards) / num_episodes}).')

Wins 51 of 100 episodes (Average score: 0.51).


---