In [1]:
import sys
import random
import numpy as np

import gym
from randomwalk import RandomWalkEnv

## Explore The Environment

In [2]:
# Create the environment instance shared by every cell below
# (RandomWalkEnv comes from the local `randomwalk` module imported above).
env = RandomWalkEnv()

In [3]:
# Show one freshly reset environment, then describe its state and action spaces.
print('Environment Display:')
env.reset()  # start from a new state; the returned observation is not needed here
env.render()

# Single print call, one description per line.
print(
    "State Space {}".format(env.observation_space),
    "Action Space {}".format(env.action_space),
    sep="\n",
)

Environment Display:

[ABC[43mD[0mEFG]
State Space Discrete(9)
Action Space Discrete(2)


## Train The Agent

In [None]:
# --- Training configuration -------------------------------------------------
# Seed both random number generators used by the training loop (`random` for
# the explore/exploit coin flip, `np.random` for picking the exploratory
# action) so that repeated runs draw the same random numbers.
# NOTE(review): if RandomWalkEnv keeps its own RNG it must be seeded
# separately for fully reproducible runs — confirm against the env.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

num_episodes = 100000          # number of training episodes
num_steps_per_episode = 200    # upper bound on steps taken within one episode

epsilon = 0.3  # probability of taking a random (exploratory) action
alpha = 0.2    # learning rate: weight given to the new estimate in each update
gamma = 0.6    # discount factor applied to the next state's best value

In [None]:
# Tabular Q-learning.
# Q_table holds one row per state and one column per action; every estimate
# starts at zero and is refined from the transitions observed below.
Q_table = np.zeros([env.observation_space.n, env.action_space.n])

for i_episode in range(1, num_episodes+1):

    # monitor progress; the carriage return rewrites the same output line
    if i_episode % 100 == 0:
        print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
        sys.stdout.flush()

    # get initial state by restarting the environment
    state = env.reset()

    # the step cap bounds the episode length even if `done` is never reported
    for j_step in range(1, num_steps_per_episode+1):

        # apply epsilon-greedy policy
        if random.uniform(0, 1) < epsilon:
            action = np.random.choice(env.action_space.n) # explore action space
        else:
            action = np.argmax(Q_table[state]) # exploit learned values

        next_state, reward, done, _ = env.step(action)

        # apply Q-learning update rule:
        #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        # When the episode has ended there is no future return, so the
        # bootstrap term is dropped explicitly rather than relying on the
        # row of `next_state` still holding its initial zeros.
        current_value = Q_table[state][action]
        Qsa_next = 0.0 if done else np.max(Q_table[next_state])

        new_value = (1 - alpha) * current_value + (alpha * (reward + gamma * Qsa_next))
        Q_table[state][action] = new_value

        state = next_state

        if done:
            break
print("\nTraining finished.")

Episode 80700/100000

## Evaluate The Agent

In [None]:
# Number of evaluation episodes.
# NOTE: this reassigns the same `num_episodes` name that the training cell
# reads, so re-running the training cell after this one would train for only
# 100 episodes unless the training configuration cell is re-run first.
num_episodes = 100

In [None]:
# Evaluate the learned greedy policy (no exploration, no learning).
# Each episode's summed reward is stored in `collected_rewards`.
collected_rewards = []

for i_episode in range(1, num_episodes+1):

    print('Starting new episode...')
    state = env.reset()
    env.render()

    total_reward = 0
    j_step = 0  # keeps the summary print valid even with an empty step budget
    for j_step in range(1, num_steps_per_episode+1):

        # always act greedily with respect to the learned Q-values
        action = np.argmax(Q_table[state])
        next_state, reward, done, _ = env.step(action)

        env.render()
        total_reward += reward

        # Advance to the state just reached; without this the action would be
        # chosen from the episode's initial state on every step.
        state = next_state

        if done:
            break
    collected_rewards.append(total_reward)
    print(f'Episode {i_episode}, Total rewards: {total_reward}, After steps: {j_step}')

In [None]:
# Summarise the evaluation from the episodes actually recorded, so the figures
# stay correct if the evaluation loop was interrupted or `num_episodes` was
# changed after it ran.
episodes_played = len(collected_rewards)

# An episode counts as a win when its total reward is exactly 1.0
# (presumably the environment's goal reward — confirm against RandomWalkEnv).
wins = int((np.asarray(collected_rewards) == 1.0).sum())

# Avoid a ZeroDivisionError when no episode has been recorded yet.
average_score = sum(collected_rewards) / episodes_played if episodes_played else float('nan')

print(f'Wins {wins} of {episodes_played} episodes (Average score: {average_score}).')

---