In [1]:
import sys
import random
import numpy as np
from time import sleep
from IPython.display import clear_output

import gym
from randomwalk import RandomWalkEnv

## Explore The Environment

In [2]:
env = RandomWalkEnv(n_states=21)

In [3]:
print('Environment Display:')
env.reset() # reset environment to a new, random state
env.render()

print("State Space {}".format(env.observation_space))
print("Action Space {}".format(env.action_space))

Environment Display:

[ABCDEFGHIJ[43mK[0mLMNOPQRSTU]
State Space Discrete(23)
Action Space Discrete(2)


## Train The Agent

In [4]:
num_episodes = 60000
num_steps_per_episode = 200

epsilon = 0.2
alpha = 0.9
gamma = 0.9

In [5]:
def manipulate_the_rewards(reward):
    
    if reward == 0: reward = -1.0
    elif reward == 1.0: reward = +20.0
    elif reward == -1.0: reward = -20.0
    
    return reward                

In [6]:
Q_table = np.zeros([env.observation_space.n, env.action_space.n])

for i_episode in range(1, num_episodes+1):
    
    # monitor progress
    if i_episode % 100 == 0:
        print('\rEpisode {i_episode}/{num_episodes}', end="")
        sys.stdout.flush()
        
    # get initial state by restarting the environment
    state = env.reset()
    
    total_reward = 0
    done = False
    for j_step in range(1, num_steps_per_episode+1):
        
        # apply epsilon-greedy policy
        if random.uniform(0, 1) < epsilon:
            action = np.random.choice(env.action_space.n) # explore action space
        else:
            action = np.argmax(Q_table[state]) # exploit learned values
            
        next_state, reward, done, _ = env.step(action)
        
        # scratchly manipulate the rewards to force behaviour
        reward = manipulate_the_rewards(reward)
        
        # apply Q-learning update rule
        current_value = Q_table[state][action]
        Qsa_next = np.max(Q_table[next_state])
        
        new_value = (1 - alpha) * current_value + (alpha * (reward + gamma * Qsa_next))
        Q_table[state][action] = new_value
        
        state = next_state
        
        if done:
            break
print("\nTraining finished.")

Episode 60000/60000
Training finished.


## Evaluate The Agent

In [7]:
num_episodes = 100

In [8]:
averaged_steps = []
collected_rewards = []

for i_episode in range(1, num_episodes+1):
    
    print('Starting new episode...')
    state = env.reset()
    env.render()
    
    total_step = 0
    total_reward = 0
    done = False
    
    while not done:
        action = np.argmax(Q_table[state])
        next_state, reward, done, _ = env.step(action)
        
        # render the env
        clear_output(wait=True)
        print(f'Episode: {i_episode}/{num_episodes}, Total step: {total_step+1}')
        print(f'Wins: {(np.array(collected_rewards) == 1.0).sum()} (Average score: {round(sum(collected_rewards)/i_episode, 2)})')
        env.render()
        sleep(.1)
        
        total_step +=1
        total_reward += reward
            
        if done:
            break
            
    averaged_steps.append(total_step)
    collected_rewards.append(total_reward)
    print(f'Episode {i_episode}, Collected rewards: {(np.array(collected_rewards) == 1.0).sum()}, Average steps: {np.average(averaged_steps)}')
    print(f'Evaluation finished.')

Episode: 100/100, Total step: 277
Wins: 48 (Average score: 0.48)

[ABCDEFGHIJKLMNOPQRSTU[42m][0m
Episode 100, Collected rewards: 49, Average steps: 124.24
Evaluation finished.


---