In [1]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np 
import time, math, random
from typing import Tuple

# import gym 
import gym

In [2]:
# Create the CartPole environment instance that the rest of the notebook acts on.
env = gym.make("CartPole-v1")

In [3]:
# Number of buckets per discretized feature, in the order (angle, pole_velocity)
# used by `discretizer`.
n_bins = (6, 12)

# Value range spanned by the buckets of each feature. The first entry comes from
# index 2 of the environment's observation bounds; the second is fixed at
# +/- 50 degrees converted to radians.
lower_bounds = [env.observation_space.low[2], -math.radians(50)]
upper_bounds = [env.observation_space.high[2], math.radians(50)]

# Fitted KBinsDiscretizer, built on first use and then reused. Re-running this
# cell resets it, which is required after changing n_bins or the bounds.
_fitted_discretizer = None


def discretizer( _ , __ , angle, pole_velocity ) -> Tuple[int,...]:
    """Convert a continuous state into a discrete state.

    The first two positional arguments are accepted but ignored; only ``angle``
    and ``pole_velocity`` are bucketed.

    The underlying ``KBinsDiscretizer`` is fitted on ``[lower_bounds,
    upper_bounds]`` only once (on the first call) instead of on every call:
    the fit inputs are constants, so refitting on each environment step only
    repeated identical work.

    Returns:
        A tuple with one integer bucket index per bucketed feature, usable
        directly as an index into ``Q_table``.
    """
    global _fitted_discretizer
    if _fitted_discretizer is None:
        est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
        est.fit([lower_bounds, upper_bounds])
        _fitted_discretizer = est
    return tuple(map(int, _fitted_discretizer.transform([[angle, pole_velocity]])[0]))

In [4]:
# Zero-initialised Q-value table: one axis per discretized feature, followed by
# a final axis with one entry per available action.
Q_table = np.zeros((*n_bins, env.action_space.n))
Q_table.shape

(6, 12, 2)

In [5]:
def policy( state : tuple ):
    """Return the greedy action for a discrete state.

    Reads the Q-values stored in ``Q_table`` for ``state`` and returns the
    index of the largest one. No random exploration happens here; the caller
    is responsible for substituting random actions.
    """
    action_values = Q_table[state]
    return np.argmax(action_values)

In [6]:
def new_Q_value( reward : float ,  new_state : tuple , discount_factor=1 ) -> float:
    """Temporal-difference target for updating the Q-value of a state-action pair.

    Args:
        reward: Reward received for the transition.
        new_state: Discrete state reached after the transition; used to index
            ``Q_table``.
        discount_factor: Weight applied to the best Q-value of ``new_state``.

    Returns:
        ``reward`` plus the discounted maximum Q-value stored for ``new_state``.
    """
    best_next_value = np.max(Q_table[new_state])
    return reward + discount_factor * best_next_value

In [7]:
# Learning rate schedule: shrinks as the episode index grows
def learning_rate(n : int , min_rate=0.01 ) -> float  :
    """Learning rate for episode index ``n``.

    Evaluates ``1 - log10((n + 1) / 25)`` and clamps it so the result is never
    above 1.0 and never below ``min_rate``.
    """
    decayed = 1.0 - math.log10((n + 1) / 25)
    # Upper clamp at 1.0
    capped = decayed if decayed < 1.0 else 1.0
    # Lower clamp at min_rate
    return capped if capped > min_rate else min_rate

In [8]:
def exploration_rate(n : int, min_rate= 0.1 ) -> float :
    """Decaying exploration rate for episode index ``n``.

    Evaluates ``1 - log10((n + 1) / 25)`` and clamps it to the range
    ``[min_rate, 1.0]``.

    The upper clamp is the float ``1.0`` (not the int ``1``) so the function
    returns a float while capped, as the ``-> float`` annotation promises and
    as ``learning_rate`` already does.
    """
    return max(min_rate, min(1.0, 1.0 - math.log10((n + 1) / 25)))

In [9]:
n_episodes = 500
for e in range(n_episodes):

    # Discretize the initial observation into a Q-table index
    current_state, done = discretizer(*env.reset()), False

    # Both rates depend only on the episode index, so compute them once per
    # episode rather than on every step.
    epsilon = exploration_rate(e)
    lr = learning_rate(e)

    # The counter is named `step` rather than `time` so that the loop does not
    # overwrite the `time` module imported at the top of the notebook.
    for step in range(5000):

        # Epsilon-greedy choice: random action with probability epsilon,
        # otherwise the greedy action from the Q-table.
        if np.random.random() < epsilon:
            action = env.action_space.sample()  # explore
        else:
            action = policy(current_state)  # exploit

        # Advance the environment by one step
        obs, reward, done, _ = env.step(action)
        new_state = discretizer(*obs)

        # Update the Q-table: blend the old estimate with the new target
        learnt_value = new_Q_value(reward, new_state)
        old_value = Q_table[current_state][action]
        Q_table[current_state][action] = (1 - lr) * old_value + lr * learnt_value

        current_state = new_state

        # Render the cartpole environment
        env.render()

        if done:
            print("episode: {}/{}, score: {}, e: {}".format(e, n_episodes, step, epsilon))
            break


episode: 0/500, score: 12, e: 1
episode: 1/500, score: 19, e: 1
episode: 2/500, score: 13, e: 1
episode: 3/500, score: 18, e: 1
episode: 4/500, score: 38, e: 1
episode: 5/500, score: 13, e: 1
episode: 6/500, score: 17, e: 1
episode: 7/500, score: 28, e: 1
episode: 8/500, score: 20, e: 1
episode: 9/500, score: 7, e: 1
episode: 10/500, score: 32, e: 1
episode: 11/500, score: 9, e: 1
episode: 12/500, score: 10, e: 1
episode: 13/500, score: 71, e: 1
episode: 14/500, score: 15, e: 1
episode: 15/500, score: 12, e: 1
episode: 16/500, score: 12, e: 1
episode: 17/500, score: 28, e: 1
episode: 18/500, score: 19, e: 1
episode: 19/500, score: 28, e: 1
episode: 20/500, score: 15, e: 1
episode: 21/500, score: 13, e: 1
episode: 22/500, score: 27, e: 1
episode: 23/500, score: 14, e: 1
episode: 24/500, score: 23, e: 1
episode: 25/500, score: 25, e: 0.9829666607012196
episode: 26/500, score: 27, e: 0.9665762445130502
episode: 27/500, score: 14, e: 0.9507819773298184
episode: 28/500, score: 19, e: 0.9355

KeyboardInterrupt: 