### Inspired By
https://github.com/RJBrooker/Q-learning-demo-Cartpole-V1/blob/master/cartpole.ipynb

In [2]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np 
import time, math, random
from typing import Tuple
import tqdm
import gymnasium as gym

In [3]:
env = gym.make('CartPole-v1', render_mode='human')

In [4]:
n_bins = (20, 20, 20)
lower_bounds = [env.observation_space.low[0], env.observation_space.low[2], -math.radians(50)]
upper_bounds = [env.observation_space.high[0], env.observation_space.high[2], math.radians(50)]

def discretizer(location, _, angle, pole_velocity) -> Tuple[int, ...]:
    """Convert continuous state into a discrete state"""
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int, est.transform([[location, angle, pole_velocity]])[0]))


Initialise the Q value table with zeros. 

In [5]:
Q_table = np.zeros(n_bins + (env.action_space.n,))
Q_table.shape

(20, 20, 20, 2)

In [6]:
def policy( state : tuple ):
    """Choosing action based on epsilon-greedy policy"""
    return np.argmax(Q_table[state])


Update function 

In [7]:
def new_Q_value( reward : float ,  new_state : tuple , discount_factor=0.9 ) -> float:
    """Temperal diffrence for updating Q-value of state-action pair"""
    future_optimal_value = np.max(Q_table[new_state])
    learned_value = reward + discount_factor * future_optimal_value
    return learned_value

Decaying learning rate

In [8]:
# Adaptive learning of Learning Rate
def learning_rate(n : int , min_rate=0.01 ) -> float  :
    """Decaying learning rate"""
    return max(min_rate, min(1.0, 1.0 - math.log10((n + 1) / 25)))

Decaying exploration rate

In [9]:
def exploration_rate(n : int, min_rate= 0.01 ) -> float :
    """Decaying exploration rate"""
    # return max(min_rate, min(1, 1.0 - math.log10((n  + 1) / 25)))
    return max(min_rate, min(1, 1/np.log(n+2)))

## Training

In [10]:

n_episodes = 1000
for e in tqdm.tqdm(range(n_episodes)):
    
    current_state, done = discretizer(*env.reset()[0]), False
    steps = 0
    while done == False:
        steps = steps + 1
        if steps > 1000:
            break
        
        # policy action 
        action = policy(current_state) # exploit
        
        # insert random action
        if np.random.random() < exploration_rate(e) : 
            action = env.action_space.sample() # explore 
         
        # increment enviroment
        obs, reward, done, _, extra = env.step(action)
        new_state = discretizer(*obs)
        
        # Update Q-Table
        lr = learning_rate(e)
        learnt_value = new_Q_value(reward, new_state)
        old_value = Q_table[current_state][action]
        Q_table[current_state][action] = (1-lr)*old_value + lr*learnt_value
        
        current_state = new_state
        
        # Render the cartpole environment
        env.render()
            
Q_table
            
            

  0%|          | 1/1000 [00:04<1:13:49,  4.43s/it]

36


  0%|          | 2/1000 [00:05<43:54,  2.64s/it]  

33


  0%|          | 3/1000 [00:07<33:04,  1.99s/it]

29


  0%|          | 4/1000 [00:08<29:33,  1.78s/it]

35


  0%|          | 5/1000 [00:08<21:44,  1.31s/it]

11


  1%|          | 6/1000 [00:10<21:26,  1.29s/it]

30


  1%|          | 7/1000 [00:10<17:00,  1.03s/it]

11


  1%|          | 8/1000 [00:11<18:10,  1.10s/it]

30


  1%|          | 9/1000 [00:12<16:37,  1.01s/it]

19


  1%|          | 10/1000 [00:13<14:06,  1.17it/s]

12


  1%|          | 11/1000 [00:14<13:37,  1.21it/s]

18


  1%|          | 12/1000 [00:14<11:51,  1.39it/s]

11


  1%|▏         | 13/1000 [00:15<15:22,  1.07it/s]

33


  1%|▏         | 14/1000 [00:16<14:31,  1.13it/s]

18


  2%|▏         | 15/1000 [00:17<14:18,  1.15it/s]

20


  2%|▏         | 16/1000 [00:18<15:37,  1.05it/s]

27


  2%|▏         | 17/1000 [00:19<13:50,  1.18it/s]

14


  2%|▏         | 18/1000 [00:19<12:36,  1.30it/s]

14


  2%|▏         | 19/1000 [00:20<12:34,  1.30it/s]

18


  2%|▏         | 20/1000 [00:22<15:32,  1.05it/s]

32


  2%|▏         | 21/1000 [00:22<14:13,  1.15it/s]

16


  2%|▏         | 22/1000 [00:23<15:30,  1.05it/s]

27


  2%|▏         | 23/1000 [00:25<20:30,  1.26s/it]

45


  2%|▏         | 24/1000 [00:26<16:28,  1.01s/it]

10


  2%|▎         | 25/1000 [00:26<14:26,  1.13it/s]

14


  3%|▎         | 26/1000 [00:27<13:48,  1.18it/s]

18


  3%|▎         | 27/1000 [00:28<14:35,  1.11it/s]

24


  3%|▎         | 28/1000 [00:29<13:20,  1.21it/s]

14


  3%|▎         | 29/1000 [00:30<16:14,  1.00s/it]

34


  3%|▎         | 30/1000 [00:31<15:51,  1.02it/s]

22


  3%|▎         | 31/1000 [00:32<14:58,  1.08it/s]

19


  3%|▎         | 32/1000 [00:33<13:28,  1.20it/s]

14


  3%|▎         | 33/1000 [00:33<12:55,  1.25it/s]

17


  3%|▎         | 34/1000 [00:34<14:43,  1.09it/s]

28


  4%|▎         | 35/1000 [00:35<13:24,  1.20it/s]

15


  4%|▎         | 36/1000 [00:36<12:09,  1.32it/s]

12


  4%|▎         | 37/1000 [00:37<14:20,  1.12it/s]

29


  4%|▍         | 38/1000 [00:37<12:32,  1.28it/s]

12


  4%|▍         | 39/1000 [00:38<12:26,  1.29it/s]

18


  4%|▍         | 40/1000 [00:39<14:07,  1.13it/s]

27


  4%|▍         | 41/1000 [00:40<13:55,  1.15it/s]

20


  4%|▍         | 42/1000 [00:41<15:46,  1.01it/s]

30


  4%|▍         | 43/1000 [00:42<14:06,  1.13it/s]

15


  4%|▍         | 44/1000 [00:43<12:55,  1.23it/s]

14


  4%|▍         | 45/1000 [00:43<12:39,  1.26it/s]

18


  5%|▍         | 46/1000 [00:44<13:02,  1.22it/s]

21


  5%|▍         | 47/1000 [00:45<11:58,  1.33it/s]

14


  5%|▍         | 48/1000 [00:46<13:37,  1.16it/s]

26


  5%|▍         | 49/1000 [00:48<17:32,  1.11s/it]

40


  5%|▌         | 50/1000 [00:49<18:08,  1.15s/it]

29


  5%|▌         | 51/1000 [00:50<18:03,  1.14s/it]

27


  5%|▌         | 52/1000 [00:51<15:04,  1.05it/s]

12


  5%|▌         | 53/1000 [00:51<14:32,  1.09it/s]

20


  5%|▌         | 54/1000 [00:52<14:09,  1.11it/s]

20


  6%|▌         | 55/1000 [00:53<15:28,  1.02it/s]

28


  6%|▌         | 56/1000 [00:54<12:28,  1.26it/s]

8


  6%|▌         | 57/1000 [00:54<10:50,  1.45it/s]

10


  6%|▌         | 58/1000 [00:55<10:47,  1.45it/s]

16


  6%|▌         | 59/1000 [00:56<10:22,  1.51it/s]

14


  6%|▌         | 60/1000 [00:57<11:47,  1.33it/s]

23


  6%|▌         | 61/1000 [00:58<13:22,  1.17it/s]

26


  6%|▌         | 62/1000 [00:58<11:37,  1.35it/s]

11


  6%|▋         | 63/1000 [00:59<10:31,  1.48it/s]

12


  6%|▋         | 64/1000 [00:59<10:22,  1.50it/s]

15


  6%|▋         | 65/1000 [01:00<11:10,  1.39it/s]

19


  7%|▋         | 66/1000 [01:00<09:26,  1.65it/s]

8


  7%|▋         | 67/1000 [01:01<08:38,  1.80it/s]

10


  7%|▋         | 68/1000 [01:01<08:15,  1.88it/s]

11


  7%|▋         | 69/1000 [01:02<10:04,  1.54it/s]

22


  7%|▋         | 70/1000 [01:03<09:49,  1.58it/s]

14


  7%|▋         | 71/1000 [01:04<11:09,  1.39it/s]

22


  7%|▋         | 72/1000 [01:04<10:55,  1.42it/s]

15


  7%|▋         | 73/1000 [01:05<11:33,  1.34it/s]

20


  7%|▋         | 74/1000 [01:06<11:14,  1.37it/s]

16


  8%|▊         | 75/1000 [01:07<10:14,  1.51it/s]

12


  8%|▊         | 76/1000 [01:07<09:43,  1.58it/s]

13


  8%|▊         | 77/1000 [01:08<11:15,  1.37it/s]

23


  8%|▊         | 78/1000 [01:09<13:52,  1.11it/s]

31


  8%|▊         | 79/1000 [01:10<12:50,  1.20it/s]

16


  8%|▊         | 80/1000 [01:11<14:48,  1.04it/s]

30


  8%|▊         | 81/1000 [01:12<15:00,  1.02it/s]

24


  8%|▊         | 82/1000 [01:13<13:36,  1.12it/s]

16


  8%|▊         | 83/1000 [01:14<12:25,  1.23it/s]

15


  8%|▊         | 84/1000 [01:14<11:36,  1.31it/s]

15


  8%|▊         | 85/1000 [01:15<11:58,  1.27it/s]

20


  9%|▊         | 86/1000 [01:16<10:09,  1.50it/s]

9


  9%|▊         | 87/1000 [01:17<12:07,  1.25it/s]

26


  9%|▉         | 88/1000 [01:17<10:38,  1.43it/s]

11


  9%|▉         | 89/1000 [01:18<09:59,  1.52it/s]

13


  9%|▉         | 90/1000 [01:18<09:08,  1.66it/s]

11


  9%|▉         | 91/1000 [01:19<09:18,  1.63it/s]

15


  9%|▉         | 92/1000 [01:20<10:21,  1.46it/s]

20


  9%|▉         | 93/1000 [01:21<11:59,  1.26it/s]

25


  9%|▉         | 94/1000 [01:22<12:39,  1.19it/s]

22


 10%|▉         | 95/1000 [01:22<11:34,  1.30it/s]

14


 10%|▉         | 96/1000 [01:23<12:16,  1.23it/s]

22


 10%|▉         | 97/1000 [01:24<11:03,  1.36it/s]

13


 10%|▉         | 98/1000 [01:24<11:11,  1.34it/s]

18


 10%|▉         | 99/1000 [01:26<15:19,  1.02s/it]

40


 10%|█         | 100/1000 [01:27<13:57,  1.08it/s]

17


 10%|█         | 101/1000 [01:28<13:22,  1.12it/s]

19


 10%|█         | 102/1000 [01:28<11:58,  1.25it/s]

13


 10%|█         | 103/1000 [01:29<10:30,  1.42it/s]

11


 10%|█         | 104/1000 [01:29<09:50,  1.52it/s]

13


 10%|█         | 105/1000 [01:30<08:38,  1.73it/s]

9


 11%|█         | 106/1000 [01:31<11:28,  1.30it/s]

29


 11%|█         | 107/1000 [01:31<10:51,  1.37it/s]

15


 11%|█         | 108/1000 [01:33<12:16,  1.21it/s]

25


 11%|█         | 109/1000 [01:33<12:00,  1.24it/s]

18


 11%|█         | 110/1000 [01:34<10:52,  1.36it/s]

13


 11%|█         | 111/1000 [01:35<12:05,  1.23it/s]

24


 11%|█         | 112/1000 [01:36<13:41,  1.08it/s]

28


 11%|█▏        | 113/1000 [01:37<14:13,  1.04it/s]

25


 11%|█▏        | 114/1000 [01:38<13:40,  1.08it/s]

20


 12%|█▏        | 115/1000 [01:39<12:24,  1.19it/s]

15


 12%|█▏        | 116/1000 [01:40<14:18,  1.03it/s]

30


 12%|█▏        | 117/1000 [01:41<15:00,  1.02s/it]

27


 12%|█▏        | 118/1000 [01:42<15:27,  1.05s/it]

27


 12%|█▏        | 119/1000 [01:43<15:25,  1.05s/it]

25


 12%|█▏        | 120/1000 [01:44<14:40,  1.00s/it]

21


 12%|█▏        | 121/1000 [01:45<16:17,  1.11s/it]

33


 12%|█▏        | 122/1000 [01:46<14:37,  1.00it/s]

17


 12%|█▏        | 123/1000 [01:47<15:12,  1.04s/it]

27


 12%|█▏        | 124/1000 [01:48<14:52,  1.02s/it]

23


 12%|█▎        | 125/1000 [01:49<12:41,  1.15it/s]

12


 13%|█▎        | 126/1000 [01:50<13:27,  1.08it/s]

25


 13%|█▎        | 127/1000 [01:51<13:59,  1.04it/s]

25


 13%|█▎        | 128/1000 [01:52<15:05,  1.04s/it]

29


 13%|█▎        | 129/1000 [01:53<14:35,  1.01s/it]

22


 13%|█▎        | 130/1000 [01:54<13:19,  1.09it/s]

17


 13%|█▎        | 131/1000 [01:55<14:24,  1.00it/s]

28


 13%|█▎        | 132/1000 [01:56<14:46,  1.02s/it]

26


 13%|█▎        | 133/1000 [01:57<15:05,  1.04s/it]

26


 13%|█▎        | 134/1000 [01:58<14:44,  1.02s/it]

23


 14%|█▎        | 135/1000 [01:59<14:19,  1.01it/s]

22


 14%|█▎        | 136/1000 [02:00<15:57,  1.11s/it]

33


 14%|█▎        | 137/1000 [02:01<13:32,  1.06it/s]

12


 14%|█▍        | 138/1000 [02:02<16:07,  1.12s/it]

37


 14%|█▍        | 139/1000 [02:03<14:01,  1.02it/s]

15


 14%|█▍        | 140/1000 [02:04<13:26,  1.07it/s]

20


 14%|█▍        | 141/1000 [02:05<13:00,  1.10it/s]

20


 14%|█▍        | 142/1000 [02:05<11:17,  1.27it/s]

12


 14%|█▍        | 143/1000 [02:07<13:28,  1.06it/s]

31


 14%|█▍        | 144/1000 [02:08<15:09,  1.06s/it]

32


 14%|█▍        | 145/1000 [02:09<16:30,  1.16s/it]

33


 15%|█▍        | 146/1000 [02:10<14:36,  1.03s/it]

17


 15%|█▍        | 147/1000 [02:11<14:40,  1.03s/it]

25


 15%|█▍        | 148/1000 [02:12<13:40,  1.04it/s]

19


 15%|█▍        | 149/1000 [02:12<11:12,  1.27it/s]

9


 15%|█▌        | 150/1000 [02:13<11:25,  1.24it/s]

20


 15%|█▌        | 151/1000 [02:14<12:21,  1.14it/s]

24


 15%|█▌        | 152/1000 [02:15<12:34,  1.12it/s]

22


 15%|█▌        | 153/1000 [02:16<11:30,  1.23it/s]

15


 15%|█▌        | 154/1000 [02:16<10:13,  1.38it/s]

12


 16%|█▌        | 155/1000 [02:17<11:24,  1.23it/s]

24


 16%|█▌        | 156/1000 [02:18<11:31,  1.22it/s]

20


 16%|█▌        | 157/1000 [02:18<09:41,  1.45it/s]

9


 16%|█▌        | 158/1000 [02:19<09:42,  1.44it/s]

16


 16%|█▌        | 159/1000 [02:20<10:40,  1.31it/s]

22


 16%|█▌        | 160/1000 [02:20<09:06,  1.54it/s]

9


 16%|█▌        | 161/1000 [02:21<08:42,  1.61it/s]

13


 16%|█▌        | 162/1000 [02:22<11:11,  1.25it/s]

29


 16%|█▋        | 163/1000 [02:23<10:50,  1.29it/s]

17


 16%|█▋        | 164/1000 [02:24<10:56,  1.27it/s]

19


 16%|█▋        | 165/1000 [02:25<11:52,  1.17it/s]

24


 17%|█▋        | 166/1000 [02:27<16:38,  1.20s/it]

48


 17%|█▋        | 167/1000 [02:28<15:49,  1.14s/it]

24


 17%|█▋        | 168/1000 [02:29<17:17,  1.25s/it]

36


 17%|█▋        | 169/1000 [02:31<17:08,  1.24s/it]

29


 17%|█▋        | 170/1000 [02:33<20:45,  1.50s/it]

51


 17%|█▋        | 171/1000 [02:34<18:31,  1.34s/it]

23


 17%|█▋        | 172/1000 [02:35<16:48,  1.22s/it]

22


 17%|█▋        | 173/1000 [02:35<15:04,  1.09s/it]

19


 17%|█▋        | 174/1000 [02:36<13:10,  1.04it/s]

15


 18%|█▊        | 175/1000 [02:37<14:25,  1.05s/it]

30


 18%|█▊        | 176/1000 [02:38<14:13,  1.04s/it]

24


 18%|█▊        | 177/1000 [02:39<13:44,  1.00s/it]

22


 18%|█▊        | 178/1000 [02:40<14:57,  1.09s/it]

31


 18%|█▊        | 179/1000 [02:41<14:14,  1.04s/it]

22


 18%|█▊        | 180/1000 [02:42<11:43,  1.17it/s]

10


 18%|█▊        | 181/1000 [02:43<11:48,  1.16it/s]

21


 18%|█▊        | 182/1000 [02:44<14:26,  1.06s/it]

35


 18%|█▊        | 183/1000 [02:45<14:23,  1.06s/it]

25


 18%|█▊        | 184/1000 [02:46<12:59,  1.05it/s]

17


 18%|█▊        | 185/1000 [02:47<13:20,  1.02it/s]

25


 19%|█▊        | 186/1000 [02:48<14:38,  1.08s/it]

31


 19%|█▊        | 187/1000 [02:49<14:18,  1.06s/it]

24


 19%|█▉        | 188/1000 [02:50<13:06,  1.03it/s]

18


 19%|█▉        | 189/1000 [02:51<12:40,  1.07it/s]

20


 19%|█▉        | 190/1000 [02:52<12:28,  1.08it/s]

21


 19%|█▉        | 191/1000 [02:53<12:58,  1.04it/s]

25


 19%|█▉        | 192/1000 [02:54<11:32,  1.17it/s]

14


 19%|█▉        | 193/1000 [02:55<13:17,  1.01it/s]

31


 19%|█▉        | 194/1000 [02:57<17:59,  1.34s/it]

48


 20%|█▉        | 195/1000 [02:59<20:54,  1.56s/it]

49


 20%|█▉        | 196/1000 [02:59<16:01,  1.20s/it]

8


 20%|█▉        | 197/1000 [03:00<14:49,  1.11s/it]

20


 20%|█▉        | 198/1000 [03:01<13:20,  1.00it/s]

17


 20%|█▉        | 199/1000 [03:02<14:07,  1.06s/it]

26


 20%|██        | 200/1000 [03:03<13:35,  1.02s/it]

21


 20%|██        | 201/1000 [03:05<14:55,  1.12s/it]

32


 20%|██        | 202/1000 [03:05<14:16,  1.07s/it]

22


 20%|██        | 203/1000 [03:06<12:56,  1.03it/s]

17


 20%|██        | 204/1000 [03:07<12:16,  1.08it/s]

19


 20%|██        | 205/1000 [03:08<11:48,  1.12it/s]

19


 21%|██        | 206/1000 [03:09<11:32,  1.15it/s]

19


 21%|██        | 207/1000 [03:09<11:04,  1.19it/s]

18


 21%|██        | 208/1000 [03:10<10:35,  1.25it/s]

17


 21%|██        | 209/1000 [03:12<13:11,  1.00s/it]

35


 21%|██        | 210/1000 [03:13<12:45,  1.03it/s]

21


 21%|██        | 211/1000 [03:13<10:27,  1.26it/s]

9


 21%|██        | 212/1000 [03:14<11:24,  1.15it/s]

24


 21%|██▏       | 213/1000 [03:15<13:24,  1.02s/it]

33


 21%|██▏       | 214/1000 [03:16<12:59,  1.01it/s]

22


 22%|██▏       | 215/1000 [03:18<16:35,  1.27s/it]

46


 22%|██▏       | 216/1000 [03:19<15:12,  1.16s/it]

22


 22%|██▏       | 217/1000 [03:21<18:56,  1.45s/it]

51


 22%|██▏       | 218/1000 [03:22<16:52,  1.29s/it]

22


 22%|██▏       | 219/1000 [03:23<17:01,  1.31s/it]

31


 22%|██▏       | 220/1000 [03:25<16:57,  1.30s/it]

31


 22%|██▏       | 221/1000 [03:26<15:46,  1.21s/it]

24


 22%|██▏       | 222/1000 [03:27<16:43,  1.29s/it]

35


 22%|██▏       | 223/1000 [03:28<14:00,  1.08s/it]

14


 22%|██▏       | 224/1000 [03:29<12:54,  1.00it/s]

19


 22%|██▎       | 225/1000 [03:30<13:55,  1.08s/it]

30


 23%|██▎       | 226/1000 [03:31<12:03,  1.07it/s]

14


 23%|██▎       | 227/1000 [03:31<11:50,  1.09it/s]

21


 23%|██▎       | 228/1000 [03:32<12:09,  1.06it/s]

24


 23%|██▎       | 229/1000 [03:33<11:36,  1.11it/s]

19


 23%|██▎       | 230/1000 [03:34<11:18,  1.13it/s]

18


 23%|██▎       | 231/1000 [03:35<12:33,  1.02it/s]

29


 23%|██▎       | 232/1000 [03:36<10:26,  1.23it/s]

10


 23%|██▎       | 233/1000 [03:37<11:01,  1.16it/s]

23


 23%|██▎       | 234/1000 [03:38<13:07,  1.03s/it]

33


 24%|██▎       | 235/1000 [03:39<11:56,  1.07it/s]

17


 24%|██▎       | 236/1000 [03:40<11:34,  1.10it/s]

20


 24%|██▎       | 237/1000 [03:41<12:06,  1.05it/s]

24


 24%|██▍       | 238/1000 [03:42<11:42,  1.08it/s]

20


 24%|██▍       | 239/1000 [03:42<09:50,  1.29it/s]

10


 24%|██▍       | 240/1000 [03:43<10:24,  1.22it/s]

22


 24%|██▍       | 241/1000 [03:44<11:43,  1.08it/s]

28


 24%|██▍       | 242/1000 [03:45<10:55,  1.16it/s]

17


 24%|██▍       | 243/1000 [03:46<10:51,  1.16it/s]

20


In [42]:
current_state, done = discretizer(*env.reset()[0]), False
steps = 0
while done==False:

    steps = steps + 1
    if steps > 1000:
        break
    
    action = policy(current_state) # exploit
    obs, reward, done, _, extra = env.step(action)
    new_state = discretizer(*obs)
    
    learnt_value = new_Q_value(reward, new_state)
    
    current_state = new_state
    
    env.render()
print (steps)

675
