# Train an agent on the CartPole balancing problem in OpenAI Gym using Q-learning

In [1]:
# Initialize
import gym
import numpy as np
import random

# Environment under training and the range of its discrete action indices.
env = gym.make('CartPole-v0')
actions = range(env.action_space.n)

# Q-learning hyperparameters.
alpha = 0.01   # step size applied to each temporal-difference update in learn()
gamma = 0.90   # discount applied to the best next-state Q-value in learn()
epsilon = 1    # initial probability of a random action; train() decays it per episode

# Number of episodes run by train().
episodes = 2000

In [2]:
def get_all_states_as_string():
    """Return every 4-digit, zero-padded state key from '0000' to '9999'.

    Each key has one decimal digit per observation dimension, so the list
    enumerates all 10**4 discrete states in ascending numeric order.
    """
    return ['%04d' % index for index in range(10000)]

# Builds the Q-table: one entry per state key, each mapping every action to a Q-value.
def initialize():
    """Return a fresh Q-table with every (state, action) value set to 0.

    The result is a dict keyed by the state strings produced by
    get_all_states_as_string(); each value is a dict keyed by the action
    indices 0 .. env.action_space.n - 1.
    """
    return {
        state_key: {action: 0 for action in range(env.action_space.n)}
        for state_key in get_all_states_as_string()
    }

In [3]:
# Bin edges used to discretize each of the 4 observation components.
# Row i holds 10 evenly spaced edges for component i.
# NOTE(review): the ranges presumably follow CartPole's observation layout
# (cart position, cart velocity, pole angle, pole angular velocity) -- confirm.
bins = np.zeros((4, 10))
bins[0] = np.linspace(-4.8, 4.8, 10)
bins[1] = np.linspace(-5, 5, 10)
bins[2] = np.linspace(-0.418, 0.418, 10)
bins[3] = np.linspace(-5, 5, 10)


def digitize(obs):
    """Map a 4-component observation to 4 bucket indices in the range 0..9.

    Parameters:
        obs: indexable sequence whose first 4 entries are numeric.

    Returns:
        A float numpy array of length 4 holding the bucket index of each
        component.

    np.digitize against 10 edges yields indices 0..10. Index 10 (a value at
    or above the last edge) would turn into a two-character digit once the
    state is hashed, giving a key that is not in the 4-digit Q-table. The
    index is therefore clamped so that values at or beyond the upper edge
    share the top bucket.
    """
    top_bucket = bins.shape[1] - 1
    state = np.zeros(4)
    for i in range(4):
        state[i] = min(np.digitize(obs[i], bins[i]), top_bucket)
    return state

# Turns a discretized state (a sequence of numeric bucket indices) into a string key.
def doHash(arr):
    """Concatenate the integer part of every element of ``arr`` into one string."""
    digits = [str(int(bucket)) for bucket in arr]
    return ''.join(digits)

In [4]:
def max_dict(d):
    """Return ``(key, value)`` for the entry of ``d`` with the largest value.

    Ties are resolved in favour of the entry that comes first in the dict's
    iteration order, because only a strictly greater value replaces the
    current best.

    If no value compares greater than negative infinity (for example when
    ``d`` is empty), ``(None, float('-inf'))`` is returned instead of
    failing on an unassigned local variable.
    """
    # Initialise the key as well as the value so the return is always defined.
    max_key = None
    max_v = float('-inf')
    for key, val in d.items():
        if val > max_v:
            max_key, max_v = key, val
    return max_key, max_v

# Applies one Q-learning update to the Q-table after a single environment step.
def learn(state, action1, reward, new_state, q):
    """Move q[state][action1] towards the one-step temporal-difference target.

    The target is ``reward`` plus ``gamma`` times the largest Q-value stored
    for ``new_state``; ``alpha`` scales how far the stored value moves.
    ``q`` is modified in place and nothing is returned.
    """
    _, best_next_value = max_dict(q[new_state])
    td_target = reward + gamma * best_next_value
    q[state][action1] += alpha * (td_target - q[state][action1])

In [5]:
def train(epsilon):
    """Train a fresh Q-table with epsilon-greedy Q-learning and return it.

    Parameters:
        epsilon: starting probability of taking a random action. It is
            multiplied by 0.99 at the beginning of every episode.

    Returns:
        The Q-table built by initialize() after `episodes` episodes of updates.

    Every 100th episode the raw (unmodified) cumulative reward is printed.
    """
    # Start from a Q-table of zeros.
    q = initialize()

    for i_episode in range(episodes):
        state = doHash(digitize(env.reset()))
        epsilon *= 0.99
        cumulated_reward = 0
        done = False

        while not done:
            # Explore with probability epsilon, otherwise act greedily.
            explore = np.random.uniform() < epsilon
            if explore:
                action = env.action_space.sample()
            else:
                action, _ = max_dict(q[state])

            observation, reward, done, _ = env.step(action)
            next_st = doHash(digitize(observation))

            # The score tracks the environment's own reward ...
            cumulated_reward += reward

            # ... while the update sees a -300 penalty on the step that ends the episode.
            learn(state, action, -300 if done else reward, next_st, q)
            state = next_st

        if i_episode % 100 == 0:
            print("Episode = " + str(i_episode) + ", Score = " + str(cumulated_reward))

    return q

In [6]:
# Run the training loop from the module-level starting epsilon and keep the learned Q-table.
Q = train(epsilon)

Episode = 0, Score = 12.0
Episode = 100, Score = 86.0
Episode = 200, Score = 96.0
Episode = 300, Score = 136.0
Episode = 400, Score = 95.0
Episode = 500, Score = 146.0
Episode = 600, Score = 131.0
Episode = 700, Score = 130.0
Episode = 800, Score = 151.0
Episode = 900, Score = 125.0
Episode = 1000, Score = 148.0
Episode = 1100, Score = 162.0
Episode = 1200, Score = 134.0
Episode = 1300, Score = 144.0
Episode = 1400, Score = 159.0
Episode = 1500, Score = 142.0
Episode = 1600, Score = 147.0
Episode = 1700, Score = 146.0
Episode = 1800, Score = 143.0
Episode = 1900, Score = 146.0


In [7]:
# def play(Q):
#     env.reset()
#     for i in range(200):
#         env.render()
#         if i==0:
#             action = env.action_space.sample()
#         else:
#             action = max_dict(Q[state])[0]
#         next_st, reward, done, info = env.step(action)
#         print(next_st)
#         state = doHash(digitize(next_st))
#         if done:
#             break
    