## Init

In [11]:
import gym
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm, trange

%matplotlib inline

## Simple implementation

In [None]:
# Environment for the random-search section; the cells below read it as the global `env`.
env = gym.make('CartPole-v1')

In [None]:
# TRAINING: random search over linear policies.
# Each candidate is a random 4-vector of weights; the policy pushes right (1)
# when dot(weights, observation) > 0 and left (0) otherwise. A candidate is
# scored by the mean of the last step index reached over several games, and
# the best-scoring candidate is kept.
MAX_MOVES = 500 #Cartpole-v1
NUM_CANDIDATES = 100        # random weight vectors to evaluate
GAMES_PER_CANDIDATE = 100   # games averaged to score one weight vector

best_length = 0
best_weights = None

for candidate in trange(NUM_CANDIDATES):
    weights_arr = np.random.uniform(-1, 1, 4)
    game_lengths = []

    for game in range(GAMES_PER_CANDIDATE):
        # NOTE(review): assumes the classic Gym API where reset() returns the
        # observation and step() returns a 4-tuple -- confirm installed version.
        observation = env.reset()
        for moves in range(MAX_MOVES):
            # env.render()
            action = 1 if np.dot(weights_arr, observation) > 0 else 0
            observation, reward, done, info = env.step(action)

            if done:
                break

        # Record every game, including one that hits the MAX_MOVES cap without
        # `done`; otherwise the longest games would be dropped from the average
        # and an all-capped candidate would divide by zero below.
        game_lengths.append(moves)

    average_length = sum(game_lengths) / len(game_lengths)
    # `best_weights is None` guarantees a usable policy even if no candidate
    # ever scores above the initial best_length of 0.
    if best_weights is None or average_length > best_length:
        best_length = average_length
        best_weights = weights_arr

print("\nBEST LENGTH: ",best_length)

In [None]:
# TESTING: render one episode driven by the best weights found in training.
weights_arr = best_weights

# try/finally ensures the render window is closed even if the episode raises
# or the cell is interrupted.
try:
    observation = env.reset()
    for moves in tqdm(range(MAX_MOVES)):
        env.render()
        # Same linear policy as in training: push right when the weighted sum
        # of the observation is positive, otherwise push left.
        action = 1 if np.dot(weights_arr, observation) > 0 else 0
        observation, reward, done, info = env.step(action)

        if done:
            print("MOVES: ",moves)
            break
finally:
    env.close()

## Q-Learning

In [None]:
# Environment for the Q-learning section; rebinds the global `env` used by play_episode and the cells below.
env = gym.make('CartPole-v0')

In [None]:
# DISCRETIZING CONTINUOUS SPACE
def create_bins():
    """Build the bin edges used to discretize a 4-component observation.

    Returns:
        A (4, 10) float array. Row ``i`` holds 10 evenly spaced edges spanning
        ``[-limit_i, +limit_i]`` for the i-th observation component, with
        limits 2.4, 5, 0.418 and 5 respectively.
    """
    edges_per_feature = 10
    limits = (2.4, 5, 0.418, 5)
    return np.array(
        [np.linspace(-limit, limit, edges_per_feature) for limit in limits]
    )

# GIVEN OBSERVATION ARRAY AND BINS, RETURNS STATE NUMBER
def get_state(observation, bins):
    """Map a continuous observation to a single non-negative integer state.

    Each observation component is located among its row of bin edges with
    ``np.digitize`` and turned into one decimal digit; the digits are then
    packed, most significant first, into one integer (e.g. digits 4,0,4,4
    become 4044).

    Values below the lowest edge are clamped into the first bin and values at
    or above the highest edge into the last one. Without the clamp a value
    below the lowest edge produced digit -1, which either yielded a negative
    state (silently wrapping around when used as a Q-table index) or an
    unparsable string such as "4-144".

    Args:
        observation: sequence of floats, one per row of ``bins``.
        bins: 2-D sequence of bin edges, one row per observation component.
            Rows are assumed to hold at most 10 edges so that every digit
            fits in a single decimal place.

    Returns:
        int: the packed state number (in ``[0, 9999]`` for four 10-edge rows).
    """
    state = 0
    for value, edges in zip(observation, bins):
        digit = int(np.digitize(value, edges)) - 1
        # Clamp out-of-range values into the outermost bins.
        digit = min(max(digit, 0), len(edges) - 1)
        state = state * 10 + digit
    return state


In [None]:
#PLAY SINGLE EPISODE
def play_episode(Q, bins, epsilon):
    """Play one epsilon-greedy episode and update the Q-table in place.

    Reads the module-level ``env``, ``MAX_MOVES``, ``ALPHA`` and ``GAMMA``.

    Args:
        Q: 2-D array indexed as ``Q[state][action]``; modified in place.
        bins: bin edges passed to ``get_state`` to discretize observations.
        epsilon: probability of taking a random action instead of the
            greedy one.

    Returns:
        tuple: ``(round(total_reward), round(moves))`` where ``total_reward``
        includes the failure penalty and ``moves`` is the index of the last
        step taken.
    """
    # NOTE(review): assumes the classic Gym API where reset() returns the
    # observation and step() returns a 4-tuple -- confirm installed version.
    observation = env.reset()
    state = get_state(observation, bins)
    total_reward = 0

    for moves in range(MAX_MOVES):
        # e-GREEDY
        if np.random.uniform() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])
        # TAKE ACTION
        observation, reward, done, _ = env.step(action)

        # PUNISH FOR FAILURE
        # An episode that ends well before the step cap is treated as a
        # failure and penalized.
        failed = done and (moves < (MAX_MOVES - 25))
        if failed:
            reward = -200
        total_reward += reward

        # UPDATE Q-VALUES (greedify)
        next_state = get_state(observation, bins)
        # A failure transition is terminal: there is no future return, so the
        # target must not bootstrap from the successor's Q-values (the coarse
        # discretization means that row may be shared with live states).
        next_Q = 0.0 if failed else np.max(Q[next_state])
        Q[state][action] += ALPHA * (reward + GAMMA * next_Q - Q[state][action])

        state = next_state

        # BREAK IF TERMINATED
        if done:
            break

    return (round(total_reward), round(moves))

In [14]:
# INIT
# Hyperparameters and bookkeeping shared by the Q-learning cells.
MAX_STATES = 10 ** 4  # rows in the Q-table; get_state packs 4 digits into one integer
MAX_MOVES = 200 # per-episode step cap used by play_episode and the test loop (Cartpole-v0)
NUM_EPISODES = 3000  # number of training episodes
GAMMA = 0.9  # discount applied to the next state's best Q-value in the update
ALPHA = 0.01  # step size of the Q-value update
EPSILON = 0.5  # initial value only; the training loop overwrites it every episode

# Q-table: one row per discrete state, one column per action.
Q = np.zeros((MAX_STATES, env.action_space.n))
# Per-episode results appended by the training loop.
episode_rewards = []
episode_lengths = []
bins = create_bins()

In [None]:
# TRAINING
# Play NUM_EPISODES episodes, updating Q in place. The exploration rate
# decays as 1/sqrt(k) with the 1-based episode number k, and one sample
# result is printed every 100 episodes.
LOG_EVERY = 100

for episode_n in range(NUM_EPISODES):
    EPSILON = 1 / np.sqrt(episode_n + 1)
    episode_reward, episode_length = play_episode(Q, bins, EPSILON)

    episode_rewards.append(episode_reward)
    episode_lengths.append(episode_length)

    if not episode_n % LOG_EVERY:
        print("REWARD: ",episode_reward ,"\tLENGTH: ",episode_length)

In [None]:
# PLOT REWARDS CURVE
# The x-axis is sized from the rewards actually recorded rather than from
# NUM_EPISODES, so the plot still works if the training cell was interrupted
# or run more than once (which leaves a different number of entries).
fig, ax = plt.subplots()
ax.plot(np.arange(len(episode_rewards)), episode_rewards)
ax.set(title="Q-learning: reward per episode", xlabel="Episode", ylabel="Total reward");

In [12]:
# TESTING: render one episode following the greedy policy of the learned Q-table.
# try/finally ensures the render window is closed even if the episode raises
# or the cell is interrupted.
try:
    observation = env.reset()
    state = get_state(observation, bins)
    for moves in tqdm(range(MAX_MOVES)):
        env.render()
        # Greedy action: no exploration at test time.
        action = np.argmax(Q[state])
        # TAKE ACTION
        observation, reward, done, _ = env.step(action)
        state = get_state(observation, bins)

        if done:
            print("MOVES: ",moves)
            break
finally:
    env.close()

100%|█████████▉| 199/200 [00:03<00:00, 58.96it/s]MOVES:  199

