In [1]:
import gym
from gym import wrappers
import numpy as np
import matplotlib.pyplot as plt
import operator

# Shared CartPole environment used by the training and recording code below.
env = gym.make('CartPole-v0')

# Number of discrete state keys enumerated by get_all_states_as_string().
MAXSTATES = 10**4
# Discount applied to the next state's best Q-value in play_one_game's update.
GAMMA = 0.9
# Step size of the Q-value update in play_one_game.
ALPHA = 0.01
# Episodes that finish in fewer steps than this are penalized in play_one_game.
COUNTLIMIT = 200

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [None]:
def max_dict(d):
    """Return the ``(key, value)`` pair of ``d`` that has the largest value.

    When several keys share the largest value, the one that comes first in
    the dict's iteration order is returned. An empty dict raises
    ``ValueError``.
    """
    best_key = max(d, key=d.get)
    return best_key, d[best_key]

In [None]:
def create_bins():
    """Build the bin edges used to discretize a 4-component observation.

    Returns a float array of shape ``(4, 10)``. Row ``i`` holds 10 evenly
    spaced edges for observation component ``i``:

    * row 0 -- cart position, edges spanning [-4.8, 4.8]
    * row 1 -- cart velocity, edges spanning [-5, 5]
    * row 2 -- pole angle, edges spanning [-0.418, 0.418]
    * row 3 -- pole velocity, edges spanning [-5, 5]

    The two velocity components are treated as unbounded, so [-5, 5] is a
    chosen working range rather than a hard limit.
    """
    edge_ranges = (
        (-4.8, 4.8),    # cart position
        (-5, 5),        # cart velocity
        (-.418, .418),  # pole angle
        (-5, 5),        # pole velocity
    )
    return np.array([np.linspace(low, high, 10) for low, high in edge_ranges])

In [None]:
def assign_bins(observation, bins):
    """Map a continuous observation to one discrete bin index per component.

    Args:
        observation: indexable sequence with one value per row of ``bins``.
        bins: 2-D array-like; row ``i`` holds the increasing bin edges for
            component ``i``.

    Returns:
        A float array with one entry per row of ``bins``. Each entry is an
        integer-valued index in ``0 .. len(edges) - 1``.
    """
    state = np.zeros(len(bins))

    for i, edges in enumerate(bins):
        # np.digitize returns len(edges) for a value at or beyond the last
        # edge. With 10 edges that would be the two-digit index 10, which
        # breaks the one-digit-per-component state key, so fold the overflow
        # into the top bin.
        state[i] = min(np.digitize(observation[i], edges), len(edges) - 1)

    return state

In [None]:
def get_state_as_string(state):
    """Concatenate the integer-truncated entries of ``state`` into one string.

    Example: ``[1.0, 0.0, 5.0, 9.0]`` becomes ``'1059'``.
    """
    digits = [str(int(component)) for component in state]
    return ''.join(digits)
    
#     return '{}'.format(state)

In [None]:
def get_all_states_as_string():
    """List every state key: the integers ``0 .. MAXSTATES - 1`` as strings.

    Each number is left-padded with zeros to at least 4 characters, e.g.
    ``'0000'``, ``'0001'``, ..., ``'9999'`` when ``MAXSTATES`` is ``10**4``.
    """
    return ['{:04d}'.format(index) for index in range(MAXSTATES)]

In [None]:
def intialize_Q():
    """Create the tabular action-value store with every entry set to 0.

    Returns:
        A dict keyed by every string from ``get_all_states_as_string()``.
        Each value is a dict mapping every action index in
        ``range(env.action_space.n)`` to ``0``.
    """
    return {
        state: {action: 0 for action in range(env.action_space.n)}
        for state in get_all_states_as_string()
    }

In [None]:
def play_one_game(env, bins, Q, eps=0.5):
    """Play one episode with an epsilon-greedy policy, updating ``Q`` in place.

    Args:
        env: environment providing ``reset()``, ``action_space.sample()`` and
            ``step(action)`` returning ``(observation, reward, done, info)``.
        bins: bin edges forwarded to ``assign_bins`` to discretize
            observations.
        Q: nested dict ``Q[state][action] -> value``; mutated by this call.
        eps: probability of taking a random action instead of the greedy one.

    Returns:
        ``(total_reward, count)``: the sum of the rewards returned by the
        environment (the failure penalty below is not included) and the
        number of steps taken.
    """
    observation = env.reset()
    state = get_state_as_string(assign_bins(observation, bins))

    total_reward = 0
    count = 0
    done = False

    while not done:

        count += 1

        # Epsilon-greedy action selection.
        if np.random.uniform() < eps:
            action = env.action_space.sample()
        else:
            action = max_dict(Q[state])[0]

        observation, reward, done, _ = env.step(action)

        total_reward += reward

        state_new = get_state_as_string(assign_bins(observation, bins))

        if done and count < COUNTLIMIT:
            # The episode ended before the step limit, i.e. in failure.
            # Nothing follows a failure, so the target is the penalty alone;
            # bootstrapping from Q[state_new] here would leak in value learned
            # when the same discretized state was visited mid-episode.
            reward = -300
            target = reward
        else:
            # Ordinary step (or an episode that reached the step limit):
            # standard Q-learning target using the best next-state value.
            target = reward + GAMMA * max_dict(Q[state_new])[1]

        Q[state][action] += ALPHA * (target - Q[state][action])

        state = state_new

    return total_reward, count

In [None]:
def play_many_games(bins, N=10000):
    """Train a fresh Q-table for ``N`` episodes, then play one monitored game.

    Args:
        bins: bin edges forwarded to ``play_one_game``.
        N: number of training episodes.

    Returns:
        ``(length, reward)``: two lists with the step count and the total
        reward of each of the ``N`` training episodes, in order.

    Side effects:
        Prints progress every 100 episodes. After training, wraps the
        module-level ``env`` in ``wrappers.Monitor`` writing to
        ``'MovieFiles'`` and plays one more game through it. That game uses
        the last training ``eps`` and also updates the Q-table.
    """
    Q = intialize_Q()

    length = []
    reward = []

    # Defined before the loop so the monitored game below still has a value
    # when N == 0 and the loop body never runs.
    eps = 1.0

    for n in range(N):

        eps = 1.0 / np.sqrt(n + 1)  # take random action less over time

        episode_reward, episode_length = play_one_game(env, bins, Q, eps)

        if n % 100 == 0:
            print(n, '%.4f' % eps, episode_reward)

        length.append(episode_length)
        reward.append(episode_reward)

    # play_one_game resets the environment itself, so no separate reset of
    # the monitored wrapper is needed before calling it.
    env1 = wrappers.Monitor(env, 'MovieFiles', force=True)
    play_one_game(env1, bins, Q, eps)

    return length, reward

In [None]:
def plot_running_avg(total_rewards, run_size):
    """Plot a trailing mean of ``total_rewards``.

    Point ``t`` of the curve is the mean of entries ``max(0, t - run_size)``
    through ``t`` inclusive, so a full window holds ``run_size + 1`` values
    and early points average over however many entries exist so far.
    """
    running_avg = np.array([
        np.mean(total_rewards[max(0, end - run_size):end + 1])
        for end in range(len(total_rewards))
    ])

    plt.plot(running_avg)
    plt.title('Running Average')
    plt.show()

In [None]:
# if __name__ == '__main__':

# Build the discretization edges, train with the default number of games,
# and keep the per-episode step counts and total rewards.
bins = create_bins()
episode_lengths, episode_rewards = play_many_games(bins)

# Show the trailing mean of the episode rewards (run_size=100).
plot_running_avg(episode_rewards, 100)