### Function Approximation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tempfile
import base64
import pprint
import random
import json
import sys
import gym
import io

from gym import wrappers
from collections import deque
from subprocess import check_output
from IPython.display import HTML

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop

In [None]:
def action_selection(state, model, episode, n_episodes):
    """Choose an action epsilon-greedily under a stepwise exploration schedule.

    Epsilon is 0.99 during the first quarter of training, 0.33 up to the
    halfway point, and 0 (purely greedy) afterwards.

    Args:
        state: Observation for the current step. It is flattened into a
            single-row batch before being handed to ``model.predict``.
        model: Object whose ``predict(batch)`` returns one row of action
            values per input row.
        episode: Index of the current episode.
        n_episodes: Total number of episodes; fixes the schedule breakpoints.

    Returns:
        A ``(action, epsilon)`` tuple: the chosen action index and the
        exploration rate that was in effect.
    """
    if episode < n_episodes // 4:
        epsilon = 0.99
    elif episode < n_episodes // 2:
        epsilon = 0.33
    else:
        epsilon = 0.

    # reshape(1, -1) keeps this independent of the observation size rather
    # than assuming a 4-dimensional state.
    values = model.predict(np.asarray(state).reshape(1, -1))[0]

    if np.random.random() < epsilon:
        action = np.random.randint(len(values))
    else:
        action = np.argmax(values)
    return action, epsilon

In [None]:
def _build_q_network(nS, nA):
    """Build and compile the MLP mapping a state vector to one value per action."""
    model = Sequential()
    model.add(Dense(128, input_dim=nS, activation='relu'))
    # NOTE(review): `init=` looks like the Keras 1 spelling of the weight
    # initializer argument (Keras 2 calls it `kernel_initializer`) -- confirm
    # against the installed Keras version.
    model.add(Dense(64, activation='relu', init='uniform'))
    model.add(Dense(nA, activation='linear'))
    model.compile(loss='mse', optimizer='adam')
    return model


def _replay_update(model, memory_bank, batch_size, gamma):
    """Fit the model on one random minibatch of stored transitions."""
    minibatch = random.sample(memory_bank, batch_size)

    # Unzip the transitions column by column. Calling np.array() on the list
    # of mixed (array, int, float, array, bool) tuples would require a ragged
    # object array.
    (state_batch, action_batch, rewards_batch,
     state_prime_batch, is_terminal_batch) = map(np.array, zip(*minibatch))

    state_value_batch = model.predict(state_batch)
    next_state_value_batch = model.predict(state_prime_batch)

    # Q-learning target: the reward alone for terminal transitions, otherwise
    # the reward plus the discounted best value of the next state.
    targets = np.where(
        is_terminal_batch,
        rewards_batch,
        rewards_batch + gamma * next_state_value_batch.max(axis=1))

    # Only the entry of the action actually taken is overwritten; the other
    # outputs keep their predicted values and so contribute no error.
    state_value_batch[np.arange(len(minibatch)), action_batch] = targets

    model.train_on_batch(state_batch, state_value_batch)


def neuro_q_learning(env, gamma = 0.9, n_episodes=50000, batch_size=64,
                     training_frequency=25, memory_bank_size=5000):
    """Train a neural-network Q-function on ``env`` using experience replay.

    Every step of every episode is stored in a bounded replay memory. After
    every ``training_frequency``-th episode, once the memory holds more than
    ``batch_size`` transitions, the network is fitted on one random minibatch.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, ``observation_space.shape`` and
            ``action_space.n``.
        gamma: Discount factor applied to the next state's best value.
        n_episodes: Number of episodes to run.
        batch_size: Number of transitions sampled per training update.
        training_frequency: Run one training update every this many episodes.
        memory_bank_size: Maximum number of transitions kept; the oldest are
            dropped first.

    Returns:
        ``(model, (epsilons, states, actions))``: the trained model and the
        per-step logs of exploration rate, visited state and chosen action.
    """
    nS = env.observation_space.shape[0]
    # Read the action space from the same object as the observation space
    # instead of reaching one wrapper level down via env.env.
    nA = env.action_space.n

    # maxlen makes the deque discard its oldest entry automatically when full.
    memory_bank = deque(maxlen=memory_bank_size)
    model = _build_q_network(nS, nA)

    epsilons = []
    states = []
    actions = []
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            states.append(state)

            action, epsilon = action_selection(state, model, episode, n_episodes)
            epsilons.append(epsilon)
            actions.append(action)

            nstate, reward, done, info = env.step(action)
            memory_bank.append((state, action, reward, nstate, done))

            state = nstate

        if episode % training_frequency == 0 and len(memory_bank) > batch_size:
            _replay_update(model, memory_bank, batch_size, gamma)

    return model, (epsilons, states, actions)

In [None]:
# Record the training run into a fresh temporary directory; force=True is
# passed through to the Monitor wrapper as in any other recorded run here.
mdir = tempfile.mkdtemp()
env = wrappers.Monitor(gym.make('CartPole-v0'), mdir, force=True)

# Train the Q-network and keep the per-step logs for the plots further down.
model, stats = neuro_q_learning(env)

[2017-04-23 02:04:58,545] Making new env: CartPole-v0
[2017-04-23 02:04:58,871] Starting new video recorder writing to /tmp/tmp9b25_tg_/openaigym.video.0.1201.video000000.mp4
[2017-04-23 02:05:00,236] Starting new video recorder writing to /tmp/tmp9b25_tg_/openaigym.video.0.1201.video000001.mp4
[2017-04-23 02:05:00,739] Starting new video recorder writing to /tmp/tmp9b25_tg_/openaigym.video.0.1201.video000008.mp4
[2017-04-23 02:05:02,804] Starting new video recorder writing to /tmp/tmp9b25_tg_/openaigym.video.0.1201.video000027.mp4
[2017-04-23 02:05:03,952] Starting new video recorder writing to /tmp/tmp9b25_tg_/openaigym.video.0.1201.video000064.mp4
[2017-04-23 02:05:05,876] Starting new video recorder writing to /tmp/tmp9b25_tg_/openaigym.video.0.1201.video000125.mp4
[2017-04-23 02:05:08,640] Starting new video recorder writing to /tmp/tmp9b25_tg_/openaigym.video.0.1201.video000216.mp4
[2017-04-23 02:05:12,901] Starting new video recorder writing to /tmp/tmp9b25_tg_/openaigym.video.0

In [None]:
# Each entry of env.videos is unpacked below as a (video_path, meta_path) pair.
videos = np.array(env.videos)
n_videos = 10

# Select n_videos indices evenly spaced over the list of recordings.
idxs = np.linspace(0, len(videos) - 1, n_videos).astype(int)
videos = videos[idxs,:]

# Template for one embedded clip: {0} is the heading, {1} the base64 payload.
html_tag = """
    <h2>{0}</h2>
    <video width="960" height="540" controls>
        <source src="data:video/mp4;base64,{1}" type="video/mp4" />
    </video>"""

fragments = []
for video_path, meta_path in videos:
    # Read-only binary mode inside a context manager, so the handle is
    # closed as soon as the bytes have been encoded.
    with open(video_path, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    with open(meta_path) as data_file:
        meta = json.load(data_file)

    fragments.append(
        html_tag.format('Episode ' + str(meta['episode_id']), encoded.decode('ascii')))

strm = ''.join(fragments)
# Last expression of the cell, so the notebook renders the HTML.
HTML(data=strm)

In [None]:
# Unpack the per-step logs returned by neuro_q_learning: the exploration rate,
# the visited state and the chosen action for every environment step.
epsilons, states, actions = stats

In [None]:
# Shut down the Monitor-wrapped training environment before the recordings in
# mdir are uploaded.
env.close()

In [None]:
import os

# Read the API key from the environment so that no credential is stored in the
# notebook. A missing variable fails fast with a KeyError naming it.
gym.upload(mdir, api_key=os.environ['OPENAI_GYM_API_KEY'])

In [None]:
# Scatter the exploration rate against the global step index to show the
# stepwise epsilon schedule.
step_index = np.arange(len(epsilons))
plt.plot(step_index, epsilons, '.')

In [None]:
# Bar chart of how often each action was chosen during training.
counts, edges = np.histogram(actions, bins=3)
bar_width = (edges[1] - edges[0]) * 0.7
# Centre every bar on the midpoint of its bin.
midpoints = 0.5 * (edges[:-1] + edges[1:])
plt.bar(midpoints, counts, align='center', width=bar_width)
plt.show()

### Test

In [None]:
# Evaluate the trained model greedily (no exploration), recording the run
# into a new temporary directory.
mdir = tempfile.mkdtemp()
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, mdir, force=True)

n_test_episodes = 1000
try:
    for episode in range(n_test_episodes):
        state = env.reset()
        done = False
        while not done:
            # reshape(1, -1) builds the single-row batch without assuming a
            # 4-dimensional observation.
            action = np.argmax(model.predict(np.asarray(state).reshape(1, -1))[0])
            state, reward, done, info = env.step(action)
finally:
    # Close the environment even if a rollout raises part-way through.
    env.close()

In [None]:
import os

# Read the API key from the environment so that no credential is stored in the
# notebook. A missing variable fails fast with a KeyError naming it.
gym.upload(mdir, api_key=os.environ['OPENAI_GYM_API_KEY'])

In [None]:
import os

# Take the API key from the environment instead of a placeholder literal that
# has to be hand-edited (and risks a real key being committed). A missing
# variable fails fast with a KeyError naming it.
gym.upload(mdir, api_key=os.environ['OPENAI_GYM_API_KEY'])