In [1]:
# %env THEANO_FLAGS='floatX=float32'
# import os
# if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
#     !bash ../xvfb start
#     %env DISPLAY=:1

In [2]:
import gym
import gym.wrappers

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import theano
import theano.tensor as T

from lasagne.layers import *
from lasagne.nonlinearities import *
from lasagne.objectives import categorical_crossentropy
from lasagne.updates import adam

from IPython.display import HTML
import os



In [3]:
# Build the CartPole task and start an initial episode.
env = gym.make("CartPole-v0")
env.reset()

# Problem sizes read from the environment's spaces; the network cells below use both.
state_dim = env.observation_space.shape
n_actions = env.action_space.n

In [4]:
# Symbolic inputs describing one batch of transitions; the leading axis is the batch.
current_states = T.matrix("states[batch,units]")
actions = T.ivector("action_ids[batch]")
rewards = T.vector("rewards[batch]")
next_states = T.matrix("next_states[batch,units]")
# Integer end-of-session flag per transition (1 = the session ended on that step).
done = T.ivector("vector[batch] where 1 means that session just ended")

In [5]:
# Number of units in the single hidden layer.
h = 32

# NOTE(review): `x` only acts as the InputLayer's default input (the graph cells feed
# their own variables through get_output), and `target` is not referenced by any other
# visible cell - both look like leftovers; confirm before removing.
x = T.matrix()
target = T.ivector()

# Q-network: state -> dense(h, ELU) -> one output per action (nonlinearity=None, i.e. linear).
l_states = InputLayer((None,)+state_dim, input_var=x)
nn = DenseLayer(l_states, h, nonlinearity=elu)
l_qvalues = DenseLayer(nn, n_actions, nonlinearity=None)

In [6]:
# Q-values for every action, obtained by feeding `current_states` through the network.
predicted_qvalues = get_output(l_qvalues, {l_states:current_states})
# For each batch row i, select the Q-value of the action that was taken: Q[i, actions[i]].
predicted_qvalues_for_actions = predicted_qvalues[T.arange(actions.shape[0]),actions]

# Compiled function: batch of states -> Q-values; generate_session uses it to pick actions.
get_qvalues = theano.function([current_states], predicted_qvalues)

In [7]:
# Discount factor applied to the value of the next state.
gamma = 0.99

# Q-values for every action in the next states, computed with the same network.
predicted_next_qvalues = get_output(l_qvalues, {l_states: next_states})

# One-step Q-learning target: r + gamma * max_a' Q(s', a').
# On terminal transitions (done == 1) there is no next state to bootstrap from, so only
# the bootstrap term is masked and the target reduces to the immediate reward r.
# (Masking the whole sum would throw the final reward away and force the target to 0.)
target_qvalues_for_actions = rewards + gamma * predicted_next_qvalues.max(axis=1) * (1 - done)
# Treat the target as a constant: gradients must flow only through the prediction.
target_qvalues_for_actions = theano.gradient.disconnected_grad(target_qvalues_for_actions)

In [8]:
# Mean squared TD error between predicted and target Q-values of the taken actions.
loss = T.mean((predicted_qvalues_for_actions - target_qvalues_for_actions) ** 2)

# Optimise every trainable parameter of the Q-network with Adam.
all_weights = get_all_params(l_qvalues, trainable=True)
updates = adam(loss, all_weights)

# One gradient step on a batch of transitions. The end-of-session flag is the symbolic
# `done` vector: the target expression depends on it, so it has to be the input listed
# here (the previous name `is_end` was never defined and failed on a fresh kernel).
train_step = theano.function([current_states,actions,rewards,next_states,done],
                              updates=updates)

### Playing the game

In [9]:
epsilon = 0.25 #initial epsilon

def generate_session(t_max=1000):
    """Run one episode with the epsilon-greedy Q-network agent, training after every step.

    At most `t_max` steps are played; the loop stops early once the environment
    reports the episode as finished. Returns the sum of rewards collected.
    """
    episode_return = 0
    state = env.reset()

    for _ in range(t_max):
        action_values = get_qvalues([state])[0]

        # Explore with probability epsilon, otherwise act greedily on the current Q-values.
        if np.random.random() < epsilon:
            action = np.random.choice(range(n_actions))
        else:
            action = np.argmax(action_values)

        next_state, reward, finished, _info = env.step(action)

        # Online update on the single transition that was just observed.
        train_step([state], [action], [reward], [next_state], [finished])

        episode_return += reward
        state = next_state

        if finished:
            break

    return episode_return

In [10]:
# Train in epochs of 100 sessions, decaying exploration after each epoch.
# NOTE(review): if the environment caps episodes at 200 steps (as CartPole-v0 is usually
# registered), a mean reward above 300 cannot be reached - confirm the intended threshold.
for i in range(250):
    # Keep the per-session returns under their own name: assigning them to `rewards`
    # would overwrite the symbolic `rewards` input and break any re-run of the graph cells.
    session_rewards = [generate_session() for _ in range(100)] #generate new sessions
    mean_reward = np.mean(session_rewards)
    epsilon*=0.95
    print ("mean reward:%.3f\tepsilon:%.5f"%(mean_reward,epsilon))
    if mean_reward > 300:
        print ("You Win!")
        break
    # Guards against training after epsilon was set to 0 for evaluation.
    assert epsilon!=0, "Please explore environment"

mean reward:10.580	epsilon:0.23750
mean reward:11.070	epsilon:0.22562
mean reward:17.670	epsilon:0.21434
mean reward:15.970	epsilon:0.20363
mean reward:18.260	epsilon:0.19345
mean reward:34.170	epsilon:0.18377
mean reward:27.530	epsilon:0.17458
mean reward:38.720	epsilon:0.16586
mean reward:39.520	epsilon:0.15756
mean reward:43.450	epsilon:0.14968
mean reward:72.040	epsilon:0.14220
mean reward:115.370	epsilon:0.13509
mean reward:75.210	epsilon:0.12834
mean reward:153.720	epsilon:0.12192
mean reward:125.860	epsilon:0.11582
mean reward:155.800	epsilon:0.11003
mean reward:137.910	epsilon:0.10453
mean reward:129.090	epsilon:0.09930
mean reward:142.720	epsilon:0.09434
mean reward:129.320	epsilon:0.08962
mean reward:106.890	epsilon:0.08514
mean reward:118.640	epsilon:0.08088
mean reward:105.880	epsilon:0.07684
mean reward:99.380	epsilon:0.07300
mean reward:172.620	epsilon:0.06935
mean reward:84.660	epsilon:0.06588
mean reward:52.260	epsilon:0.06259
mean reward:140.590	epsilon:0.05946
mean re

KeyboardInterrupt: 

### Video

In [None]:
# Evaluation mode: with epsilon at 0 generate_session never takes a random action
# (it still calls train_step on every transition).
epsilon=0 #Don't forget to reset epsilon back to initial value if you want to go on training

In [None]:
#record sessions
# Remember the environment as it was before monitoring so it can be restored exactly,
# without relying on how many wrapper layers sit underneath the Monitor.
unmonitored_env = env
env = gym.wrappers.Monitor(unmonitored_env,directory="videos",force=True)
try:
    sessions = [generate_session() for _ in range(100)]
finally:
    # Always flush the recordings and drop the Monitor, even if a session was interrupted,
    # so the cell can be run again without wrapping a Monitor inside another Monitor.
    env.close()
    #unwrap
    env = unmonitored_env
#upload to gym
#gym.upload("./videos/",api_key="<your_api_key>") #you'll need me later

#Warning! If you keep seeing error that reads something like"DoubleWrapError",
#run env=gym.make("CartPole-v0");env.reset();

In [None]:
#show video

# os.listdir returns entries in arbitrary order, so sort the names to make the pick reproducible.
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))
assert video_names, "No .mp4 files found in ./videos/ - record some sessions first"

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) #lexicographically last file name; try other indices for other recordings