# Play MountainCar

In [None]:
from gym.utils import play
import gym
# Create the environment and hand it to gym's interactive keyboard player.
env = gym.make("MountainCar-v0")
try:
    play.play(env)
finally:
    # Always release the environment (and its window), even when the play
    # session is interrupted or raises.
    env.close()

# Training

In [None]:
from qlearning import *
import gym

def state_function(state):
    """Discretise an observation into a coarse, hashable pair.

    The first component of ``state`` is rounded to 1 decimal place and the
    second to 2 decimal places; any further components are ignored.
    """
    coarse_first = round(state[0], 1)
    coarse_second = round(state[1], 2)
    return (coarse_first, coarse_second)
# Environment used for the tabular training cells.
env = gym.make("MountainCar-v0")  # environment id registered with gym
# GamePlayer receives the discretising state_function defined above.
# NOTE(review): presumably it applies it to every observation -- confirm in qlearning.
game = GamePlayer(env, state_function)

In [None]:
# Hyper-parameters handed positionally to game.train below.
total_episodes = 4000      # Episode count passed to game.train
alpha = 0.2                # Learning rate
gamma = 0.9                # Discounting rate
decay_rate = 0.0005        # Exponential decay rate for exploration prob
epsilon = 0.9              # Exploration rate
# NOTE(review): judging by its name, uncommenting the next line discards
# previously learned values before training -- confirm in qlearning.
#game.erase_training()
rewards = game.train(total_episodes, alpha, gamma, epsilon, decay_rate, logEvery=1000)
# Arithmetic mean of the values returned by game.train.
print("Total reward average:", sum(rewards)/len(rewards))
# Number of entries currently held in game.qtable.
print(len(game.qtable))

In [None]:
def action_function(state):
    """Return the trained action for a raw environment observation.

    The observation is discretised with state_function and the result is
    passed to game.q_trained_action.
    """
    return game.q_trained_action(state_function(state))


# NOTE(review): the leading 5 is presumably the number of runs to render --
# confirm against visualize_computer_playing in qlearning.
visualize_computer_playing(5, env, action_function)

## Matplotlib display

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [None]:
# Lower and upper bounds of the two observation dimensions.
(xmin, ymin), (xmax, ymax) = (
    tuple(env.observation_space.low),
    tuple(env.observation_space.high),
)
# Regular grid over the observation space: step 0.05 on the first axis and
# 0.005 on the second (upper bounds excluded by np.arange).
X = list(np.arange(xmin, xmax, 0.05))
Y = list(np.arange(ymin, ymax, 0.005))
# One matplotlib colour letter per action index.
colors = 'bgry'

In [None]:
def action_function(state):
    """Return the trained action for a raw (x, y) grid point.

    The point is discretised with state_function and the result is passed to
    game.q_trained_action.
    """
    return game.q_trained_action(state_function(state))


# Colour every grid point by the action chosen there. The points are
# accumulated first and drawn with a single scatter call: issuing one call per
# point creates a separate artist for each point and renders slowly.
point_x, point_y, point_colors = [], [], []
for x in X:
    for y in Y:
        point_x.append(x)
        point_y.append(y)
        point_colors.append(colors[action_function((x, y))])
plt.scatter(point_x, point_y, c=point_colors, label='.')
plt.show()

# Using Keras Q-training

In [None]:
import importlib
import qlearning
importlib.reload(qlearning)
import gym

In [None]:
env = gym.make("MountainCar-v0")
# Redefine rewards
import types

# Keep the original bound methods reachable so the replacement reset/step
# defined below can delegate to them.
env.reset_backup = env.reset
env.step_backup = env.step
# Per-episode step counter maintained by the replacement reset/step.
env.episode_step = 0

def reset(self):
    """Begin a new episode.

    Zeroes the per-episode step counter, then delegates to the saved
    original reset (``reset_backup``) and returns its result unchanged.
    """
    self.episode_step = 0
    initial_observation = self.reset_backup()
    return initial_observation
    
def step(self, action):
    """Advance one step through the saved original step and adjust the reward.

    The step counter is incremented before delegating to ``step_backup``.
    When the episode ends before the counter reaches 200, the reward is
    replaced by ``10 + 200 - episode_step``; otherwise it is passed through.

    Returns the ``(state, reward, done, info)`` tuple.
    """
    self.episode_step += 1
    outcome = self.step_backup(action)
    state, reward, done, info = outcome
    finished_early = done and self.episode_step < 200
    if finished_early:
        # Earlier finishes earn a larger bonus.
        reward = 210 - self.episode_step
    return state, reward, done, info

# Bind the replacement functions to this env instance so that calls to
# env.reset()/env.step() go through the wrappers defined above.
env.reset = types.MethodType(reset, env)
env.step = types.MethodType(step, env)

In [None]:
from collections import deque
import numpy as np

class DoneMemory():
    """Replay store that keeps terminal and non-terminal experiences apart.

    Experiences are 5-tuples ``(state, action, reward, done, next_state)``.
    Those with a truthy ``done`` go to ``mem_done``, all others to
    ``mem_not_done``. Sampling draws at most a tenth of the batch from
    ``mem_done``.
    """

    def __init__(self):
        # NOTE(review): the integer arguments are presumably capacities --
        # confirm against qlearning.Memory.
        self.mem_done = qlearning.Memory(2000)
        self.mem_not_done = qlearning.Memory(200000)

    def add(self, experience):
        """Store ``experience`` in the memory matching its ``done`` flag."""
        _, _, _, done, _ = experience
        target = self.mem_done if done else self.mem_not_done
        target.add(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` samples, non-terminal ones first.

        The number of terminal samples is the smaller of ``len(mem_done)``
        and ``int(batch_size / 10)``; the rest come from ``mem_not_done``.
        """
        done_quota = int(batch_size / 10)
        done_count = min(len(self.mem_done), done_quota)
        batch = self.mem_not_done.sample(batch_size - done_count)
        batch += self.mem_done.sample(done_count)
        return batch

In [None]:
# Fresh GamePlayer for the patched environment (no state function is passed).
game = qlearning.GamePlayer(env)
# Optional: uncomment to swap in the DoneMemory replay store defined above.
#game.memory = DoneMemory()

In [None]:
def play_function(state):
    """Scripted policy that ignores ``state`` and replays a fixed schedule.

    Each call advances the counter ``play_function.i`` (wrapping to 0 once it
    exceeds 130) and returns the action for the current counter value:
    0 below 20, 2 below 50, 0 below 90, and 2 from 90 onwards.
    """
    tick = play_function.i + 1
    if tick > 130:
        tick = 0
    play_function.i = tick
    # (exclusive upper bound on the counter, action to return)
    for upper_bound, action in ((20, 0), (50, 2), (90, 0)):
        if tick < upper_bound:
            return action
    return 2


# The call counter lives on the function object itself.
play_function.i = 0

# Render the scripted policy defined above.
# NOTE(review): the leading 1 is presumably the number of runs -- confirm in qlearning.
qlearning.visualize_computer_playing(1, env, play_function)

In [None]:
# Train with the scripted play_function supplied as the policy, requesting
# Q-model training only (trainQModel=True).
# NOTE(review): the meaning of the leading 30 is not visible here -- confirm in qlearning.
game.off_policy_model_train(30, play_function, layers_size=[24, 24, 24, 24, 24, 24], logEvery=100, trainQModel=True)
# Render one run driven by the trained Keras Q-model action.
qlearning.visualize_computer_playing(1, env, game.keras_qtrained_action)

In [None]:
import numpy as np

# Inspect a sample of the replay memory: print terminal transitions with a
# non-positive reward, and every transition with a non-negative reward,
# together with the Q-model output for the transition's state.
n_features = env.observation_space.shape[0]
batch = game.memory.sample(2000)
for state, action, reward, done, nstate in batch:
    failed_terminal = done and reward <= 0
    non_negative = reward >= 0
    if not (failed_terminal or non_negative):
        # Nothing would be printed for this transition, so skip the costly
        # model prediction entirely.
        continue
    prediction = game.qModel.predict(np.array(state).reshape(1, n_features))
    if failed_terminal:
        print(state, action, nstate, reward, game.keras_qtrained_action(state), prediction)
    if non_negative:
        print("Wow", state, action, reward, done,
              game.keras_qtrained_action(state), game.keras_qtrained_action(nstate), prediction)

In [None]:
# Arguments for the Keras Q-training run.
# NOTE(review): what N controls is not visible in this file -- confirm in qlearning.
N = 200
total_episodes = 400
game.keras_qTrain(N, total_episodes, alpha=0.001, gamma=0.9, layers_size=[50, 250, 50],
                      decay_rate=0.9995, epsilon=0.5, logEvery=20)

In [None]:
# Render the policy given by game.keras_qtrained_action.
# NOTE(review): the leading 5 is presumably the number of runs -- confirm in qlearning.
qlearning.visualize_computer_playing(5, env, game.keras_qtrained_action)

## Matplotlib display

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [None]:
# Lower and upper bounds of the two observation dimensions.
(xmin, ymin), (xmax, ymax) = (
    tuple(env.observation_space.low),
    tuple(env.observation_space.high),
)
# Regular grid over the observation space: step 0.05 on the first axis and
# 0.005 on the second (upper bounds excluded by np.arange).
X = list(np.arange(xmin, xmax, 0.05))
Y = list(np.arange(ymin, ymax, 0.005))
# One matplotlib colour letter per action index.
colors = 'bgry'

In [None]:
# Colour every grid point by the action game.keras_qtrained_action picks
# there. The points are accumulated first and drawn with a single scatter
# call: issuing one call per point creates a separate artist for each point
# and renders slowly.
point_x, point_y, point_colors = [], [], []
for x in X:
    for y in Y:
        point_x.append(x)
        point_y.append(y)
        point_colors.append(colors[game.keras_qtrained_action((x, y))])
plt.scatter(point_x, point_y, c=point_colors, label='o')
plt.show()

# Deep Q-Learning with Model evaluation

In [None]:
import importlib
import qlearning
importlib.reload(qlearning)
import gym

In [None]:
env = gym.make("MountainCar-v0")
# Redefine rewards
import types

# Keep the original bound methods reachable so the replacement reset/step
# defined below can delegate to them.
env.reset_backup = env.reset
env.step_backup = env.step
# Per-episode step counter maintained by the replacement reset/step.
env.episode_step = 0

def reset(self):
    """Begin a new episode.

    Zeroes the per-episode step counter, then delegates to the saved
    original reset (``reset_backup``) and returns its result unchanged.
    """
    self.episode_step = 0
    initial_observation = self.reset_backup()
    return initial_observation
    
def step(self, action):
    """Advance one step through the saved original step and adjust the reward.

    The step counter is incremented before delegating to ``step_backup``.
    When the episode ends before the counter reaches 200, the reward is
    replaced by ``10 + 200 - episode_step``; otherwise it is passed through.

    Returns the ``(state, reward, done, info)`` tuple.
    """
    self.episode_step += 1
    outcome = self.step_backup(action)
    state, reward, done, info = outcome
    finished_early = done and self.episode_step < 200
    if finished_early:
        # Earlier finishes earn a larger bonus.
        reward = 210 - self.episode_step
    return state, reward, done, info

# Bind the replacement functions to this env instance so that calls to
# env.reset()/env.step() go through the wrappers defined above.
env.reset = types.MethodType(reset, env)
env.step = types.MethodType(step, env)

In [None]:
# Fresh GamePlayer for the patched environment (no state function is passed).
game = qlearning.GamePlayer(env)

In [None]:
def play_function(state):
    """Scripted policy that ignores ``state`` and replays a fixed schedule.

    Each call advances the counter ``play_function.i`` (wrapping to 0 once it
    exceeds 130) and returns the action for the current counter value:
    0 below 20, 2 below 50, 0 below 90, and 2 from 90 onwards.
    """
    tick = play_function.i + 1
    if tick > 130:
        tick = 0
    play_function.i = tick
    # (exclusive upper bound on the counter, action to return)
    for upper_bound, action in ((20, 0), (50, 2), (90, 0)):
        if tick < upper_bound:
            return action
    return 2


# The call counter lives on the function object itself.
play_function.i = 0

# Render the scripted policy defined above.
# NOTE(review): the leading 1 is presumably the number of runs -- confirm in qlearning.
qlearning.visualize_computer_playing(1, env, play_function)

In [None]:
# Train with the scripted play_function supplied as the policy, requesting
# both transition-model and Q-model training.
# NOTE(review): the meaning of the leading 30 is not visible here -- confirm in qlearning.
game.off_policy_model_train(30, play_function, layers_size=[24, 24, 24, 24, 24, 24], logEvery=100, 
                            trainTransitionModel=True, trainQModel=True)
# Render one run driven by game.keras_qtrained_modelTrained_action.
qlearning.visualize_computer_playing(1, env, game.keras_qtrained_modelTrained_action)

In [None]:
# Arguments for the Keras Q-training run that also trains a transition model.
# NOTE(review): what N controls is not visible in this file -- confirm in qlearning.
N = 200
total_episodes = 400
game.keras_qTrain_modelTrain(N, total_episodes, alpha=0.001, gamma=0.9, layers_size=[50, 250, 50],
                      decay_rate=0.9995, epsilon=0.5, logEvery=20)

In [None]:
# Render the policy given by game.keras_qtrained_modelTrained_action.
# NOTE(review): the leading 3 is presumably the number of runs -- confirm in qlearning.
qlearning.visualize_computer_playing(3, env, game.keras_qtrained_modelTrained_action)

In [None]:
import numpy as np

# Inspect a sample of the replay memory: for terminal transitions with a
# non-positive reward, and for every transition with a non-negative reward,
# print the Q-model output and the transition model's predicted next state.
n_features = env.observation_space.shape[0]
batch = game.memory.sample(2000)
for state, action, reward, done, nstate in batch:
    failed_terminal = done and reward <= 0
    non_negative = reward >= 0
    if not (failed_terminal or non_negative):
        # Nothing would be printed for this transition, so skip both costly
        # model predictions entirely.
        continue
    # Row vectors: the state alone feeds the Q-model, state + action feed the
    # transition model.
    S = np.array(state).reshape(1, n_features)
    A = np.array(action).reshape(1, 1)
    prediction = game.qModel.predict(S)
    vstate = game.transitionModel.predict(np.concatenate((S, A), axis=1))[0]
    if failed_terminal:
        print(state, action, nstate, reward, game.keras_qtrained_modelTrained_action(state), prediction)
        # Absolute difference between predicted next state and current state.
        print("V(S)", vstate, [vstate[0]-state[0], vstate[1]-state[1]] )
    if non_negative:
        # Difference relative to the current state, per component.
        print("V(S), next_state", vstate, nstate, [(vstate[0]-state[0])/state[0], (vstate[1]-state[1])/state[1]] )
        # NOTE(review): this line mixes keras_qtrained_action (for state) with
        # keras_qtrained_modelTrained_action (for nstate) -- confirm intended.
        print("Wow", state, action, reward, done,
              game.keras_qtrained_action(state), game.keras_qtrained_modelTrained_action(nstate), prediction)

## Matplotlib display

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [None]:
# Lower and upper bounds of the two observation dimensions.
(xmin, ymin), (xmax, ymax) = (
    tuple(env.observation_space.low),
    tuple(env.observation_space.high),
)
# Regular grid over the observation space: step 0.05 on the first axis and
# 0.005 on the second (upper bounds excluded by np.arange).
X = list(np.arange(xmin, xmax, 0.05))
Y = list(np.arange(ymin, ymax, 0.005))
# One matplotlib colour letter per action index.
colors = 'bgry'

In [None]:
# Colour every grid point by the action
# game.keras_qtrained_modelTrained_action picks there. The points are
# accumulated first and drawn with a single scatter call: issuing one call per
# point creates a separate artist for each point and renders slowly.
point_x, point_y, point_colors = [], [], []
for x in X:
    for y in Y:
        point_x.append(x)
        point_y.append(y)
        point_colors.append(colors[game.keras_qtrained_modelTrained_action((x, y))])
plt.scatter(point_x, point_y, c=point_colors, label='o')
plt.show()

# Deep state value learning

In [None]:
import importlib
import qlearning
importlib.reload(qlearning)
import gym

In [None]:
env = gym.make("MountainCar-v0")
# Redefine rewards
import types

# Keep the original bound methods reachable so the replacement reset/step
# defined below can delegate to them.
env.reset_backup = env.reset
env.step_backup = env.step
# Per-episode step counter maintained by the replacement reset/step.
env.episode_step = 0

def reset(self):
    """Begin a new episode.

    Zeroes the per-episode step counter, then delegates to the saved
    original reset (``reset_backup``) and returns its result unchanged.
    """
    self.episode_step = 0
    initial_observation = self.reset_backup()
    return initial_observation
    
def step(self, action):
    """Advance one step through the saved original step and adjust the reward.

    The step counter is incremented before delegating to ``step_backup``.
    When the episode ends before the counter reaches 200, the reward is
    replaced by ``10 + 200 - episode_step``; otherwise it is passed through.

    Returns the ``(state, reward, done, info)`` tuple.
    """
    self.episode_step += 1
    outcome = self.step_backup(action)
    state, reward, done, info = outcome
    finished_early = done and self.episode_step < 200
    if finished_early:
        # Earlier finishes earn a larger bonus.
        reward = 210 - self.episode_step
    return state, reward, done, info

# Bind the replacement functions to this env instance so that calls to
# env.reset()/env.step() go through the wrappers defined above.
env.reset = types.MethodType(reset, env)
env.step = types.MethodType(step, env)

In [None]:
# Fresh GamePlayer for the patched environment (no state function is passed).
game = qlearning.GamePlayer(env)

In [None]:
# Arguments for the Keras state-value training run.
# NOTE(review): what N controls is not visible in this file -- confirm in qlearning.
N = 200
total_episodes = 400
game.keras_vTrain_modelTrain(N, total_episodes, alpha=0.001, gamma=0.9, layers_size=[50, 250, 50],
                      decay_rate=0.9995, epsilon=0.5, logEvery=20)

In [None]:
# Render the policy given by game.keras_vtrained_action.
# NOTE(review): the leading 3 is presumably the number of runs -- confirm in qlearning.
qlearning.visualize_computer_playing(3, env, game.keras_vtrained_action)

In [None]:
import numpy as np

# Inspect the terminal transitions in a sample of the replay memory and print
# the value model's output for both the state and its successor.
batch = game.memory.sample(2000)
for state, action, reward, done, nstate in batch:
    if not done:
        continue
    # Row vectors in the (1, 2) shape fed to the value model.
    s = np.array(state).reshape(1, 2)
    ns = np.array(nstate).reshape(1, 2)
    prediction = game.vModel.predict(s)[0][0]
    sprediction = game.vModel.predict(ns)[0][0]
    if reward < 0:
        chosen = game.keras_vtrained_action(state)
        print(state, action, nstate, reward, chosen, prediction, sprediction)
    if reward >= 0:
        chosen = game.keras_vtrained_action(state)
        print("Wow", state, action, reward, done, chosen, prediction, sprediction)

## Matplotlib display

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [None]:
# Lower and upper bounds of the two observation dimensions.
(xmin, ymin), (xmax, ymax) = (
    tuple(env.observation_space.low),
    tuple(env.observation_space.high),
)
# Regular grid over the observation space: step 0.05 on the first axis and
# 0.005 on the second (upper bounds excluded by np.arange).
X = list(np.arange(xmin, xmax, 0.05))
Y = list(np.arange(ymin, ymax, 0.005))
# One matplotlib colour letter per action index.
colors = 'bgry'

In [None]:
# Colour every grid point by the action game.keras_vtrained_action picks
# there. The points are accumulated first and drawn with a single scatter
# call: issuing one call per point creates a separate artist for each point
# and renders slowly.
point_x, point_y, point_colors = [], [], []
for x in X:
    for y in Y:
        point_x.append(x)
        point_y.append(y)
        point_colors.append(colors[game.keras_vtrained_action((x, y))])
plt.scatter(point_x, point_y, c=point_colors, label='o')
plt.show()