# Play MountainCar

In [None]:
import gym
# Instantiate the MountainCar environment. `env` is not referenced again in
# the visible cells: GamePlayer creates its own environment.
env = gym.make("MountainCar-v0")

# Create a class to play the game

In [None]:
import bisect
import math, random

import gym
import numpy as np

class GamePlayer:
    """Tabular Q-learning player for gym's ``MountainCar-v0`` environment.

    Each of the two observation components is discretised into equal-width
    buckets, and the Q-table is indexed by
    ``(x_bucket, y_bucket, speed, action)``.  ``speed`` is the truncated
    Euclidean distance, in buckets, covered by the most recent step.

    NOTE(review): the code relies on the classic gym API (``reset()`` returns
    the observation, ``step()`` returns a 4-tuple) -- confirm against the
    installed gym version.
    """

    def __init__(self, n_bins=100, speed_range=100):
        """Create the environment, the discretisation grid and an empty Q-table.

        Args:
            n_bins: number of buckets per observation component.
            speed_range: size of the Q-table's speed axis; valid speeds are
                ``0 .. speed_range - 1``.
        """
        self.env = gym.make("MountainCar-v0")
        self.obs_space = self.env.observation_space
        low, high = self.obs_space.low, self.obs_space.high
        x_step = (high[0] - low[0]) / n_bins
        y_step = (high[1] - low[1]) / n_bins
        # Lower edge of every bucket, in ascending order.
        self.x_range = [low[0] + j * x_step for j in range(n_bins)]
        self.y_range = [low[1] + j * y_step for j in range(n_bins)]
        self.speed_range = speed_range
        self.erase_training()

    @staticmethod
    def _bucket(edges, value):
        """Return the index of the last edge that is <= ``value`` (at least 0)."""
        return max(bisect.bisect_right(edges, value) - 1, 0)

    def find_square(self, x, y):
        """Map a continuous observation ``(x, y)`` to its grid cell ``(i, j)``.

        Raises:
            IndexError: if either coordinate lies outside the observation space.
        """
        low, high = self.obs_space.low, self.obs_space.high
        if x > high[0] or x < low[0]:
            raise IndexError(f"x={x} is outside the observation space")
        if y > high[1] or y < low[1]:
            raise IndexError(f"y={y} is outside the observation space")
        return (self._bucket(self.x_range, x), self._bucket(self.y_range, y))

    def erase_training(self):
        """Reset the Q-table to all zeros."""
        self.qtable = np.zeros(
            (
                len(self.x_range),
                len(self.y_range),
                self.speed_range,
                self.env.action_space.n,
            )
        )

    def start_game(self, render=False):
        """Reset the environment and the player's cell and speed.

        Args:
            render: when true, render the environment after the reset.
        """
        state = self.env.reset()
        self.state = self.find_square(state[0], state[1])
        self.speed = 0
        if render:
            self.env.render()

    def computer_play_step(self):
        """Play the greedy action for the current cell and speed.

        Returns:
            The ``(new_state, reward, done, info)`` tuple of the step, so the
            caller can detect the end of the episode.
        """
        action = np.argmax(self.qtable[self.state[0], self.state[1], self.speed, :])
        return self.play_game_step(action)

    def play_game_step(self, action, render=True):
        """Apply ``action`` and update the player's cell and speed.

        Args:
            action: action passed to ``env.step``.
            render: when true, render the environment after the step.

        Returns:
            ``(new_state, reward, done, info)`` where ``new_state`` is the grid
            cell of the new observation and the rest comes from ``env.step``.
        """
        new_state, reward, done, info = self.env.step(action)
        new_state = self.find_square(new_state[0], new_state[1])
        grad_x = new_state[0] - self.state[0]
        grad_y = new_state[1] - self.state[1]
        # Truncated grid distance covered by this step.
        self.speed = int(math.hypot(grad_x, grad_y))
        self.state = new_state
        if render:
            self.env.render()
        return new_state, reward, done, info

    def end_game(self):
        """Close the environment."""
        self.env.close()

    def train(self, total_episodes, learning_rate, max_steps, gamma, decay_rate):
        """Fill the Q-table with epsilon-greedy Q-learning.

        Args:
            total_episodes: number of episodes to play.
            learning_rate: step size of each Q-table update.
            max_steps: maximum number of steps per episode.
            gamma: discount factor applied to the best next-state value.
            decay_rate: exponential decay rate of the exploration probability.

        Raises:
            IndexError: if a step produces a speed outside the Q-table's speed
                axis, or an observation outside the observation space.
        """
        max_epsilon = 1.0   # exploration probability at start
        min_epsilon = 0.01  # minimum exploration probability
        epsilon = max_epsilon

        for episode in range(total_episodes):
            # Reset the environment together with self.state / self.speed, so
            # the first step's speed is not measured from the previous episode.
            self.start_game(False)
            state = self.state
            max_right = state[0]

            for _ in range(max_steps):
                # Speed component of the state the action is chosen in.
                speed = self.speed

                # Exploit the Q-table with probability 1 - epsilon, else explore.
                if random.uniform(0, 1) > epsilon:
                    action = np.argmax(self.qtable[state[0], state[1], speed, :])
                else:
                    action = self.env.action_space.sample()

                if action > 2:
                    print("c'est bizarre", action)

                new_state, reward, done, _ = self.play_game_step(action, False)
                # Valid indices on the speed axis are 0 .. speed_range - 1.
                if self.speed >= self.speed_range:
                    print("speed limit", self.speed)
                    raise IndexError

                # Replace the environment's -1 with a speed-based reward, five
                # times larger when the car reaches a new right-most bucket.
                if reward == -1:
                    if new_state[0] > max_right:
                        reward = 5 * self.speed
                        max_right = new_state[0]
                    else:
                        reward = self.speed

                # Q(s,a) := Q(s,a) + lr * [R + gamma * max_a' Q(s',a') - Q(s,a)]
                # s uses the speed before the step, s' the speed after it.
                best_next = np.max(
                    self.qtable[new_state[0], new_state[1], self.speed, :]
                )
                current = self.qtable[state[0], state[1], speed, action]
                self.qtable[state[0], state[1], speed, action] = current + (
                    learning_rate * (reward + gamma * best_next - current)
                )

                # Move on to the new state.
                state = new_state

                if done:
                    break

            # Reduce epsilon (less and less exploration is needed).
            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
                -decay_rate * episode
            )

In [None]:
# Hyper-parameters of the Q-learning run.
total_episodes = 6000   # number of training episodes
learning_rate = 0.7     # step size of each Q-table update
max_steps = 999         # upper bound on the steps of one episode
gamma = 0.9             # discount factor
decay_rate = 0.005      # exponential decay rate of the exploration probability

# Build a player, make sure its Q-table is empty, then train it.
game = GamePlayer()
game.erase_training()
game.train(
    total_episodes=total_episodes,
    learning_rate=learning_rate,
    max_steps=max_steps,
    gamma=gamma,
    decay_rate=decay_rate,
)

In [None]:
# Let the trained player run a few rendered episodes.
max_steps = 999
for episode in range(5):
    game.start_game()
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Take the action that has the maximum expected future reward for the
        # player's current cell and speed.
        cell_x, cell_y = game.state
        action = np.argmax(game.qtable[cell_x, cell_y, game.speed, :])
        _, _, done, _ = game.play_game_step(action)
        # Stop stepping once the environment reports the episode is over.
        if done:
            break
game.end_game()

# Manually play game

In [None]:
from IPython.display import display
import ipywidgets as widgets
from functools import wraps
def yield_for_change(widget):
    """Decorator factory that drives a generator with clicks on ``widget``.

    The decorated generator function is started when the wrapper is called and
    runs up to its first ``yield``.  Every later click on ``widget`` sends the
    click callback's argument into the generator, resuming it until the next
    ``yield``.  When the generator finishes, the click handler is removed.

    Args:
        widget: object exposing ``on_click(callback, remove=False)``.

    Returns:
        A decorator turning a generator function into a no-argument callable.
    """
    def f(iterator):
        @wraps(iterator)
        def inner():
            i = iterator()

            def next_i(change):
                try:
                    i.send(change)
                except StopIteration:
                    # Registered with on_click, so it is removed the same way.
                    widget.on_click(next_i, remove=True)

            # Start the generator; if it ends without ever yielding there is
            # nothing to wait for, so no handler is registered.
            try:
                next(i)
            except StopIteration:
                return
            widget.on_click(next_i)
        return inner
    return f

# One button per manual action, "stop" to close the game, and "validate" to
# advance the game by one step.
gauche = widgets.Button(description="gauche")
droite = widgets.Button(description="droite")
middle = widgets.Button(description="middle")
stop = widgets.Button(description="stop")
# Only the `widgets` module is imported, so Button must be qualified.
button = widgets.Button(description="validate")
display(button, gauche, droite, middle, stop)

# Action applied by the next game step; the click handlers overwrite it.
action = 1
game = GamePlayer()
def on_gauche(b):
    """Click handler: select the "push left" action for the next step."""
    # Without `global`, the assignment would only create a local variable.
    global action
    print("gauche")
    # MountainCar action codes: 0 = push left, 1 = no push, 2 = push right.
    action = 0

def on_droite(b):
    """Click handler: select the "push right" action for the next step."""
    # Without `global`, the assignment would only create a local variable.
    global action
    print("droite")
    # MountainCar action codes: 0 = push left, 1 = no push, 2 = push right.
    action = 2
    
def on_middle(b):
    """Click handler: select the "no push" action for the next step."""
    # Without `global`, the assignment would only create a local variable.
    global action
    print("millieu")
    # MountainCar action codes: 0 = push left, 1 = no push, 2 = push right.
    action = 1
    
def on_stop(b):
    """Click handler: announce the end of the game and close its environment."""
    print("game over!")
    game.end_game()

# Attach every handler to its button, then reset the game for manual play.
for clickable, handler in (
    (gauche, on_gauche),
    (droite, on_droite),
    (middle, on_middle),
    (stop, on_stop),
):
    clickable.on_click(handler)

game.start_game()
@yield_for_change(button)
def play():
    """Play up to 10 manual steps, one per click on ``button``.

    Each step applies the current value of the global ``action``.  The
    generator pauses after every step until the next click and stops early
    once the environment reports that the episode is over.
    """
    for i in range(10):
        _, _, done, _ = game.play_game_step(action)
        print('did work %s'%i)
        # Wait for the next click.  No extra handler is registered here:
        # doing so on every iteration would pile up callbacks on the button.
        yield
        if done:
            break


play()