# Play MountainCar

In [None]:
from gym.utils import play
import gym
# Interactive session: hand the environment to gym's play utility.
env = gym.make("MountainCar-v0")
try:
    play.play(env)
finally:
    # Release the environment even if the session ends with an error or interrupt.
    env.close()

# Q Learning without state decomposition

In [None]:
import gym

import random
from math import log10, floor
import numpy as np

def round_to(x, sig_figs):
    """Round ``x`` to ``sig_figs`` significant figures.

    Zero has no order of magnitude (``log10(0)`` is undefined), so it is
    returned directly as ``0.0``. The sign of ``x`` does not matter for
    locating the leading digit because only ``abs(x)`` is inspected.
    """
    if x == 0.0:
        return 0.0
    # Decimal exponent of the leading digit: 0 for [1, 10), -1 for [0.1, 1), ...
    leading_exponent = floor(log10(abs(x)))
    # A negative ``ndigits`` makes ``round`` work left of the decimal point.
    ndigits = -int(leading_exponent - (sig_figs - 1))
    return round(x, ndigits)

class GamePlayer:
    """Tabular Q-learning agent for the gym ``MountainCar-v0`` environment.

    Observations are discretised by rounding (see ``Q``) and the action
    values live in a dictionary that grows as new states are visited.
    The code targets the classic gym API in which ``reset()`` returns the
    observation and ``step()`` returns ``(obs, reward, done, info)``.
    """

    def __init__(self):
        # Environment the agent acts in.
        self.env = gym.make("MountainCar-v0")
        # Discretised state key -> list holding one Q-value per action.
        self.qtable = {}
        # Most recent observation; None until the environment has been reset.
        self.state = None

    def erase_training(self):
        """Forget every learned Q-value."""
        self.qtable = {}

    def Q(self, state):
        """Return the mutable list of action values for ``state``.

        The two observation components are shifted and rounded to two
        significant figures to form the table key. Unseen keys are
        initialised with one uniform random value in [0, 1] per action.
        Callers may modify the returned list in place to update the table.
        """
        # A tuple keeps both components separate; adding them into a single
        # float could make different buckets share a key through rounding.
        key = (
            round_to(100000.0 * (state[0] + 2.0), 2),
            round_to(1.0 + state[1], 2),
        )
        if key not in self.qtable:
            self.qtable[key] = [
                random.uniform(0, 1) for _ in range(self.env.action_space.n)
            ]
        return self.qtable[key]

    def epison_q_action(self, state, epsilon):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        With probability ``1 - epsilon`` the action with the highest Q-value
        is returned (exploitation); otherwise a random action is sampled
        from the environment's action space (exploration).
        """
        if random.uniform(0, 1) > epsilon:
            return np.argmax(self.Q(state))
        return self.env.action_space.sample()

    def start_game(self, render=False):
        """Reset the environment and remember the initial observation.

        The observation is stored in ``self.state`` so that
        ``computer_play_step`` acts on the state of the new episode rather
        than on whatever was left over from a previous one.
        """
        self.state = self.env.reset()
        if render:
            self.env.render()

    def computer_play_step(self):
        """Play one rendered step using the greedy action for ``self.state``.

        Returns the ``(new_state, reward, done, info)`` tuple of the step.
        """
        action = np.argmax(self.Q(self.state))
        return self.play_game_step(action)

    def play_game_step(self, action, render=True):
        """Apply ``action``, store the new observation and optionally render.

        Returns the ``(new_state, reward, done, info)`` tuple produced by the
        environment.
        """
        new_state, reward, done, info = self.env.step(action)
        self.state = new_state
        if render:
            self.env.render()
        return new_state, reward, done, info

    def end_game(self):
        """Close the environment."""
        self.env.close()

    def train(self, total_episodes, alpha, gamma, epsilon, decay_rate):
        """Run Q-learning for ``total_episodes`` episodes without rendering.

        Args:
            total_episodes: number of episodes to play.
            alpha: learning rate of the Q update.
            gamma: discount factor applied to the best next-state value.
            epsilon: exploration probability; it stays constant throughout.
            decay_rate: currently unused because epsilon decay is disabled;
                kept so existing callers continue to work.

        Raises:
            IndexError: if the chosen action is outside the action space.
        """
        n_actions = self.env.action_space.n

        for _ in range(total_episodes):
            state = self.env.reset()
            self.state = state
            done = False
            # Truthiness (not ``is False``) also handles NumPy booleans.
            while not done:
                action = self.epison_q_action(state, epsilon)
                if action >= n_actions:
                    raise IndexError(
                        "action %r is outside the action space" % (action,)
                    )

                # Take the action and observe the outcome state and reward.
                new_state, reward, done, _ = self.play_game_step(action, False)

                if reward > 0:
                    print("Youhou !!")

                # Q(s,a) := Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
                q_values = self.Q(state)
                best_next = max(self.Q(new_state))
                q_values[action] += alpha * (
                    reward + gamma * best_next - q_values[action]
                )

                # Advance, so the next choice and update use the state just reached.
                state = new_state

In [None]:
# Build the agent: its constructor creates the MountainCar-v0 environment and an empty Q-table.
game = GamePlayer()

In [None]:
# Hyperparameters for the Q-learning run.
total_episodes = 1500  # number of training episodes
alpha = 0.2            # learning rate
gamma = 0.9            # discount factor
epsilon = 0.1          # exploration probability
decay_rate = 0.5       # exponential decay rate for the exploration probability

# Uncomment to discard previously learned values before training:
# game.erase_training()
game.train(
    total_episodes=total_episodes,
    alpha=alpha,
    gamma=gamma,
    epsilon=epsilon,
    decay_rate=decay_rate,
)
# Number of distinct discretised states stored in the Q-table.
print(len(game.qtable))

In [None]:
# Replay the learned greedy policy for a few rendered episodes.
try:
    for episode in range(5):
        game.start_game()
        print("****************************************************")
        print("EPISODE ", episode)
        done = False
        # Truthiness (not ``is False``) also stops correctly on a NumPy boolean.
        while not done:
            # Take the action with the highest learned Q-value for the current state.
            new_state, reward, done, info = game.computer_play_step()
            # Checked after every step, mirroring the check in the training loop.
            if reward > 0:
                print("Youhou !!")
finally:
    # Close the environment even if the replay is interrupted.
    game.end_game()

In [None]:
# Display the Q-table: discretised state key -> list with one value per action.
game.qtable

In [None]:
# Print the observation bounds (upper, then lower) of the `env` created in the first cell.
print(env.observation_space.high, env.observation_space.low)