# Play MountainCar

In [None]:
from gym.utils import play
import gym
# Interactive session: hand the environment to gym.utils.play and make sure
# it is closed afterwards, even if the session raises or is interrupted.
env = gym.make("MountainCar-v0")
try:
    play.play(env)
finally:
    env.close()

# Q Learning without state decomposition

In [None]:
import gym

import random
from math import log10, floor
import numpy as np

def round_to(x, sig_figs):
    """Round ``x`` to ``sig_figs`` significant figures.

    Zero is returned as ``0.0`` because it has no order of magnitude.
    """
    if x == 0.0:
        return 0.0
    # Power of ten of the leading digit; the sign of x does not matter here.
    magnitude = floor(log10(abs(x)))
    decimals = -int(magnitude - (sig_figs - 1))
    return round(x, decimals)

class GamePlayer:
    """Tabular Q-learning agent for the MountainCar-v0 environment.

    Observations are bucketed by rounding (first component to 1 decimal,
    second component to 2 decimals). Q-values are stored in a dict keyed by
    bucket, and a bucket's row is created on first access.
    """

    # x-position at which MountainCar-v0 considers the episode solved.
    GOAL_POSITION = 0.5

    def __init__(self):
        self.env = gym.make("MountainCar-v0")
        # Maps (rounded state[0], rounded state[1]) -> list of per-action Q-values.
        self.qtable = {}

    def erase_training(self):
        """Forget everything learned so far."""
        self.qtable = {}

    def Q(self, state):
        """Return the mutable list of Q-values for the bucket containing ``state``.

        The row is created (all zeros, one entry per action) the first time a
        bucket is seen, so callers may update the returned list in place.
        """
        key = (round(state[0], 1), round(state[1], 2))
        if key not in self.qtable:
            self.qtable[key] = [0.0] * self.env.action_space.n
        return self.qtable[key]

    def epsilon_q_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        With probability roughly ``epsilon`` a random action is sampled from
        the action space (exploration); otherwise the action with the largest
        Q-value for ``state`` is returned (exploitation).
        """
        if random.uniform(0, 1) > epsilon:
            return np.argmax(self.Q(state))
        return self.env.action_space.sample()

    # Original (misspelled) name, kept so existing callers keep working.
    epison_q_action = epsilon_q_action

    def start_game(self, render=False):
        """Reset the environment and return the initial state."""
        state = self.env.reset()
        if render:
            self.env.render()
        return state

    def computer_play_step(self, state):
        """Take the greedy action for ``state`` (rendering the frame)."""
        action = np.argmax(self.Q(state))
        return self.play_game_step(action)

    def play_game_step(self, action, render=True):
        """Apply ``action`` and return ``(new_state, reward, done, info)``."""
        new_state, reward, done, info = self.env.step(action)
        if render:
            self.env.render()
        return new_state, reward, done, info

    def end_game(self):
        """Close the environment (and its render window, if any)."""
        self.env.close()

    def train(self, total_episodes, alpha, gamma, epsilon, decay_rate):
        """Run Q-learning for ``total_episodes`` episodes.

        Args:
            total_episodes: Number of episodes to play.
            alpha: Learning rate of the Q update.
            gamma: Discount factor.
            epsilon: Exploration probability used for the first episode only;
                afterwards it follows
                ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)``.
            decay_rate: Exponential decay rate of the exploration probability.

        Returns:
            A list with the mean total reward of every completed block of 100
            episodes (empty when fewer than 100 episodes are played).

        Raises:
            IndexError: If the chosen action is outside the action space.
        """
        max_epsilon = 0.8   # Exploration probability the decay schedule starts from
        min_epsilon = 0.01  # Floor of the exploration probability
        n_actions = self.env.action_space.n

        reward_list = []      # Episode returns of the current 100-episode window
        tot_reward_list = []  # One average per completed window
        for episode in range(total_episodes):
            state = self.env.reset()
            done = False
            tot_reward = 0
            while not done:
                action = self.epsilon_q_action(state, epsilon)
                if not 0 <= action < n_actions:
                    raise IndexError(
                        "action {} is outside the action space".format(action))

                new_state, reward, done, _ = self.play_game_step(action, False)

                # Q(s,a) += alpha * (target - Q(s,a)).  When the goal has been
                # reached there is no future to bootstrap from, so the target
                # is the reward alone.  An episode that merely ran out of time
                # (done without reaching the goal) still bootstraps.
                if done and new_state[0] >= self.GOAL_POSITION:
                    target = reward
                else:
                    target = reward + gamma * max(self.Q(new_state))
                q_values = self.Q(state)
                q_values[action] += alpha * (target - q_values[action])

                state = new_state
                tot_reward += reward
            reward_list.append(tot_reward)
            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

            if (episode + 1) % 100 == 0:
                ave_reward = np.mean(reward_list)
                tot_reward_list.append(ave_reward)
                reward_list = []
                print('Episode {} Average Reward: {}'.format(episode + 1, ave_reward))
        return tot_reward_list

In [None]:
# Build the agent: this opens a MountainCar-v0 environment and starts with an
# empty Q-table.
game = GamePlayer()

In [None]:
# Hyper-parameters for the training run.
total_episodes = 4000
alpha = 0.2                # Learning rate
gamma = 0.9                # Discounting rate
decay_rate = 0.5           # Exponential decay rate for exploration prob
epsilon = 0.8              # Exploration rate
# Uncomment to discard what earlier runs of this cell have learned:
# game.erase_training()
rewards = game.train(total_episodes, alpha, gamma, epsilon, decay_rate)
# train() reports one average per 100 episodes, so the list is empty for
# shorter runs; guard the division instead of raising ZeroDivisionError.
if rewards:
    print("Total reward average:", sum(rewards)/len(rewards))
else:
    print("Total reward average: n/a (fewer than 100 episodes)")
# Number of distinct discretized states visited during training.
print(len(game.qtable))

In [None]:
# Let the trained agent play five episodes greedily; computer_play_step
# renders every frame.
try:
    for episode in range(5):
        state = game.start_game()
        print("****************************************************")
        print("EPISODE ", episode)
        done = False
        tot_reward = 0
        # Truthiness test rather than `done is False`, so a non-builtin
        # boolean (e.g. a NumPy bool) cannot end the loop prematurely.
        while not done:
            # Take the action with the highest Q-value for the current state.
            state, reward, done, info = game.computer_play_step(state)
            tot_reward += reward
        print("Reward:", tot_reward)
finally:
    # Close the environment even if playback is interrupted.
    game.end_game()

In [None]:
# Dump the learned table: discretized state -> list of per-action Q-values.
print(game.qtable)

In [None]:
# Show the upper and lower bounds of the environment's observation space.
print(game.env.observation_space.high, game.env.observation_space.low)

In [None]:
import matplotlib.pyplot as plt
# Scatter every visited state bucket (first vs. second state component),
# coloured by the index of its highest-valued action.
x = [first for first, _ in game.qtable]
y = [second for _, second in game.qtable]
z = [np.argmax(q_values) for q_values in game.qtable.values()]

plt.scatter(x, y, c=z)

In [None]:
# Best Q-value of each bucket against its first state component, using the
# same colouring (greedy action) as the previous plot.
h = [max(q_values) for q_values in game.qtable.values()]
plt.scatter(x, h, c=z)

In [None]:
# code from https://gist.github.com/gkhayes/3d154e0505e31d6367be22ed3da2e955
import numpy as np
import gym
import matplotlib.pyplot as plt

# Create the Mountain Car environment and reset it to an initial state
env = gym.make('MountainCar-v0')
env.reset()

# Define Q-learning function
def QLearning(env, learning, discount, epsilon, min_eps, episodes):
    """Train a tabular Q-learning agent on ``env`` and close it afterwards.

    The two-dimensional observation is discretized on a grid with 10 cells
    per unit along the first dimension and 100 along the second.

    Args:
        env: Environment exposing ``observation_space.low/high``,
            ``action_space.n``, ``reset``, ``step``, ``render`` and ``close``.
        learning: Learning rate of the Q update.
        discount: Discount factor.
        epsilon: Initial exploration probability.
        min_eps: Value below which epsilon is no longer reduced.
        episodes: Number of episodes to run.

    Returns:
        List with the mean total reward of every completed block of 100
        episodes (empty when fewer than 100 episodes are run).
    """
    low = env.observation_space.low
    scale = np.array([10, 100])

    def discretize(observation):
        """Map a continuous observation to integer Q-table indices."""
        return np.round((observation - low) * scale, 0).astype(int)

    # Determine size of discretized state space
    num_states = discretize(env.observation_space.high) + 1

    # Initialize Q table
    Q = np.random.uniform(low=-1, high=1,
                          size=(num_states[0], num_states[1],
                                env.action_space.n))

    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []

    # Per-episode linear reduction in epsilon (nothing to reduce for a run
    # without episodes, which would otherwise divide by zero)
    reduction = (epsilon - min_eps) / episodes if episodes else 0.0

    # Run Q learning algorithm
    for i in range(episodes):
        done = False
        tot_reward = 0
        state_adj = discretize(env.reset())

        # Render environment for the last 20 episodes
        show = i >= (episodes - 20)

        while not done:
            if show:
                env.render()

            # Determine next action - epsilon greedy strategy
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_adj[0], state_adj[1]])
            else:
                action = np.random.randint(0, env.action_space.n)

            # Get next state and reward
            state2, reward, done, info = env.step(action)
            state2_adj = discretize(state2)

            if done and state2[0] >= 0.5:
                # Goal reached: terminal state, so there is no future value
                # to bootstrap from
                Q[state_adj[0], state_adj[1], action] = reward
            else:
                # Standard Q-learning update for the current state
                delta = learning * (reward +
                                    discount * np.max(Q[state2_adj[0],
                                                        state2_adj[1]]) -
                                    Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj

        # Decay epsilon
        if epsilon > min_eps:
            epsilon -= reduction

        # Track rewards and report once per 100 episodes
        reward_list.append(tot_reward)
        if (i + 1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}'.format(i+1, ave_reward))

    env.close()

    return ave_reward_list

# Train with the Q-learning routine defined above
rewards = QLearning(env, learning=0.2, discount=0.9, epsilon=0.8,
                    min_eps=0, episodes=5000)

# Each entry of `rewards` summarises 100 episodes, so place it at the episode
# count where that window ended, then save the curve to disk
plt.plot(100 * np.arange(1, len(rewards) + 1), rewards)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes')
plt.savefig('rewards.jpg')
plt.close() 