# Using Q-learning to solve the CartPole problem

In [5]:
import gym
import math
import numpy as np

In [6]:
"""
Base code taken from: 
https://github.com/IsaacPatole/CartPole-v0-using-Q-learning-SARSA-and-DNN/blob/master/Qlearning_for_cartpole.py
"""

class CartPoleQAgent():
    """Tabular Q-learning agent for the gym ``CartPole-v0`` environment.

    Each continuous observation is mapped onto a grid with ``buckets`` cells
    per dimension, and one Q-value per (grid cell, action) pair is kept in
    ``Q_table``.
    """

    def __init__(self, buckets=(3, 3, 6, 6),
                 num_episodes=200, min_lr=0.1,
                 min_epsilon=0.1, discount=1.0, decay=25):
        """Create the environment and a zero-initialized Q-table.

        Args:
            buckets: Number of discrete bins per observation dimension, in the
                order [position, velocity, angle, angular velocity].
            num_episodes: Number of episodes played by ``train()``.
            min_lr: Lower limit for the decaying learning rate.
            min_epsilon: Lower limit for the decaying exploration rate.
            discount: Discount factor applied to the bootstrapped Q-value.
            decay: Divisor of the episode number inside the logarithmic
                decay schedule used for epsilon and the learning rate.
        """
        self.buckets = buckets
        self.num_episodes = num_episodes
        self.min_lr = min_lr
        self.min_epsilon = min_epsilon
        self.discount = discount
        self.decay = decay

        self.env = gym.make('CartPole-v0')

        # This is the action-value function being initialized to 0's.
        # tuple() lets callers pass the bucket sizes as a list as well.
        self.Q_table = np.zeros(tuple(self.buckets) + (self.env.action_space.n,))

        # [position, velocity, angle, angular velocity]
        # Position and angle limits come from the environment; the two
        # velocity entries use hand-picked limits instead.
        self.upper_bounds = [self.env.observation_space.high[0], 0.5,
                             self.env.observation_space.high[2], math.radians(50) / 1.]
        self.lower_bounds = [self.env.observation_space.low[0], -0.5,
                             self.env.observation_space.low[2], -math.radians(50) / 1.]

        # Start from the episode-0 schedule values so that choose_action()
        # and update_q() are usable before train() has been called.
        self.learning_rate = self.get_learning_rate(0)
        self.epsilon = self.get_epsilon(0)

    def discretize_state(self, state):
        """Map a continuous observation onto Q-table indices.

        Each component is scaled linearly so that its lower bound maps to
        bucket 0 and its upper bound to the last bucket, rounded to the
        nearest bucket, and clamped into the valid index range.

        Args:
            state: Sequence of observation values, one per entry of
                ``self.buckets``.

        Returns:
            Tuple of integer bucket indices usable as a ``Q_table`` index.
        """
        discretized = []
        for value, low, high, n_buckets in zip(
                state, self.lower_bounds, self.upper_bounds, self.buckets):
            # Subtract the lower bound (rather than adding its absolute
            # value) so the mapping is also correct for positive bounds.
            scaling = (value - low) / (high - low)
            index = int(round((n_buckets - 1) * scaling))
            # Observations outside the bounds fall into the edge buckets.
            discretized.append(min(n_buckets - 1, max(0, index)))
        return tuple(discretized)

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``self.epsilon`` a random action is sampled from the
        environment's action space; otherwise the action with the highest
        Q-value for ``state`` is returned.
        """
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q_table[state])

    def update_q(self, state, action, reward, new_state):
        """Apply one Q-learning update to ``Q_table[state][action]``.

        The entry is moved by ``self.learning_rate`` towards the target
        ``reward + discount * max_a Q(new_state, a)``.
        """
        # NOTE(review): the target bootstraps from new_state even when the
        # step ended the episode, because the done flag is not passed in.
        target = reward + self.discount * np.max(self.Q_table[new_state])
        self.Q_table[state][action] += (
            self.learning_rate * (target - self.Q_table[state][action]))

    def get_epsilon(self, episode):
        """Return the exploration rate for ``episode``.

        The value follows ``1 - log10((episode + 1) / decay)``, clamped to
        the range [``self.min_epsilon``, 1.0], so exploration is high during
        the first episodes and falls to the minimum later on.
        """
        return max(self.min_epsilon, min(1., 1. - math.log10((episode + 1) / self.decay)))

    def get_learning_rate(self, episode):
        """Return the learning rate for ``episode``.

        Uses the same logarithmic schedule as ``get_epsilon``, clamped to the
        range [``self.min_lr``, 1.0].
        """
        return max(self.min_lr, min(1., 1. - math.log10((episode + 1) / self.decay)))

    def train(self):
        """Play ``self.num_episodes`` episodes, updating the Q-table each step.

        The score of every episode is printed as it finishes.

        Returns:
            List with the total reward of each episode, in order.
        """
        scores = []
        # Looping for each episode
        for e in range(self.num_episodes):
            # Initializes the state
            current_state = self.discretize_state(self.env.reset())

            # Both rates are fixed for the duration of one episode.
            self.learning_rate = self.get_learning_rate(e)
            self.epsilon = self.get_epsilon(e)
            done = False

            episode_score = 0

            # Do steps until we're done
            while not done:
                # Choose action from the state
                action = self.choose_action(current_state)
                # Do the selected action
                obs, reward, done, _ = self.env.step(action)
                new_state = self.discretize_state(obs)
                # Update the Q_table
                self.update_q(current_state, action, reward, new_state)
                current_state = new_state

                episode_score += reward

            scores.append(episode_score)
            print(f"{episode_score} score for episode {e+1}")
        print('Finished training!')
        return scores

    def run(self):
        """Play one rendered episode with the greedy policy and print its score.

        The environment is wrapped in a ``Monitor`` writing to the
        ``cartpole`` directory and is closed when the episode ends, also if
        a step raises.
        """
        self.env = gym.wrappers.Monitor(self.env, 'cartpole', force=True)
        score = 0.0
        try:
            current_state = self.discretize_state(self.env.reset())
            done = False
            while not done:
                self.env.render()
                # Always take the best known action: the displayed episode
                # should show the learned policy without random exploration.
                action = np.argmax(self.Q_table[current_state])
                obs, reward, done, _ = self.env.step(action)
                score += reward
                current_state = self.discretize_state(obs)

            print(f"Score: {score}")
        finally:
            self.env.close()

### Train agent and run
The following cells train the `CartPoleQAgent` and then run one visualized episode.

In [7]:
# Build an agent with the default hyperparameters and train its Q-table.
agent = CartPoleQAgent()
agent.train()

18.0 score for episode 1
24.0 score for episode 2
22.0 score for episode 3
37.0 score for episode 4
27.0 score for episode 5
15.0 score for episode 6
18.0 score for episode 7
9.0 score for episode 8
40.0 score for episode 9
51.0 score for episode 10
16.0 score for episode 11
13.0 score for episode 12
19.0 score for episode 13
12.0 score for episode 14
18.0 score for episode 15
15.0 score for episode 16
20.0 score for episode 17
10.0 score for episode 18
26.0 score for episode 19
40.0 score for episode 20
16.0 score for episode 21
21.0 score for episode 22
25.0 score for episode 23
31.0 score for episode 24
10.0 score for episode 25
33.0 score for episode 26
10.0 score for episode 27
28.0 score for episode 28
10.0 score for episode 29
25.0 score for episode 30
39.0 score for episode 31
17.0 score for episode 32
50.0 score for episode 33
9.0 score for episode 34
20.0 score for episode 35
13.0 score for episode 36
18.0 score for episode 37
23.0 score for episode 38
60.0 score for episode 

In [8]:

# Play one rendered episode with the agent and print its score.
agent.run()

Score: 200.0
