In [4]:
import gym
import numpy as np
import matplotlib as plt

In [5]:
# Create the Acrobot-v1 environment and reset it; the value returned by
# reset() is displayed as the cell output.
env = gym.make('Acrobot-v1')
env.reset()

array([ 0.99833847, -0.05762198,  0.99994075,  0.01088571, -0.01654869,
       -0.05597058])

In [7]:
# Preview the environment by driving it with 1000 random actions.
# Gym expects reset() to be called once step() has reported done=True.
# An Acrobot-v1 episode ends when the goal is reached or the step limit is
# hit, so a 1000-step rollout crosses an episode boundary and has to reset
# in between instead of stepping a finished episode.
env.reset()  # start from a fresh episode so this cell can be re-run on its own
for _ in range(1000):
    env.render()
    _obs, _reward, done, _info = env.step(env.action_space.sample())
    if done:
        env.reset()

env.close()

# Action and State 

Reinforcement Learning will learn a mapping of states to the optimal action to perform in that state by exploration, i.e. the agent explores the environment and takes actions based off rewards defined in the environment. [source](https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/)

- **Action**, input provided by the agent to the environment   
Here, a torque of -1, 0 or +1 applied to the joint between the two links (encoded as the discrete actions 0, 1 and 2)  
  
  
- **State**, numeric representation of what the agent is observing at a particular time in the environment  
Here, the state consists of the sin() and cos() of the two rotational joint angles and the joint angular velocities : [cos(theta1) sin(theta1) cos(theta2) sin(theta2) thetaDot1 thetaDot2]. For the first link, an angle of 0 corresponds to the link pointing downwards. The angle of the second link is relative to the angle of the first link. An angle of 0 corresponds to having the same angle between the two links. A state of [1, 0, 1, 0, ..., ...] means that both links point downwards.

In [10]:
# Draw a single frame of a freshly reset (random) initial state, then close
# the viewer again.
env.reset()
env.render()
env.close()

# Report what the agent can do (action space) and what it observes
# (observation space).
action_msg = "Action Space (number of input possibility by agent user) {}"
state_msg = "State Space (encoding of the curent state to be mapped) {}"
print(action_msg.format(env.action_space))
print(state_msg.format(env.observation_space))

Action Space (number of input possibility by agent user) Discrete(3)
State Space (encoding of the curent state to be mapped) Box(6,)


# Q-learning

![image.png](attachment:image.png)

Q-learning estimates the maximum expected cumulative reward for a given state–action pair, using two hyperparameters:
- learning rate 
- discount factor

The Q-learning equation maps each state–action pair to a value that combines the immediate reward with future rewards, i.e. for a newly visited state the learned value is the reward received plus the discounted estimate of the best future reward.

# Adapt the Qlearning function 

![image.png](attachment:image.png)

Adapted from the [mountain_car example](https://gist.github.com/gkhayes/3d154e0505e31d6367be22ed3da2e955).



In [88]:
# Size of the discretised state space: one bin per unit of (scaled) range
# along each of the six observation components, plus one so both ends of the
# range get a bin.
obs_span = env.observation_space.high - env.observation_space.low
bin_scale = np.array([1, 1, 1, 1, 1, 1])  # per-component multiplier applied to the range before rounding
num_states = np.round(obs_span * bin_scale, 0).astype(int) + 1
num_states

array([ 3,  3,  3,  3, 26, 58])

In [90]:
# Q-table holding one value per (bin of component 0, bin of component 1, action),
# filled with uniform random values drawn from [-1, 1).
# NOTE(review): only num_states[0] and num_states[1] size the table, so the
# other four observation components are not represented — confirm this is
# intended.
table_shape = (num_states[0], num_states[1], env.action_space.n)
Q = np.random.uniform(low=-1, high=1, size=table_shape)
Q

array([[[ 0.98355083,  0.39541981, -0.78016757],
        [-0.89961449, -0.35884831, -0.65678346],
        [ 0.67878278,  0.35797729,  0.29912764]],

       [[-0.1907013 , -0.51995776,  0.36075164],
        [ 0.5699781 , -0.66536113, -0.1319045 ],
        [-0.25085143,  0.66175485,  0.15472704]],

       [[-0.48174715,  0.30313676,  0.59120972],
        [-0.42586887,  0.14861999,  0.89364987],
        [ 0.0780876 , -0.25447075, -0.32706802]]])

In [91]:
# Reward bookkeeping: one list for per-episode totals, one for their averages.
reward_list, ave_reward_list = [], []

# Hyperparameters (1/2) :


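We define **epsilon**, the exploration rate: the probability of trying a random action instead of the best known one (set to 0.2 at the beginning in the code below).  
At each step a random number between 0 and 1 is drawn: if it is below 1 − **epsilon** the agent exploits the action with the highest Q-value, otherwise it explores by picking an action at random.  
Start = big **epsilon** (more exploration).  
Progressively = reduce **epsilon** as the agent estimates the Q-values more precisely (the lower the epsilon, the more often the best-known option is selected, at the risk of over-committing to it).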

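**min_eps**: the lower bound for epsilon; once epsilon has come down to it, it is no longer reduced.

**episodes**: the number of training episodes; epsilon is decreased by a constant amount after each one.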

In [92]:
# Epsilon-greedy exploration schedule.
episodes = 5000   # number of training episodes; also the horizon over which epsilon is decayed
min_eps = 0.05    # floor: epsilon is no longer reduced once it is at or below this value
epsilon = 0.2     # starting probability of picking a random action (20% exploration)

# Constant amount subtracted from epsilon after each episode (linear decay)
reduction = (epsilon - min_eps) / episodes

# 3 basic steps of Qlearning :

1. Agent starts in a state (s1) takes an action (a1) and receives a reward (r1)
2. Agent selects action by referencing Q-table with highest value (max) OR by random (epsilon, ε)
3. Update q-values

# Hyperparameters (2/2) :

**learning**: lr or learning rate (alpha in the equation, α), can simply be defined as how much you accept the new value vs the old value. Above we are taking the difference between new and old and then multiplying that value by the learning rate. This value then gets added to our previous q-value which essentially moves it in the direction of our latest update.

**discount**: (gamma in the equation, γ) The discount factor is used to balance immediate and future reward. We apply the discount to the future reward upon update. Typically this value can range anywhere from 0.8 to 0.99.

In [87]:
learning = 0.2  # learning rate (alpha): share of the TD error applied per update
discount = 0.9  # discount factor (gamma): weight of the estimated future reward


def discretize(observation):
    """Map a continuous observation to integer bin indices.

    Each component is shifted by the lower bound of the observation space and
    rounded to the nearest integer (unit bin width), mirroring how
    ``num_states`` was sized.
    """
    shifted = observation - env.observation_space.low
    return np.round(shifted, 0).astype(int)


# Run Q learning algorithm
try:
    for i in range(episodes):
        # Initialize parameters
        done = False
        tot_reward = 0
        state_adj = discretize(env.reset())

        while not done:
            # Render environment for last 5 episodes
            if i >= (episodes - 5):
                env.render()

            # NOTE(review): Q is indexed by the first two discretised
            # components only (cos/sin of the first joint angle); the second
            # joint and both angular velocities are ignored. This matches the
            # shape Q was created with, but probably explains why the average
            # reward stays near -500 — confirm and extend Q if intended.
            s0, s1 = state_adj[0], state_adj[1]

            # Determine next action - epsilon greedy strategy:
            # exploit with probability 1 - epsilon, explore otherwise
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[s0, s1])
            else:
                action = np.random.randint(0, env.action_space.n)

            # Get next state and reward
            state2, reward, done, info = env.step(action)

            # Discretize state2
            state2_adj = discretize(state2)

            # Allow for terminal states.
            # Acrobot gives -1 on every step that does not reach the goal and
            # 0 on the step that does, so `done` with a reward of 0 identifies
            # a genuine goal state. A time-limit cut-off (done with reward -1)
            # is not terminal and must still bootstrap from the next state.
            # The previous check (state2[0] >= 0.5) was the MountainCar
            # position test; here state2[0] is cos(theta1).
            goal_reached = done and reward == 0
            if goal_reached:
                Q[s0, s1, action] = reward

            # Adjust Q value for current state
            else:
                td_target = reward + discount * np.max(Q[state2_adj[0], state2_adj[1]])
                Q[s0, s1, action] += learning * (td_target - Q[s0, s1, action])

            # Update variables
            tot_reward += reward
            state_adj = state2_adj

        # Decay epsilon: explore less as the Q-value estimates improve
        if epsilon > min_eps:
            epsilon -= reduction

        # Track rewards
        reward_list.append(tot_reward)

        # Every 100 episodes: record and print the average, then start a new window
        if (i+1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}'.format(i+1, ave_reward))
finally:
    # Close the viewer even if training is interrupted
    env.close()

ave_reward_list

Episode 100 Average Reward: -499.11
Episode 200 Average Reward: -500.0
Episode 300 Average Reward: -498.82
Episode 400 Average Reward: -499.94
Episode 500 Average Reward: -500.0
Episode 600 Average Reward: -500.0
Episode 700 Average Reward: -500.0
Episode 800 Average Reward: -500.0
Episode 900 Average Reward: -499.43
Episode 1000 Average Reward: -498.03
Episode 1100 Average Reward: -497.9
Episode 1200 Average Reward: -500.0
Episode 1300 Average Reward: -500.0
Episode 1400 Average Reward: -500.0
Episode 1500 Average Reward: -500.0
Episode 1600 Average Reward: -500.0
Episode 1700 Average Reward: -500.0
Episode 1800 Average Reward: -500.0
Episode 1900 Average Reward: -500.0
Episode 2000 Average Reward: -498.46
Episode 2100 Average Reward: -499.82
Episode 2200 Average Reward: -500.0
Episode 2300 Average Reward: -500.0
Episode 2400 Average Reward: -498.92
Episode 2500 Average Reward: -500.0
Episode 2600 Average Reward: -499.29
Episode 2700 Average Reward: -500.0
Episode 2800 Average Reward:

[-499.5653650254669,
 -498.04761904761904,
 -498.37,
 -497.98,
 -500.0,
 -499.54,
 -500.0,
 -497.95,
 -498.82,
 -498.87,
 -499.16,
 -499.42,
 -499.25,
 -498.08,
 -495.39,
 -499.77,
 -498.06,
 -499.18,
 -499.17,
 -498.28,
 -499.91,
 -499.0,
 -498.83,
 -499.82,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -499.39,
 -499.77,
 -499.78,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -498.35,
 -498.23,
 -498.49,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -499.49,
 -498.83,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -498.32,
 -499.94,
 -499.95,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -496.47,
 -498.94,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -496.72,
 -500.0,
 -500.0,
 -499.85,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -498.77,
 -499.97,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -499.11,
 -500.0,
 -498.82,
 

In [76]:
# Per-episode reward totals collected since the last 100-episode average
# (the training loop empties this list every 100 episodes).
reward_list

[-500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 -500.0,
 