# DQN: CartPole

### Import Libraries

In [21]:
from collections import deque
import random

import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import clone_model

In [2]:
import tensorflow
tensorflow.__version__

'2.9.1'

### Load Environment

In [22]:
env = gym.make("CartPole-v1")

### Play random

In [4]:
# env.reset()

# n_step = 0

# for _ in range(100):
#     env.render()
#     action = env.action_space.sample()
#     obs, rew, done, info = env.step(action)
#     time.sleep(0.1)
    
#     n_step += 1
    
#     if done:
#         env.reset()
#         print(f"Resettato dopo n.{n_step} step.")
#         n_step = 0
        
# env.close()

### Observations

* Cart Position (-4.8, 4.8)
* Cart Velocity $(-\infty \text{ to } +\infty)$
* Pole Angle (-0.418 to +0.418)
* Pole Angular Velocity  $(-\infty \text{ to } +\infty)$

### Actions

* 0 - Move to the left
* 1 - Move to the right

### How to play manually

In [5]:
# action = 0
# k = 0

# def key_press(k, mod):
    
#     """
#     Get the key press for gym
#     """
    
#     global action
#     if k == key.LEFT:
#         action = 0
#     if k == key.RIGHT:
#         action = 1
        

# env.reset()
# rewards = 0

# for _ in range(1000):    
#     env.render()
#     env.viewer.window.on_key_press = key_press
    
#     observation, reward, done, info = env.step(action)
    
#     rewards += 1
    
#     if done:
#         print(f"Punteggio raggiunto {rewards}")
#         break
    
#     time.sleep(0.5)
    
# env.close()

### Build the Neural Network

In [6]:
num_actions = env.action_space.n
num_observation = env.observation_space.shape[0]

print(f"Ci sono n.{num_actions} azioni e n.{num_observation} osservazioni.")

Ci sono n.2 azioni e n.4 osservazioni.


In [23]:
model = Sequential()

model.add(Dense(16, input_shape=(1, num_observation), activation='relu'))
model.add(Dense(32, activation='relu'))

model.add(Dense(num_actions))

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 1, 16)             80        
                                                                 
 dense_4 (Dense)             (None, 1, 32)             544       
                                                                 
 dense_5 (Dense)             (None, 1, 2)              66        
                                                                 
Total params: 690
Trainable params: 690
Non-trainable params: 0
_________________________________________________________________
None


Il modello, dunque, prende come input l'osservazione e restituisce come output un valore per ogni azione. Maggiore è il valore e maggiormente l'azione sarà appropriata per quella particolare osservazione.
 
Le DQN lavorano meglio quando usate con una rete target.

In [8]:
target_model = clone_model(model)

### Hyperparameters and Update Functions

In [9]:
EPOCHS = 10

epsilon = 1.0
EPSILON_REDUCE = 0.995  # E' moltiplicata con epsilon ad ogni step
LEARNING_RATE = 0.001   # NON CORRISPONDE AL VALORE ALPHA DEL Q-LEARNING
GAMMA = 0.95

In [10]:
def epsilon_greedy_action_selection(model, epsilon, observation):
    
    random_number = np.random.random()
    
    if random_number > epsilon:                     # Scelta dell'azione appresa
        
        prediction = model.predict(observation.reshape([1, 1,4])) 
        action = np.argmax(prediction)              # Restituisce l'indice con il valore maggiore
    
    else:
        action = np.random.randint(0, env.action_space.n)
    
    return action

#### Replay Buffer

Per l'addestramento del modello è necessario un **replay buffer*** da usare come contenitore di dati. Il buffer non avrà valore infinito, avrà un valore finito che si aggiorenrà attraverso il metodo FIFO (First-In, First-Out). Per questo scopo è utile il _deque_.

In [11]:
deque_1 = deque(maxlen=5)

for i in range(10):
    deque_1.append(i)
    print(deque_1)

deque([0], maxlen=5)
deque([0, 1], maxlen=5)
deque([0, 1, 2], maxlen=5)
deque([0, 1, 2, 3], maxlen=5)
deque([0, 1, 2, 3, 4], maxlen=5)
deque([1, 2, 3, 4, 5], maxlen=5)
deque([2, 3, 4, 5, 6], maxlen=5)
deque([3, 4, 5, 6, 7], maxlen=5)
deque([4, 5, 6, 7, 8], maxlen=5)
deque([5, 6, 7, 8, 9], maxlen=5)


In [12]:
MAX_REPLAY_BUFFER = 20000
UPDATE_TARGET_MODEL_STEPS = 10

replay_buffer = deque(maxlen=MAX_REPLAY_BUFFER)
update_target_model = UPDATE_TARGET_MODEL_STEPS

#### Semplice tecnica per gestire i dati per l'action replay algorithm.

Utilizzo di * ovvero **Unpacking Argument Lists**

In [13]:
test_tuple = [(1,2,3), (4,5,6), (7,8,9)]
zipped_list = list(zip(*test_tuple))

In [14]:
a, b, c = zipped_list

In [15]:
print(a, b, c)

(1, 4, 7) (2, 5, 8) (3, 6, 9)


In [16]:
def replay(replay_buffer, batch_size, model, target_model):
    
    # Non restituisce nulla se il replay_buffer è inferiore al batch_size
    if len(replay_buffer) < batch_size:
        return
    
    samples = random.sample(replay_buffer, batch_size)
    
    # Conserva i valori target previsti dalla rete target
    target_batch = []
    
    zipped_samples = list(zip(*samples))
    states, actions, rewards, new_states, dones = zipped_samples
    
    # Previsione dei target per tutti gli stati del campione
    targets = target_model.predict(np.array(states))
    
    # Previsione del Q-Value per tutti i nuovi stati (new_states) del campione
    q_values = model.predict(np.array(new_states))
    
    
    # Loop attraverso tutti i valori previsti per calcolare i target attuali
    for i in range(batch_size):
        
        # Max Q-Value per ogni campione
        q_value = max(q_values[i][0])
        
#         print(f"q_values: {q_values[i]}")
        
        # Conservare l'i-esimo target per aggiornarlo in accordo alla formula
        target = targets[i].copy()
        
#         print(f"dones: {dones}")
        
        if dones[i]:
            target[0][actions[i]] = rewards[i]
        else:
            target[0][actions[i]] = rewards[i] + q_value * GAMMA
        
#         print(f"target: {target}")
        
        target_batch.append(target)
    
    # Fit the model
    model.fit(np.array(states), np.array(target_batch), epochs=1, verbose=0)

In [17]:
def update_model_handler(epoch, update_target_model, model, target_model):
    if epoch > 0 and epoch % update_target_model == 0:
        target_model.set_weights(model.get_weights())

### Training the Model

In [24]:
model.compile(loss='mse', optimizer=Adam(lr=LEARNING_RATE))

In [19]:
best_so_far = 0

for epoch in range(EPOCHS):
    observation = env.reset()
    
    print(f"epoch: {epoch}")
    
    observation = observation.reshape([1,4])
    done = False
    
    points = 0
    
    while not done:
        
        action = epsilon_greedy_action_selection(model, epsilon, observation)
        
        #######################################
        
        next_observation, reward, done, info = env.step(action)
        next_observation = next_observation.reshape([1,4])
        
        replay_buffer.append((observation, action, reward, next_observation, done))
        
        observation = next_observation
        points += 1
        
        #######################################
        
        # Training model by replaying
        replay(replay_buffer, 64, model, target_model)
        
        # Reduce epsilon
        epsilon *= EPSILON_REDUCE
        
        update_model_handler(epoch, update_target_model, model, target_model)
        
        if points > best_so_far:
            best_so_far = points
        if epoch % 5 == 0:
            print(f"{epoch}: Points reached: {points} - epsilon: {epsilon} - Best: {best_so_far}")
            
        if points == 100:
            done = True

epoch: 0
0: Points reached: 1 - epsilon: 0.995 - Best: 1
0: Points reached: 2 - epsilon: 0.990025 - Best: 2
0: Points reached: 3 - epsilon: 0.985074875 - Best: 3
0: Points reached: 4 - epsilon: 0.9801495006250001 - Best: 4
0: Points reached: 5 - epsilon: 0.9752487531218751 - Best: 5
0: Points reached: 6 - epsilon: 0.9703725093562657 - Best: 6
0: Points reached: 7 - epsilon: 0.9655206468094844 - Best: 7
0: Points reached: 8 - epsilon: 0.960693043575437 - Best: 8
0: Points reached: 9 - epsilon: 0.9558895783575597 - Best: 9
0: Points reached: 10 - epsilon: 0.9511101304657719 - Best: 10
0: Points reached: 11 - epsilon: 0.946354579813443 - Best: 11
0: Points reached: 12 - epsilon: 0.9416228069143757 - Best: 12
0: Points reached: 13 - epsilon: 0.9369146928798039 - Best: 13
0: Points reached: 14 - epsilon: 0.9322301194154049 - Best: 14
0: Points reached: 15 - epsilon: 0.9275689688183278 - Best: 15
0: Points reached: 16 - epsilon: 0.9229311239742362 - Best: 16
0: Points reached: 17 - epsilon: 

5: Points reached: 6 - epsilon: 0.5647174463480732 - Best: 42
5: Points reached: 7 - epsilon: 0.5618938591163328 - Best: 42
5: Points reached: 8 - epsilon: 0.5590843898207511 - Best: 42
5: Points reached: 9 - epsilon: 0.5562889678716474 - Best: 42
5: Points reached: 10 - epsilon: 0.5535075230322891 - Best: 42
5: Points reached: 11 - epsilon: 0.5507399854171277 - Best: 42
5: Points reached: 12 - epsilon: 0.547986285490042 - Best: 42
5: Points reached: 13 - epsilon: 0.5452463540625918 - Best: 42
5: Points reached: 14 - epsilon: 0.5425201222922789 - Best: 42
5: Points reached: 15 - epsilon: 0.5398075216808175 - Best: 42
5: Points reached: 16 - epsilon: 0.5371084840724134 - Best: 42
5: Points reached: 17 - epsilon: 0.5344229416520513 - Best: 42
5: Points reached: 18 - epsilon: 0.531750826943791 - Best: 42
5: Points reached: 19 - epsilon: 0.5290920728090721 - Best: 42
5: Points reached: 20 - epsilon: 0.5264466124450268 - Best: 42
epoch: 6
epoch: 7
epoch: 8
epoch: 9




<!-- $$ asdasd $$ -->

In [20]:
# observation = env.reset()

# for counter in range(300):
#     env.render()
    
#     action = np.argmax(model.predict(observation.reshape([1, 1, 4])))
    
#     observation, reward, done, info = env.step(action)
    
#     if done:
#         print(f"done {counter}")
#         break

# env.close()