# Importing libraries

In [1]:
import numpy as np
import pickle
import os
import sys
import random

## Defining constants

In [2]:
STATE_SIZE = (25,25)
ACTION_SIZE = 4

## Defining loading and saving of files

In [3]:
def load(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [4]:
def save(filename, Q_table):
    with open(filename, 'wb') as f:
        pickle.dump(Q_table, f)

## Defining the static environment



In [38]:
class DroneGridEnv():

    def __init__(self, grid):

        self.grid = grid
        self.grid_size = np.array(grid).shape
        self.observation_space = (self.grid_size[0]), (self.grid_size[1])
        self.action_space = [0, 1, 2, 3] # 4 discrete actions: 0 = up, 1 = down, 2 = left, 3 = right
        self.start_pos = (0, 0)  # Starting position at top left corner
        self.goal_pos = (self.grid_size[0] - 1, self.grid_size[1] - 1)  # Goal position at bottom right corner
        self.current_pos = self.start_pos  # Initialize current position

    def reset(self):
        self.current_pos = self.start_pos  # Reset current position to start position
        return self.current_pos  # Return initial state

    def step(self, action):

        assert action in self.action_space, f"Invalid action {action}"  # Check if action is valid

        # Define movement based on action
        if action == 0:  # Up
            new_pos = (self.current_pos[0], self.current_pos[1] - 1)
        elif action == 1:  # Down
            new_pos = (self.current_pos[0], self.current_pos[1] + 1)
        elif action == 2:  # Left
            new_pos = (self.current_pos[0] - 1, self.current_pos[1])
        elif action == 3:  # Right
            new_pos = (self.current_pos[0] + 1, self.current_pos[1])

        # Check if new position is within bounds and not an obstacle
        if 0 <= new_pos[0] < self.grid_size[0] and 0 <= new_pos[1] < self.grid_size[1] and self.grid[new_pos[1]][new_pos[0]] != 1:

            self.current_pos = new_pos  # Update current position

            # Check if goal state is reached
            done = (self.current_pos == self.goal_pos)

            # Calculate reward
            if done:
                reward = 100.0  # Positive reward for reaching the goal

            elif self.grid[new_pos[1]][new_pos[0]] == 1:
                reward = -300 # Negative reward for going in a wall
                done = True

            else:
                reward = 0 #Negative reward for non-goal state

        else:
            done = True
            reward = -100  # Negative reward for going out of bounds

        return self.current_pos, reward, done

Function to compute an action in function of the epsilon-greedy algorith

In [28]:
def compute_action(current_state, Q_table, epsilon, environment):

    if np.random.uniform(0,1) < epsilon:
        return np.random.choice(range(len(environment.action_space)))

    else:
        return np.argmax(Q_table[current_state])

---

### Rendering the final path on an image using pygame

---

Importing pygame and the create_map script

In [None]:
import pygame
import create_map

Loading a personalized map

In [29]:
map_simple = load('map_simple.pkl')
print(map_simple)

map_hard1 = load('map_hard1.pkl')
print(map_hard1)

map_hard2 = load('map_hard2.pkl')
print(map_hard2)

map_hard3 = load('map_hard3.pkl')
print(map_hard3)



[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Creating an instance of the environment through the loaded map

In [30]:
environment_simple = DroneGridEnv(map_simple)
print(environment_simple.observation_space)
print(environment_simple.action_space)

environment_hard1 = DroneGridEnv(map_hard1)
print(environment_hard1.observation_space)
print(environment_hard1.action_space)

environment_hard2 = DroneGridEnv(map_hard2)
print(environment_hard2.observation_space)
print(environment_hard2.action_space)

environment_hard3 = DroneGridEnv(map_hard3)
print(environment_hard3.observation_space)
print(environment_hard3.action_space)

(25, 25)
[0, 1, 2, 3]
(50, 50)
[0, 1, 2, 3]
(50, 50)
[0, 1, 2, 3]
(50, 50)
[0, 1, 2, 3]


---
# Q-Learning
---


In [42]:
def q_learning(env, alpha=1, gamma=0.8,  epsilon=1, epsilon_decay=0.9995):

    Q = np.zeros((env.grid_size[0]*env.grid_size[1], len(env.action_space)), dtype=np.float32) #Initialize the Q table to all 0s
    rewards = []

    for e in range(4000): #Run 1k training runs

        state = env.reset() #Part of OpenAI where you need to reset at the start of each run
        total_reward = 0 #Set initial reward to 0

        while True: #Loop until done == True
            #IF random number is less than epsilon grab the random action else grab the argument max of Q[state]

            current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0] # Obtain the index of the state

            action = compute_action(current_state_index, Q, epsilon, env) # Compute the action for the current state in function of the epsilon_greedy

            posp1, reward, done = env.step(action) #Send your action to OpenAI and get back the tuple

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0]

            total_reward += reward #Increment your reward

            Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma * np.max(Q[state_tp1_index]) - Q[current_state_index][action])

             #Make sure to keep random at 10%

            if done:
                print(f"Episode: {e}, Reward: {total_reward}")
                break
        if epsilon>0.1:
            epsilon *= epsilon_decay

        rewards.append(total_reward)

    return Q, rewards

Effective running of the Q-Learning and saving of the trained Q-Table

In [43]:
q_simple, _ = q_learning(environment_simple)
save('Simple - Q-Learning.pkl', q_simple)

q_hard1, _ = q_learning(environment_hard1)
save('Hard1 - Q-Learning.pkl', q_hard1)

'''
q_hard2, _ = q_learning(environment_hard2)
save('Hard2 - Q-Learning.pkl', q_hard2)

q_hard3, _ = q_learning(environment_hard3)
save('Hard3 - Q-Learning.pkl', q_hard3)'''

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Episode: 3000, Reward: 0.0
Episode: 3001, Reward: -400.0
Episode: 3002, Reward: -500.0
Episode: 3003, Reward: 0.0
Episode: 3004, Reward: -200.0
Episode: 3005, Reward: -100.0
Episode: 3006, Reward: -500.0
Episode: 3007, Reward: -100.0
Episode: 3008, Reward: -200.0
Episode: 3009, Reward: 0.0
Episode: 3010, Reward: -200.0
Episode: 3011, Reward: 0.0
Episode: 3012, Reward: -700.0
Episode: 3013, Reward: -500.0
Episode: 3014, Reward: -100.0
Episode: 3015, Reward: 0.0
Episode: 3016, Reward: -200.0
Episode: 3017, Reward: -200.0
Episode: 3018, Reward: 0.0
Episode: 3019, Reward: 0.0
Episode: 3020, Reward: 100.0
Episode: 3021, Reward: 0.0
Episode: 3022, Reward: -200.0
Episode: 3023, Reward: -100.0
Episode: 3024, Reward: -200.0
Episode: 3025, Reward: 100.0
Episode: 3026, Reward: 0.0
Episode: 3027, Reward: -200.0
Episode: 3028, Reward: -200.0
Episode: 3029, Reward: -100.0
Episode: 3030, Reward: 0.0
Episode: 3

"\nq_hard2, _ = q_learning(environment_hard2)\nsave('Hard2 - Q-Learning.pkl', q_hard2)\n\nq_hard3, _ = q_learning(environment_hard3)\nsave('Hard3 - Q-Learning.pkl', q_hard3)"

In [44]:
from google.colab import files

files.download('Simple - Q-Learning.pkl')
files.download('Hard1 - Q-Learning.pkl')
#files.download('Hard2 - Q-Learning.pkl')
#files.download('Hard3 - Q-Learning.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Loading Q-Learning trained Q-Table and checking if successful

In [15]:
trained_q_simple = load('Simple - Q-Learning.pkl')
print('simple', trained_q_simple)

trained_q_hard1 = load('Hard1 - Q-Learning.pkl')
print('hard1', trained_q_hard1)

trained_q_hard2 = load('Hard2 - Q-Learning.pkl')
print('hard2', trained_q_hard2)

trained_q_hard3 = load('Hard3 - Q-Learning.pkl')
print('hard3', trained_q_hard3)

simple [[-99.36373      0.70696515 -99.36373      0.70696515]
 [-99.29304      0.78551686   0.6362686    0.78551686]
 [-99.214485     0.8727965    0.70696515   0.8727965 ]
 ...
 [ 72.9        -19.          72.9         90.        ]
 [ 81.         -10.          81.         100.        ]
 [  0.           0.           0.           0.        ]]
hard1 [[-9.9996719e+01  2.9512672e-03 -9.9996719e+01  3.6435400e-03]
 [-9.9996353e+01  3.2791859e-03  3.2791859e-03  4.0483777e-03]
 [-9.9995949e+01  3.6435400e-03  3.6435400e-03  4.4981977e-03]
 ...
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00]
 [ 8.1000000e+01 -1.0000000e+01 -1.0000000e+01  1.0000000e+02]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00]]
hard2 [[-9.9996719e+01 -9.9996719e+01 -9.9996719e+01  3.6435400e-03]
 [-9.9996353e+01  4.0483777e-03  3.2791859e-03  4.0483777e-03]
 [-9.9995949e+01  4.4981977e-03  3.6435400e-03 -9.9995949e+01]
 ...
 [ 7.2900002e+01 -1.9000000e+01  7.2900002e+01  9.0000000e+01]
 [ 8

---

# SARSA

---

In [25]:
def sarsa(env, alpha=1, gamma=0.9,  epsilon=1, epsilon_decay=0.005):

    Q = np.zeros((env.grid_size[0]*env.grid_size[1], len(env.action_space)), dtype=np.float32) #Initialize the Q table to all 0s

    for e in range(1000): #Run 1k training runs

        state = env.reset() #Part of OpenAI where you need to reset at the start of each run
        total_reward = 0 #Set initial reward to 0

        while True: #Loop until done == True
            #IF random number is less than epsilon grab the random action else grab the argument max of Q[state]

            current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0] # Obtain the index of the state

            action = compute_action(current_state_index, Q, epsilon, env) # Compute the action for the current state using Q-Table

            posp1, reward, done = env.step(action) # Send the action to the environment and obtain the new position, the reward and the termination flag

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0] # Compute the index of the state at t+1
            action_tp1 = compute_action(state_tp1_index, Q, epsilon, env) # Compute the action for the next state using Q-Table

            total_reward += reward # Increment the reward

            Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma*Q[state_tp1_index][action_tp1] - Q[current_state_index][action])

             #Make sure to keep random at 10%

            if done:
                print(f"Episode: {e}, Reward: {total_reward}")
                break

        epsilon *= np.exp(-epsilon_decay)

    return Q

Effective running of SARSA and saving of the trained Q-Table

In [26]:
s_simple = sarsa(environment_simple)
save('Simple - SARSA.pkl', s_simple)

s_hard1 = sarsa(environment_hard1)
save('Hard1 - SARSA.pkl', s_hard1)

s_hard2 = sarsa(environment_hard2)
save('Hard2 - SARSA.pkl', s_hard2)

s_hard3 = sarsa(environment_hard3)
save('Hard3 - SARSA.pkl', s_hard3)

Episode: 0, Reward: -44000.0
Episode: 1, Reward: -13600.0
Episode: 2, Reward: -6500.0
Episode: 3, Reward: -78400.0
Episode: 4, Reward: -98000.0
Episode: 5, Reward: -123600.0
Episode: 6, Reward: -37600.0
Episode: 7, Reward: -21000.0
Episode: 8, Reward: -17700.0
Episode: 9, Reward: -20500.0
Episode: 10, Reward: -11800.0
Episode: 11, Reward: -2400.0
Episode: 12, Reward: -50000.0
Episode: 13, Reward: -18400.0
Episode: 14, Reward: -5800.0
Episode: 15, Reward: -3200.0
Episode: 16, Reward: -38100.0
Episode: 17, Reward: -63100.0
Episode: 18, Reward: -22600.0
Episode: 19, Reward: -41800.0
Episode: 20, Reward: -52500.0
Episode: 21, Reward: -47700.0
Episode: 22, Reward: -50400.0
Episode: 23, Reward: -14800.0
Episode: 24, Reward: -12600.0
Episode: 25, Reward: -6100.0
Episode: 26, Reward: -60100.0
Episode: 27, Reward: -4400.0
Episode: 28, Reward: -14500.0
Episode: 29, Reward: -20200.0
Episode: 30, Reward: -41500.0
Episode: 31, Reward: -25300.0
Episode: 32, Reward: -16000.0
Episode: 33, Reward: -420

KeyboardInterrupt: 

Loading SARSA trained Q-Table and checking if successful

In [86]:
trained_s_simple = load('Simple - SARSA.pkl')
print(trained_s_simple)

trained_s_hard1 = load('Hard1 - SARSA.pkl')
print(trained_s_hard1)

trained_s_hard2 = load('Hard2 - SARSA.pkl')
print(trained_s_hard2)

trained_s_hard3 = load('Hard3 - SARSA.pkl')
print(trained_s_hard3)

[[-100.         0.      -143.04672  -90.     ]
 [-100.       -81.       -72.9        0.     ]
 [-100.         0.       -59.049    -90.     ]
 ...
 [  72.9      -77.12321  -78.21871  -24.39   ]
 [  53.1441   -10.        65.61     100.     ]
 [   0.         0.         0.         0.     ]]


In [None]:
from google.colab importes files

files.download('Simple - SARSA.pkl')
files.download('Hard1 - SARSA.pkl')
files.download('Hard2 - SARSA.pkl')
files.download('Hard3 - SARSA.pkl')

---

# Alternating Q-Learning / SARSA

---

In [90]:
def alternating(env, alpha=1, gamma=0.9,  epsilon=1, epsilon_decay=0.005):

    Q = np.zeros((env.grid_size[0]*env.grid_size[1], len(env.action_space)), dtype=np.float32) #Initialize the Q table to all 0s

    for e in range(1000): #Run 1k training runs

        state = env.reset() #Part of OpenAI where you need to reset at the start of each run
        total_reward = 0 #Set initial reward to 0

        while True: #Loop until done == True

            random_num = random.random() # Generate a random number between 0 and 1

            current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0] # Obtain the index of the state

            action = compute_action(current_state_index, Q, epsilon, env) # Compute the action for the current state using Q-Table

            posp1, reward, done = env.step(action) # Send the action to the environment and obtain the new position, the reward and the termination flag

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0] # Compute the index of the state at t+1
            action_tp1 = compute_action(state_tp1_index, Q, epsilon, env) # Compute the action for the next state using Q-Table

            total_reward += reward # Increment the reward

            if (random_num <= 0.5): # We use Q-learning
                Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma * np.max(Q[state_tp1_index]) - Q[current_state_index][action])

            else: # We use SARSA
                Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma*Q[state_tp1_index][action_tp1] - Q[current_state_index][action])

            if done:
                print(f"Episode: {e}, Reward: {total_reward}")
                break

        epsilon *= np.exp(-epsilon_decay)

    return Q

Effective running of alternating Q-Learning/SARSA and saving of the trained Q-Table

In [92]:
a_simple = alternating(environment_simple)
save('Simple - Alternating.pkl', a_simple)

a_hard1 = alternating(environment_hard1)
save('Hard1 - Alternating.pkl', a_hard1)

a_hard2 = alternating(environment_hard2)
save('Hard2 - Alternating.pkl', a_hard2)

a_hard3 = alternating(environment_hard3)
save('Hard3 - Alternating.pkl', a_hard3)

Episode: 0, Reward: -25000.0
Episode: 1, Reward: -190600.0
Episode: 2, Reward: -9700.0
Episode: 3, Reward: -44500.0
Episode: 4, Reward: -23500.0
Episode: 5, Reward: -26500.0
Episode: 6, Reward: -103100.0
Episode: 7, Reward: -64000.0
Episode: 8, Reward: -224200.0
Episode: 9, Reward: -41500.0
Episode: 10, Reward: -4700.0
Episode: 11, Reward: -67500.0
Episode: 12, Reward: -9100.0
Episode: 13, Reward: -4500.0
Episode: 14, Reward: -8100.0
Episode: 15, Reward: -9400.0
Episode: 16, Reward: -3700.0
Episode: 17, Reward: -3600.0
Episode: 18, Reward: -5700.0
Episode: 19, Reward: -4200.0
Episode: 20, Reward: -19200.0
Episode: 21, Reward: -3600.0
Episode: 22, Reward: -4200.0
Episode: 23, Reward: -2300.0
Episode: 24, Reward: -7500.0
Episode: 25, Reward: -8300.0
Episode: 26, Reward: -3000.0
Episode: 27, Reward: -6300.0
Episode: 28, Reward: -3300.0
Episode: 29, Reward: -4700.0
Episode: 30, Reward: -1200.0
Episode: 31, Reward: -4000.0
Episode: 32, Reward: -4100.0
Episode: 33, Reward: -2500.0
Episode: 3

Loading alternating trained Q-Table and checking if successful

In [94]:
trained_a = load('Q_Table - Alternating.pkl')
print(trained_a)

[[-9.9363731e+01  7.0696515e-01 -9.9363731e+01 -8.9582550e+01]
 [-9.9536163e+01  7.7355430e-02 -8.9868996e+01  7.7355430e-02]
 [-9.9922646e+01  8.5950479e-02  8.5950479e-02  8.5950479e-02]
 ...
 [ 3.8742046e+01 -1.9000000e+01  7.2900002e+01  9.0000000e+01]
 [ 8.1000000e+01 -1.0000000e+01  8.1000000e+01  1.0000000e+02]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00]]


---

# Deep Q-Learning

---

---

# Spatial Computing for Path Planning - SCPP - Personalized algorithm

---