# Importing libraries

In [1]:
import numpy as np
import pickle
import os
import sys
import random

## Defining constants

In [2]:
# Grid dimensions and number of discrete actions; used further down to size the Deep Q-Network layers.
STATE_SIZE = (10,10)
ACTION_SIZE = 4

## Defining loading and saving of files

In [3]:
def load(filename):
    """Read a single pickled object from ``filename`` and return it.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, mode='rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored

In [4]:
def save(filename, Q_table):
    """Pickle ``Q_table`` into ``filename``, replacing any existing file."""
    serialized = pickle.dumps(Q_table)
    with open(filename, mode='wb') as pickle_file:
        pickle_file.write(serialized)

## Defining the static environment



In [5]:
class DroneGrid():
    """Static grid world holding a map, a start cell, a goal cell and the drone position.

    Positions are ``(x, y)`` tuples and cells are addressed as ``grid[y][x]``.
    The value 3 marks the cell currently occupied by the drone.

    The map passed to the constructor is copied, so the caller's object is never
    modified. ``empty_grid`` always yields a fresh copy of that pristine map,
    which means ``self.grid = self.empty_grid`` really does wipe earlier drone
    markers instead of aliasing the working grid.

    NOTE(review): ``goal_pos`` is built from (rows - 1, cols - 1) while cells are
    indexed ``grid[y][x]``; the two conventions only coincide for square maps.
    Confirm before using non-square maps.
    """

    def __init__(self, grid):
        """Build the world from a 2-D ``grid`` (list of lists or NumPy array)."""
        self.grid_size = np.array(grid).shape
        if len(self.grid_size) != 2:
            raise ValueError(f"grid must be 2-dimensional, got shape {self.grid_size}")

        self._empty_grid = self._copy_grid(grid)  # pristine map, never handed out directly
        self.observation_space = (self.grid_size[0]), (self.grid_size[1])
        self.action_space = [0, 1, 2, 3] # 4 discrete actions: 0 = up, 1 = down, 2 = left, 3 = right
        self.start_pos = (0, 0)  # Starting position at top left corner
        self.goal_pos = (self.grid_size[0] - 1, self.grid_size[1] - 1)  # Goal position at bottom right corner
        self.current_pos = self.start_pos  # Initialize current position
        self.grid = self.empty_grid
        self.grid[self.current_pos[1]][self.current_pos[0]] = 3

    @staticmethod
    def _copy_grid(grid):
        """Return an independent copy of ``grid``, keeping its container type."""
        if isinstance(grid, np.ndarray):
            return grid.copy()
        return [list(row) for row in grid]

    @property
    def empty_grid(self):
        """A fresh copy of the map without any drone marker."""
        return self._copy_grid(self._empty_grid)

    @empty_grid.setter
    def empty_grid(self, grid):
        self._empty_grid = self._copy_grid(grid)

    def reset(self):
        """Put the drone back on the start cell of a clean map.

        Returns:
            ``(current_pos, grid)``: the start position and the working grid
            with the drone marker on the start cell.
        """
        self.current_pos = self.start_pos  # Reset current position to start position
        self.grid = self.empty_grid
        self.grid[self.start_pos[1]][self.start_pos[0]] = 3
        return self.current_pos, self.grid  # Return initial state


In [44]:
class QLEnvironment(DroneGrid):
    """Grid world with a sparse reward, used by the tabular and deep Q agents.

    Cell values read by ``step``: 1 is an obstacle, 3 marks the drone.
    """

    def __init__(self, grid):
        super().__init__(grid)

    def step(self, action):
        """Apply one move and report the outcome.

        Args:
            action: 0 = up, 1 = down, 2 = left, 3 = right.

        Returns:
            ``(current_pos, grid, reward, done)``. The reward is 1.0 when the
            goal is reached and 0 otherwise. A move that would leave the grid or
            enter an obstacle leaves the drone where it is (reward 0, not done).

        Raises:
            AssertionError: if ``action`` is not in ``action_space``.
        """
        assert action in self.action_space, f"Invalid action {action}"  # Check if action is valid

        # (dx, dy) offsets per action; y grows downwards because cells are grid[y][x]
        offsets = {0: (0, -1), 1: (0, 1), 2: (-1, 0), 3: (1, 0)}
        dx, dy = offsets[int(action)]
        old_x, old_y = self.current_pos
        new_x, new_y = old_x + dx, old_y + dy

        inside = 0 <= new_x < self.grid_size[0] and 0 <= new_y < self.grid_size[1]
        if not inside or self.grid[new_y][new_x] == 1:
            # Blocked by the border or by an obstacle: nothing moves, no reward
            return self.current_pos, self.grid, 0, False

        self.grid = self.empty_grid
        # empty_grid may be the very same object as the working grid, in which
        # case the assignment above erases nothing; clear the old marker explicitly.
        if self.grid[old_y][old_x] == 3:
            self.grid[old_y][old_x] = 0

        self.current_pos = (new_x, new_y)  # Update current position

        # Check if goal state is reached
        done = (self.current_pos == self.goal_pos)

        if done:
            reward = 1.0  # Only reaching the goal is rewarded
        else:
            reward = 0
            self.grid[new_y][new_x] = 3  # Mark the new position of the drone

        return self.current_pos, self.grid, reward, done

Function to compute an action according to the epsilon-greedy algorithm

In [7]:
def compute_action(current_state, Q_table, epsilon, environment):
    """Epsilon-greedy action selection on a tabular Q function.

    With probability ``epsilon`` a uniformly random action index is drawn;
    otherwise the index of the largest value in ``Q_table[current_state]`` is
    returned (the first one on ties).
    """
    explore = np.random.uniform(0,1) < epsilon

    if not explore:
        return np.argmax(Q_table[current_state])

    n_actions = len(environment.action_space)
    return np.random.choice(range(n_actions))

Loading a personalized map

In [8]:
# Load the three pickled maps (simple / mid / hard) and print each one for a visual check.
map_simple = load('map_simple.pkl')
print(map_simple)

map_mid = load('map_mid.pkl')
print(map_mid)

map_hard = load('map_hard.pkl')
print(map_hard)

[[0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [1, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 1, 1, 1], [0, 1, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


Creating an instance of the environment through the loaded map

In [45]:
# Build one QLEnvironment per map and print its observation space and action space.
environment_simple = QLEnvironment(map_simple)
print(environment_simple.observation_space)
print(environment_simple.action_space)

environment_mid = QLEnvironment(map_mid)
print(environment_mid.observation_space)
print(environment_mid.action_space)

environment_hard = QLEnvironment(map_hard)
print(environment_hard.observation_space)
print(environment_hard.action_space)

(10, 10)
[0, 1, 2, 3]
(10, 10)
[0, 1, 2, 3]
(10, 10)
[0, 1, 2, 3]


---
# Q-Learning
---


In [46]:
def q_learning(env, alpha=0.9, gamma=0.9,  epsilon=0.99, epsilon_decay=0.0001, episodes = 10000, max_iter_episode = 100, report_every = 1000):
    """Train a tabular Q function with off-policy Q-Learning.

    Args:
        env: environment exposing ``reset()``, ``step(action)``, ``current_pos``,
            ``grid_size``, ``observation_space`` and ``action_space``.
        alpha: learning rate.
        gamma: discount factor.
        epsilon: initial exploration rate of the epsilon-greedy policy.
        epsilon_decay: after every episode epsilon is multiplied by
            ``exp(-epsilon_decay)`` for as long as it is above 0.1.
        episodes: number of training episodes.
        max_iter_episode: an episode is cut off after this many steps.
        report_every: size, in episodes, of the window over which the mean
            reward is averaged and printed.

    Returns:
        ``(Q, rewards)``: the Q table of shape (n_states, n_actions) and the
        list of reported window means. A report is emitted at episode 0 as well,
        so the first entry is always 0.0.
    """
    n_states = env.grid_size[0] * env.grid_size[1]
    Q = np.zeros((n_states, len(env.action_space)), dtype=np.float32)  # Q table starts at all zeros
    rewards = []
    window_reward = 0  # reward accumulated since the last report

    for e in range(episodes):

        env.reset()  # put the drone back on the start cell
        iteration = 0

        if e % report_every == 0:
            mean_reward = float(window_reward / report_every)
            rewards.append(mean_reward)
            print(f"Episode: {e}, Mean reward: {mean_reward}, Epsilon: {epsilon}")
            window_reward = 0

        while True:  # one step per pass, until the goal or the step limit is hit

            # Flatten the (x, y) position into a row of the Q table
            current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0]

            action = compute_action(current_state_index, Q, epsilon, env)  # epsilon-greedy choice

            posp1, _, reward, done = env.step(action)

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0]

            window_reward += reward

            # A terminal state has no future value, so do not bootstrap from it
            best_next = 0.0 if done else np.max(Q[state_tp1_index])
            Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma * best_next - Q[current_state_index][action])

            if done:
                break

            iteration += 1

            if iteration >= max_iter_episode:
                break

        # Exponential decay of the exploration rate; it stops shrinking once it is at or below 0.1
        if epsilon>0.1:
            epsilon *= np.exp(-epsilon_decay)

    return Q, rewards

Effective running of the Q-Learning and saving of the trained Q-Table

In [57]:
# Train one Q table per map with map-specific hyper-parameters and pickle each result.
q_simple, _ = q_learning(environment_simple, epsilon_decay = 0.0005, episodes = 5000)
save('Simple - Q-Learning.pkl', q_simple)

q_mid, _ = q_learning(environment_mid, episodes = 3000, epsilon_decay = 0.0003, max_iter_episode = 200)
save('Mid - Q-Learning.pkl', q_mid)

q_hard, _ = q_learning(environment_hard, episodes = 5000, epsilon_decay = 0.0001, max_iter_episode = 300)
save('Hard - Q-Learning.pkl', q_hard)



Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: 0.604, Epsilon: 0.6004653531155191
Episode: 2000, Mean reward: 0.998, Epsilon: 0.36420064675974256
Episode: 3000, Mean reward: 1.0, Epsilon: 0.220898858546959
Episode: 4000, Mean reward: 1.0, Epsilon: 0.1339819304042573
Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: 0.755, Epsilon: 0.7334100384749153
Episode: 2000, Mean reward: 1.0, Epsilon: 0.5433235197331082
Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: 0.154, Epsilon: 0.8957890438555989
Episode: 2000, Mean reward: 0.727, Epsilon: 0.810543445547201
Episode: 3000, Mean reward: 0.985, Epsilon: 0.7334100384749007
Episode: 4000, Mean reward: 1.0, Epsilon: 0.663616845575281


In [71]:
# Google Colab only: trigger browser downloads of the saved Q-Learning tables.
from google.colab import files

files.download('Simple - Q-Learning.pkl')
files.download('Mid - Q-Learning.pkl')
files.download('Hard - Q-Learning.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Loading Q-Learning trained Q-Table and checking if successful

In [19]:
# Reload the saved Q-Learning table for the simple map and print it as a sanity check.
trained_q_simple = load('Simple - Q-Learning.pkl')
print('simple', trained_q_simple)

#trained_q_mid = load('Mid - Q-Learning.pkl')
#print('mid', trained_q_mid)

#trained_q_hard = load('Hard - Q-Learning.pkl')
#print('hard', trained_q_hard)

simple [[0.15009463 0.16677181 0.15009463 0.16677181]
 [0.16677181 0.18530202 0.15009463 0.18530202]
 [0.18530202 0.20589113 0.16677181 0.20589113]
 [0.20589113 0.22876792 0.18530202 0.22876792]
 [0.22876792 0.25418657 0.20589113 0.25418657]
 [0.25418657 0.28242952 0.22876792 0.25418657]
 [0.         0.         0.         0.        ]
 [0.3138106  0.34867844 0.3138106  0.34867844]
 [0.34867844 0.38742048 0.3138106  0.38742048]
 [0.38742048 0.4304672  0.34867844 0.38742048]
 [0.15009463 0.18530202 0.16677181 0.18530202]
 [0.16677181 0.20589113 0.16677181 0.20589113]
 [0.18530202 0.20589113 0.18530202 0.22876792]
 [0.20589113 0.22876792 0.20589113 0.25418657]
 [0.22876792 0.28242952 0.22876792 0.28242952]
 [0.25418657 0.3138106  0.25418657 0.3138106 ]
 [0.3138106  0.34867844 0.28242952 0.34867844]
 [0.3138106  0.38742048 0.3138106  0.38742048]
 [0.34867844 0.4304672  0.34867844 0.4304672 ]
 [0.38742048 0.47829688 0.38742048 0.4304672 ]
 [0.16677181 0.20589113 0.18530202 0.20589113]
 [0.18

---

# SARSA

---

In [60]:
def sarsa(env, alpha=0.8, gamma=0.9,  epsilon=0.99, epsilon_decay=0.0005, episodes = 2000, max_iter_episode = 100, report_every = 1000):
    """Train a tabular Q function with on-policy SARSA.

    The action drawn for the next state is the one used in the TD target AND the
    one executed on the following step, which is what makes the update
    on-policy.

    Args:
        env: environment exposing ``reset()``, ``step(action)``, ``current_pos``,
            ``grid_size``, ``observation_space`` and ``action_space``.
        alpha: learning rate.
        gamma: discount factor.
        epsilon: initial exploration rate of the epsilon-greedy policy.
        epsilon_decay: after every episode epsilon is multiplied by
            ``exp(-epsilon_decay)`` for as long as it is above 0.1.
        episodes: number of training episodes.
        max_iter_episode: an episode is cut off after this many steps.
        report_every: size, in episodes, of the window over which the mean
            reward is averaged and printed.

    Returns:
        ``(Q, rewards)``: the Q table of shape (n_states, n_actions) and the
        list of reported window means. A report is emitted at episode 0 as well,
        so the first entry is always 0.0.
    """
    n_states = env.grid_size[0] * env.grid_size[1]
    Q = np.zeros((n_states, len(env.action_space)), dtype=np.float32)  # Q table starts at all zeros
    rewards = []
    window_reward = 0  # reward accumulated since the last report

    for e in range(episodes):

        env.reset()  # put the drone back on the start cell
        iteration = 0

        if e % report_every == 0:
            mean_reward = float(window_reward / report_every)
            rewards.append(mean_reward)
            print(f"Episode: {e}, Mean reward: {mean_reward}, Epsilon: {epsilon}")
            window_reward = 0

        # Flatten the (x, y) position into a row of the Q table and pick the first action
        current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0]
        action = compute_action(current_state_index, Q, epsilon, env)

        while True:  # one step per pass, until the goal or the step limit is hit

            posp1, _, reward, done = env.step(action)

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0]  # index of the state at t+1
            action_tp1 = compute_action(state_tp1_index, Q, epsilon, env)  # action that will be taken at t+1

            window_reward += reward

            # A terminal state has no future value, so do not bootstrap from it
            next_q = 0.0 if done else Q[state_tp1_index][action_tp1]
            Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma*next_q - Q[current_state_index][action])

            if done:
                break

            iteration += 1

            if iteration >= max_iter_episode:
                break

            # Carry the sampled pair forward so the evaluated action is the executed one
            current_state_index, action = state_tp1_index, action_tp1

        # Exponential decay of the exploration rate; it stops shrinking once it is at or below 0.1
        if epsilon > 0.1:
            epsilon *= np.exp(-epsilon_decay)

    return Q, rewards

Effective running of SARSA and saving of the trained Q-Table

In [64]:
# Train one SARSA Q table per map with map-specific hyper-parameters and pickle each result.
s_simple, _ = sarsa(environment_simple, epsilon_decay = 0.0002, episodes = 5001, gamma  = 0.99)
save('Simple - SARSA.pkl', s_simple)

s_mid, _ = sarsa(environment_mid, epsilon_decay = 0.0001, episodes = 6001, gamma = 0.99, max_iter_episode = 200)
save('Mid - SARSA.pkl', s_mid)

s_hard, _ = sarsa(environment_hard, epsilon_decay = 0.00005, episodes = 8000, max_iter_episode = 300)
save('Hard - SARSA.pkl', s_hard)

Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: 0.164, Epsilon: 0.8105434455472116
Episode: 2000, Mean reward: 0.553, Epsilon: 0.663616845575298
Episode: 3000, Mean reward: 0.854, Epsilon: 0.5433235197331054
Episode: 4000, Mean reward: 0.961, Epsilon: 0.44483567447607075
Episode: 5000, Mean reward: 0.994, Epsilon: 0.3642006467597492
Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: 0.347, Epsilon: 0.8957890438555989
Episode: 2000, Mean reward: 0.669, Epsilon: 0.810543445547201
Episode: 3000, Mean reward: 0.892, Epsilon: 0.7334100384749007
Episode: 4000, Mean reward: 0.972, Epsilon: 0.663616845575281
Episode: 5000, Mean reward: 0.994, Epsilon: 0.6004653531155044
Episode: 6000, Mean reward: 0.998, Epsilon: 0.5433235197330827
Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: 0.007, Epsilon: 0.9417171302556689
Episode: 2000, Mean reward: 0.022, Epsilon: 0.8957890438555278
Episode: 3000, Mean reward: 0.318, Epsilon: 0.85210

Loading SARSA trained Q-Table and checking if successful

In [65]:
# Reload the three saved SARSA tables and print them as a sanity check.
trained_s_simple = load('Simple - SARSA.pkl')
print('simple', trained_s_simple)

trained_s_mid = load('Mid - SARSA.pkl')
print('mid', trained_s_mid)

trained_s_hard = load('Hard - SARSA.pkl')
print('hard',trained_s_hard)

simple [[0.6651668  0.6711016  0.67648506 0.64353186]
 [0.6702843  0.64424086 0.67784745 0.6598697 ]
 [0.65125084 0.65126467 0.65766484 0.6592029 ]
 [0.6315701  0.57606083 0.676229   0.56586254]
 [0.57473755 0.56312364 0.5674773  0.5678167 ]
 [0.57005876 0.5755015  0.5760961  0.5717389 ]
 [0.         0.         0.         0.        ]
 [0.44950172 0.4538672  0.4491796  0.61958146]
 [0.44380555 0.6367181  0.59559745 0.38081813]
 [0.48307768 0.512017   0.5663717  0.47522688]
 [0.6724536  0.673248   0.6740694  0.639313  ]
 [0.65860957 0.6983793  0.6567824  0.640837  ]
 [0.64522696 0.6479361  0.66133153 0.65385205]
 [0.5605307  0.6585772  0.66888285 0.5903315 ]
 [0.5926953  0.5795789  0.5403397  0.57996655]
 [0.5748859  0.6450872  0.5469083  0.581073  ]
 [0.58726645 0.57270694 0.55600363 0.566109  ]
 [0.44942838 0.70069796 0.67445165 0.53893965]
 [0.617715   0.74383557 0.5103356  0.6438113 ]
 [0.4841985  0.5155457  0.60856074 0.5131288 ]
 [0.66850656 0.77620876 0.6757953  0.7007479 ]
 [0.67

In [72]:
# Google Colab only: trigger browser downloads of the saved SARSA tables.
from google.colab import files

files.download('Simple - SARSA.pkl')
files.download('Mid - SARSA.pkl')
files.download('Hard - SARSA.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

---

# Alternating Q-Learning / SARSA

---

In [66]:
def alternating(env, alpha=1, gamma=0.9,  epsilon=1, epsilon_decay=0.005, episodes = 2000, max_iter_episode = 200, report_every = 1000):
    """Train a tabular Q function, picking Q-Learning or SARSA at random on every step.

    Each step flips a fair coin: heads applies the Q-Learning target (greedy
    value of the next state), tails applies the SARSA target (value of the next
    action that will actually be executed). The next action is always carried
    over to the following step so that the SARSA branch stays on-policy.

    Args:
        env: environment exposing ``reset()``, ``step(action)``, ``current_pos``,
            ``grid_size``, ``observation_space`` and ``action_space``.
        alpha: learning rate.
        gamma: discount factor.
        epsilon: initial exploration rate of the epsilon-greedy policy.
        epsilon_decay: after every episode epsilon is multiplied by
            ``exp(-epsilon_decay)`` for as long as it is above 0.1.
        episodes: number of training episodes.
        max_iter_episode: an episode is cut off after this many steps.
        report_every: size, in episodes, of the window over which the mean
            reward is averaged and printed.

    Returns:
        ``(Q, rewards)``: the Q table of shape (n_states, n_actions) and the
        list of reported window means. A report is emitted at episode 0 as well,
        so the first entry is always 0.0.
    """
    n_states = env.grid_size[0] * env.grid_size[1]
    Q = np.zeros((n_states, len(env.action_space)), dtype=np.float32)  # Q table starts at all zeros
    rewards = []
    window_reward = 0  # reward accumulated since the last report

    for e in range(episodes):

        env.reset()  # put the drone back on the start cell
        iteration = 0

        if e % report_every == 0:
            mean_reward = float(window_reward / report_every)
            rewards.append(mean_reward)
            print(f"Episode: {e}, Mean reward: {mean_reward}, Epsilon: {epsilon}")
            window_reward = 0

        # Flatten the (x, y) position into a row of the Q table and pick the first action
        current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0]
        action = compute_action(current_state_index, Q, epsilon, env)

        while True:  # one step per pass, until the goal or the step limit is hit

            random_num = random.random()  # coin flip choosing the update rule for this step

            posp1, _, reward, done = env.step(action)

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0]  # index of the state at t+1
            action_tp1 = compute_action(state_tp1_index, Q, epsilon, env)  # action that will be taken at t+1

            window_reward += reward

            if done:
                next_q = 0.0  # a terminal state has no future value
            elif (random_num <= 0.5):  # Q-Learning target
                next_q = np.max(Q[state_tp1_index])
            else:  # SARSA target
                next_q = Q[state_tp1_index][action_tp1]

            Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma * next_q - Q[current_state_index][action])

            if done:
                break

            iteration += 1

            if iteration >= max_iter_episode:
                break

            # Carry the sampled pair forward so the evaluated action is the executed one
            current_state_index, action = state_tp1_index, action_tp1

        # Exponential decay of the exploration rate; it stops shrinking once it is at or below 0.1
        if epsilon > 0.1:
            epsilon *= np.exp(-epsilon_decay)

    return Q, rewards

Effective running of alternating Q-Learning/SARSA and saving of the trained Q-Table

In [70]:
# Train one alternating Q-Learning/SARSA table per map and pickle each result.
a_simple, _ = alternating(environment_simple, epsilon_decay = 0.0003, episodes = 3001)
save('Simple - Alternating.pkl', a_simple)

a_mid, _ = alternating(environment_mid, epsilon_decay = 0.0003, episodes = 3001)
save('Mid - Alternating.pkl', a_mid)

a_hard, _ = alternating(environment_hard, epsilon_decay = 0.00008, episodes = 7001, max_iter_episode = 250)
save('Hard - Alternating.pkl', a_hard)

Episode: 0, Mean reward: 0.0, Epsilon: 1
Episode: 1000, Mean reward: 0.635, Epsilon: 0.7408182206817333
Episode: 2000, Mean reward: 0.97, Epsilon: 0.5488116360940487
Episode: 3000, Mean reward: 0.999, Epsilon: 0.40656965974062376
Episode: 0, Mean reward: 0.0, Epsilon: 1
Episode: 1000, Mean reward: 0.557, Epsilon: 0.7408182206817333
Episode: 2000, Mean reward: 0.957, Epsilon: 0.5488116360940487
Episode: 3000, Mean reward: 0.998, Epsilon: 0.40656965974062376
Episode: 0, Mean reward: 0.0, Epsilon: 1
Episode: 1000, Mean reward: 0.031, Epsilon: 0.9231163463866633
Episode: 2000, Mean reward: 0.129, Epsilon: 0.8521437889662616
Episode: 3000, Mean reward: 0.252, Epsilon: 0.7866278610666239
Episode: 4000, Mean reward: 0.484, Epsilon: 0.7261490370737775
Episode: 5000, Mean reward: 0.642, Epsilon: 0.6703200460357392
Episode: 6000, Mean reward: 0.744, Epsilon: 0.6187833918062509
Episode: 7000, Mean reward: 0.809, Epsilon: 0.5712090638489334


Loading alternating trained Q-Table and checking if successful

In [73]:
# Reload the three saved alternating tables and print them as a sanity check.
trained_a_simple = load('Simple - Alternating.pkl')
print(trained_a_simple)

trained_a_mid = load('Mid - Alternating.pkl')
print(trained_a_mid)

trained_a_hard = load('Hard - Alternating.pkl')
print(trained_a_hard)

[[0.06461082 0.0717898  0.06461082 0.0717898 ]
 [0.0717898  0.06461082 0.07976644 0.07976644]
 [0.0717898  0.06461082 0.05814974 0.07976644]
 [0.09847709 0.05233477 0.08862938 0.10941899]
 [0.09847709 0.12157665 0.09847709 0.10941899]
 [0.12157665 0.18530202 0.09847709 0.13508517]
 [0.         0.         0.         0.        ]
 [0.16677181 0.22876792 0.25418657 0.16677181]
 [0.13508517 0.13508517 0.25418657 0.0717898 ]
 [0.12157665 0.28242952 0.07976644 0.0717898 ]
 [0.06461082 0.15009463 0.0717898  0.07976644]
 [0.0717898  0.05814974 0.04710129 0.0717898 ]
 [0.0717898  0.0717898  0.04239116 0.05233477]
 [0.09847709 0.09847709 0.09847709 0.05814974]
 [0.12157665 0.12157665 0.12157665 0.18530202]
 [0.13508517 0.13508517 0.16677181 0.20589113]
 [0.20589113 0.20589113 0.12157665 0.25418657]
 [0.25418657 0.3138106  0.22876792 0.25418657]
 [0.13508517 0.28242952 0.18530202 0.22876792]
 [0.20589113 0.34867844 0.25418657 0.25418657]
 [0.0717898  0.15009463 0.05814974 0.05814974]
 [0.04239116 

In [74]:
# Google Colab only: trigger browser downloads of the saved alternating tables.
from google.colab import files

files.download('Simple - Alternating.pkl')
files.download('Mid - Alternating.pkl')
files.download('Hard - Alternating.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

---

# Deep Q-Learning

---

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

Checking availability of CUDA

In [10]:
# Select CUDA when available, otherwise the CPU, and report the choice.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


Defining the neural network

In [29]:
class DeepQNetwork(nn.Module):
    """Fully connected Q network: two ReLU hidden layers and a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Flatten every sample of the batch and return one value per output unit."""
        batch = x.size(0)
        hidden = self.fc1(x.view(batch, -1)).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

Defining the exploration-exploitation function


In [30]:
def compute_action_torch(environment, epsilon, policy_net):
    """Epsilon-greedy action selection driven by a neural Q function.

    With probability ``epsilon`` a uniformly random action index is returned.
    Otherwise ``environment.grid`` is fed to ``policy_net`` as a batch of one
    and the index of the largest output is returned.

    Returns:
        The chosen action index as a plain Python ``int``.
    """
    if np.random.uniform(0,1) < epsilon:
        return int(np.random.choice(range(len(environment.action_space)))) # Exploration

    # Build the input on the device that holds the network (CPU for a parameter-free module)
    first_param = next(policy_net.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device('cpu')
    current_state = torch.as_tensor(np.array(environment.grid), dtype=torch.float32, device=net_device).unsqueeze(0)

    # Pure inference: no autograd graph is needed to pick an action
    with torch.no_grad():
        q = policy_net(current_state)

    return torch.argmax(q).item() # Exploit

Defining the optimizing function


In [31]:
def optimize(memory, policy_net, target_net, gamma, optimizer, batch_size = 32):
    """Run one DQN gradient step on a random mini-batch from the replay memory.

    Args:
        memory: sequence of ``(state, action, reward, next_state, done)`` tuples.
        policy_net: network being trained.
        target_net: network used to evaluate the next states (not updated here).
        gamma: discount factor.
        optimizer: optimizer bound to the parameters of ``policy_net``.
        batch_size: number of transitions sampled; nothing happens while the
            memory holds fewer transitions than this.

    Returns:
        None.
    """
    if len(memory) < batch_size:
        return

    batch = random.sample(memory, batch_size)
    states, actions, step_rewards, next_states, dones = zip(*batch)

    # Build the batch on the device that holds the network (CPU for a parameter-free module)
    first_param = next(policy_net.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device('cpu')

    def as_float(values):
        return torch.as_tensor(np.array(values), dtype=torch.float32, device=net_device)

    state_batch = as_float(states)
    action_batch = torch.as_tensor(np.array(actions), dtype=torch.int64, device=net_device).unsqueeze(1)
    reward_batch = as_float(step_rewards)
    next_state_batch = as_float(next_states)
    done_batch = as_float(dones)

    # Q value of the action actually taken; squeeze only the action axis so a
    # batch of one keeps shape (1,) and still matches the target
    q_values = policy_net(state_batch).gather(1, action_batch).squeeze(1)

    # Compute target Q-values using the target network
    with torch.no_grad():
        max_next_q_values = target_net(next_state_batch).max(1)[0]
        # (1 - done) removes the bootstrap term for terminal transitions
        target_q_values = reward_batch + gamma * max_next_q_values * (1 - done_batch)

    loss = nn.functional.mse_loss(q_values, target_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Defining the Deep Q-Learning function

# **The grid containing the drone's position should be fed to the network as the state, not just the state index as in Q-Learning.**

In [36]:
def deep_q_learning(env, memory, policy_net, target_net, optimizer, alpha=0.01, gamma=0.9,  epsilon=1, epsilon_decay=0.005, target_update_freq = 400, episodes = 1000, max_iter_episode = None):
    """Train ``policy_net`` with Deep Q-Learning, using the whole grid as the state.

    Args:
        env: environment exposing ``reset()``, ``step(action)``, ``grid`` and
            ``action_space``.
        memory: replay buffer supporting ``append`` and ``len``.
        policy_net: network being trained.
        target_net: network used for the TD targets; it receives a copy of the
            policy weights whenever the step counter is a multiple of
            ``target_update_freq``.
        optimizer: optimizer bound to the parameters of ``policy_net``.
        alpha: unused; kept so existing calls keep working (the learning rate
            lives in ``optimizer``).
        gamma: discount factor.
        epsilon: initial exploration rate of the epsilon-greedy policy.
        epsilon_decay: after every episode epsilon is multiplied by
            ``exp(-epsilon_decay)`` for as long as it is above 0.1.
        target_update_freq: number of steps between target-network updates.
        episodes: number of training episodes.
        max_iter_episode: optional cap on the steps of one episode; ``None``
            (the default) lets an episode run until the goal is reached.

    Returns:
        ``(rewards, steps)``: the total reward of every episode and the number
        of non-terminal steps taken overall.
    """
    steps = 0
    rewards = []

    for e in range(episodes):

        env.reset()  # put the drone back on the start cell
        total_reward = 0
        step_per_episode = 0

        while True:  # one step per pass, until the goal or the optional step limit is hit

            current_state = np.copy(env.grid)

            action = compute_action_torch(env, epsilon, policy_net)  # epsilon-greedy choice

            _, new_state, reward, done = env.step(action)

            # Store a snapshot: the environment may keep mutating the grid object
            # it returned, which would silently rewrite transitions already in memory
            memory.append((current_state, action, reward, np.copy(new_state), done))

            total_reward += reward

            optimize(memory, policy_net, target_net, gamma, optimizer)

            if steps % target_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())

            if done:
                print(f"Episode: {e}, Reward: {total_reward}, Steps in the episode: {step_per_episode}")
                break

            steps += 1
            step_per_episode += 1

            if max_iter_episode is not None and step_per_episode >= max_iter_episode:
                break

        # Exponential decay of the exploration rate; it stops shrinking once it is at or below 0.1
        if epsilon>0.1:
            epsilon *= np.exp(-epsilon_decay)

        rewards.append(total_reward)

    return rewards, steps

Function to save the weights of the policy_net and the target_net

In [15]:
def save_weights(filename, neural_net):
    """Write the ``state_dict`` of ``neural_net`` to ``filename``."""
    weights = neural_net.state_dict()
    torch.save(weights, filename)

In [16]:
def load_weights(filename, blank_net):
    """Load a saved ``state_dict`` from ``filename`` into ``blank_net`` in place.

    The tensors are mapped onto the device that currently holds ``blank_net``,
    so weights saved on a GPU can be restored on a CPU-only machine.
    """
    first_param = next(blank_net.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    blank_net.load_state_dict(torch.load(filename, map_location=target_device))

Initialization of the agent

In [37]:
# Two networks with identical architecture; the target network starts as an exact copy of the policy network.
policy_net = DeepQNetwork(STATE_SIZE[0]*STATE_SIZE[1], 100, ACTION_SIZE)
target_net = DeepQNetwork(STATE_SIZE[0]*STATE_SIZE[1], 100, ACTION_SIZE)
target_net.load_state_dict(policy_net.state_dict())

learning_rate = 0.001
memory_size = 300

# Only the policy network is optimized; the replay memory drops its oldest entries beyond memory_size.
optimizer = optim.Adam(policy_net.parameters(), lr = learning_rate)
memory = deque(maxlen = memory_size)

Running of the training

In [38]:
# Train on the simple map only; the other two runs are disabled.
dql_simple = deep_q_learning(environment_simple, memory, policy_net, target_net, optimizer)
#dql_mid = deep_q_learning(environment_mid, memory, policy_net, target_net, optimizer)
#dql_hard = deep_q_learning(environment_hard, memory, policy_net, target_net, optimizer)

Episode: 0, Reward: 1.0, Steps in the episode: 1342
Episode: 1, Reward: 1.0, Steps in the episode: 262
Episode: 2, Reward: 1.0, Steps in the episode: 90
Episode: 3, Reward: 1.0, Steps in the episode: 349
Episode: 4, Reward: 1.0, Steps in the episode: 1219
Episode: 5, Reward: 1.0, Steps in the episode: 574
Episode: 6, Reward: 1.0, Steps in the episode: 495
Episode: 7, Reward: 1.0, Steps in the episode: 1049
Episode: 8, Reward: 1.0, Steps in the episode: 1572
Episode: 9, Reward: 1.0, Steps in the episode: 300
Episode: 10, Reward: 1.0, Steps in the episode: 777
Episode: 11, Reward: 1.0, Steps in the episode: 529
Episode: 12, Reward: 1.0, Steps in the episode: 764
Episode: 13, Reward: 1.0, Steps in the episode: 226
Episode: 14, Reward: 1.0, Steps in the episode: 2435
Episode: 15, Reward: 1.0, Steps in the episode: 451
Episode: 16, Reward: 1.0, Steps in the episode: 294
Episode: 17, Reward: 1.0, Steps in the episode: 153
Episode: 18, Reward: 1.0, Steps in the episode: 175
Episode: 19, Rewar

KeyboardInterrupt: 

---

# Spatial Computing for Path Planning - SCPP - Personalized algorithm

---

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

Checking availability of CUDA

In [None]:
# Select CUDA when available, otherwise the CPU, and report the choice.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


Defining a new type of environment with a different step() method

In [None]:
class SCPPEnvironment(DroneGrid):
    """Environment for the SCPP algorithm; grid handling is inherited from DroneGrid."""
    def __init__(self, grid):
        super().__init__(grid)

    def step(self, action):
        """Placeholder: not implemented yet, so a call currently returns None."""
        pass

Defining the neural network

In [None]:
class NeuralAgent(nn.Module):
    """Two-layer perceptron: one ReLU hidden layer followed by a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw (unnormalized) outputs of the last layer for ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

Function to compute the input tensor during training

In [None]:
def calculate_input_tensor(env):
    """Describe the surroundings of the drone as a (1, 6) float tensor.

    The position is ``(x, y)`` and cells are read as ``grid[y, x]``, matching
    the indexing used by the environment classes of this notebook.

    Features, in order:
        0-3: distance to the nearest obstacle (cell value 1) to the left, right,
             up and down of the drone; 0 when there is none in that direction.
        4:   signed horizontal distance to the goal (goal x minus drone x).
        5:   signed vertical distance to the goal (goal y minus drone y).

    The goal is the first cell holding the value 2; when the grid has no such
    cell, ``env.goal_pos`` is used instead.
    """
    # Accept both a list of lists and a NumPy array
    grid = np.asarray(env.grid)
    height, width = grid.shape

    # Get the agent's position
    x, y = env.current_pos

    # Initialize the input tensor with zeros
    input_tensor = np.zeros(6)

    # Distance to the nearest obstacle to the left (same row, decreasing x)
    for i in range(x-1, -1, -1):
        if grid[y, i] == 1:
            input_tensor[0] = x - i
            break

    # Distance to the nearest obstacle to the right (same row, increasing x)
    for i in range(x+1, width):
        if grid[y, i] == 1:
            input_tensor[1] = i - x
            break

    # Distance to the nearest obstacle upwards (same column, decreasing y)
    for j in range(y-1, -1, -1):
        if grid[j, x] == 1:
            input_tensor[2] = y - j
            break

    # Distance to the nearest obstacle downwards (same column, increasing y)
    for j in range(y+1, height):
        if grid[j, x] == 1:
            input_tensor[3] = j - y
            break

    # Locate the goal: np.argwhere yields (row, column), i.e. (y, x)
    goal_cells = np.argwhere(grid == 2)
    if len(goal_cells) > 0:
        goal_y, goal_x = goal_cells[0]
    else:
        goal_x, goal_y = env.goal_pos

    input_tensor[4] = goal_x - x  # horizontal distance to the goal
    input_tensor[5] = goal_y - y  # vertical distance to the goal

    # Convert to a PyTorch tensor with a leading batch dimension
    input_tensor = torch.tensor(input_tensor, dtype=torch.float32).unsqueeze(0)

    return input_tensor

Definition of the training

In [None]:
def scpp(agent, env, num_episodes, criterion, optimizer):
    """Train ``agent`` online: one TD-style gradient step after every environment step.

    Args:
        agent: module mapping a state tensor to one score per action.
        env: environment providing ``reset()`` and ``step(action)``.
        num_episodes: number of episodes to run.
        criterion: loss called as ``criterion(prediction, target)``.
        optimizer: optimizer stepped once per environment step.

    Returns:
        None; the episode reward is printed every 100 episodes.

    NOTE(review): this function does not match the environments defined in this
    notebook yet. ``DroneGrid.reset`` returns a ``(position, grid)`` tuple,
    ``SCPPEnvironment.step`` is a stub returning None, and ``QLEnvironment.step``
    returns ``(position, grid, reward, done)``, which is not the order unpacked
    below. Confirm the intended environment contract before running it.
    """
    # Train the agent for a specified number of episodes
    for episode in range(num_episodes):
        # Reset the environment
        state = env.reset()

        # Initialize the episode reward
        episode_reward = 0

        # Loop through the episode
        while True:
            # Convert the state to a PyTorch tensor
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

            # Forward pass
            action_probs = agent(state_tensor)

            # Sample an action from the action probabilities
            # NOTE(review): torch.multinomial expects non-negative weights, while
            # NeuralAgent ends in a plain linear layer; confirm whether a softmax
            # is meant to be applied first.
            action = torch.multinomial(action_probs, num_samples=1).item()

            # Take the action and observe the next state and reward
            next_state, reward, done, _ = env.step(action)

            # Update the episode reward
            episode_reward += reward

            # Convert the next state to a PyTorch tensor
            next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)

            # Calculate the target tensor
            # The discount factor is fixed at 0.99 and the target is computed without gradients
            with torch.no_grad():
                next_action_probs = agent(next_state_tensor)
                target = reward + 0.99 * torch.max(next_action_probs)

            # Calculate the loss
            loss = criterion(action_probs[0, action], target)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update the state
            state = next_state

            # Check if the episode is done
            if done:
                break

        # Print the episode reward every 100 episodes
        if (episode + 1) % 100 == 0:
            print(f'Episode [{episode+1}/{num_episodes}], Episode Reward: {episode_reward:.2f}')