# Importing libraries

In [1]:
import numpy as np
import pickle
import os
import sys
import random
import time

## Defining constants

In [2]:
STATE_SIZE = (10,10)
ACTION_SIZE = 4

## Defining loading and saving of files

In [3]:
def load(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [4]:
def save(filename, Q_table):
    with open(filename, 'wb') as f:
        pickle.dump(Q_table, f)

## Defining the static environment



In [91]:
class DroneGrid():

    def __init__(self, grid):

        self.grid = grid
        self.empty_grid = grid
        self.grid_size = np.array(grid).shape
        self.observation_space = (self.grid_size[0]), (self.grid_size[1])
        self.action_space = [0, 1, 2, 3] # 4 discrete actions: 0 = up, 1 = down, 2 = left, 3 = right
        self.start_pos = (0, 0)  # Starting position at top left corner
        self.goal_pos = (self.grid_size[0] - 1, self.grid_size[1] - 1)  # Goal position at bottom right corner
        self.current_pos = self.start_pos  # Initialize current position
        self.grid[self.current_pos[1]][self.current_pos[0]] = 3

    def reset(self):
        self.current_pos = self.start_pos  # Reset current position to start position
        self.grid = self.empty_grid
        return self.current_pos, self.grid  # Return initial state


In [92]:
class QLEnvironment(DroneGrid):
    def __init__(self, grid):
        super().__init__(grid)

    def step(self, action):

        assert action in self.action_space, f"Invalid action {action}"  # Check if action is valid

        # Define movement based on action
        if action == 0:  # Up
            new_pos = (self.current_pos[0], self.current_pos[1] - 1)
        elif action == 1:  # Down
            new_pos = (self.current_pos[0], self.current_pos[1] + 1)
        elif action == 2:  # Left
            new_pos = (self.current_pos[0] - 1, self.current_pos[1])
        elif action == 3:  # Right
            new_pos = (self.current_pos[0] + 1, self.current_pos[1])

        # Check if new position is within bounds and not an obstacle
        if 0 <= new_pos[0] < self.grid_size[0] and 0 <= new_pos[1] < self.grid_size[1] and self.grid[new_pos[1]][new_pos[0]] != 1:

            self.current_pos = new_pos  # Update current position
            self.grid = self.empty_grid # Erase previous position of the drone

            # Check if goal state is reached
            done = (self.current_pos == self.goal_pos)

            # Calculate reward
            if done:
                reward = 1.0  # Positive reward for reaching the goal

            else:
                reward = 0 #Negative reward for non-goal state
                self.grid[new_pos[1]][new_pos[0]] = 3 # Update new position of the drone


        elif 0 <= new_pos[0] < self.grid_size[0] and 0 <= new_pos[1] < self.grid_size[1] and self.grid[new_pos[1]][new_pos[0]] == 1:
                done = False
                reward = -0.1 # Negative reward for going in a wall


        else:
            done = False
            reward = 0  # Negative reward for going out of bounds

        return self.current_pos, self.grid, reward, done

Function to compute an action in function of the epsilon-greedy algorith

In [7]:
def compute_action(current_state, Q_table, epsilon, environment):

    if np.random.uniform(0,1) < epsilon:
        return np.random.choice(range(len(environment.action_space)))

    else:
        return np.argmax(Q_table[current_state])

Loading a personalized map

In [93]:
map_simple = load('map_simple.pkl')
print(map_simple)

map_mid = load('map_mid.pkl')
print(map_mid)

map_hard = load('map_hard.pkl')
print(map_hard)

[[0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [1, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 1, 1, 1], [0, 1, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


Creating an instance of the environment through the loaded map

In [94]:
environment_simple = QLEnvironment(map_simple)
print(environment_simple.observation_space)
print(environment_simple.action_space)

environment_mid = QLEnvironment(map_mid)
print(environment_mid.observation_space)
print(environment_mid.action_space)

environment_hard = QLEnvironment(map_hard)
print(environment_hard.observation_space)
print(environment_hard.action_space)

(10, 10)
[0, 1, 2, 3]
(10, 10)
[0, 1, 2, 3]
(10, 10)
[0, 1, 2, 3]


---
# Q-Learning
---


In [None]:
def q_learning(env, alpha=1, gamma=0.99,  epsilon=0.99, epsilon_decay=0.00025, episodes = 10001, max_iter_episode = 500):
    start_time = time.time()
    Q = np.zeros((env.grid_size[0]*env.grid_size[1], len(env.action_space)), dtype=np.float32) #Initialize the Q table to all 0s
    rewards = []
    mean_reward_for_1k_episode = 0

    for e in range(episodes): #Run 1k training runs

        state, _ = env.reset() #Part of OpenAI where you need to reset at the start of each run
        total_reward = 0 #Set initial reward to 0
        iteration = 0

        if e % 1000 == 0:
            mean_reward_for_1k_episode = float(mean_reward_for_1k_episode / 1000)
            rewards.append(mean_reward_for_1k_episode)
            print(f"Episode: {e}, Mean reward: {mean_reward_for_1k_episode}, Epsilon: {epsilon}")
            mean_reward_for_1k_episode = 0


        while True: #Loop until done == True
            #IF random number is less than epsilon grab the random action else grab the argument max of Q[state]

            current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0] # Obtain the index of the state

            action = compute_action(current_state_index, Q, epsilon, env) # Compute the action for the current state in function of the epsilon_greedy

            posp1, _, reward, done = env.step(action) #Send your action to OpenAI and get back the tuple

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0]

            total_reward += reward #Increment your reward
            mean_reward_for_1k_episode += reward

            Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma * np.max(Q[state_tp1_index]) - Q[current_state_index][action])

             #Make sure to keep random at 10%

            if done:
                #print(f"Episode: {e}, Reward: {total_reward}, Epsilon: {epsilon}")
                break

            iteration += 1

            if iteration >= max_iter_episode:
                #print(f"Episode: {e}, Reward: {total_reward}")
                break


        if epsilon>0.1:
            epsilon *= np.exp(-epsilon_decay)

        rewards.append(total_reward)

    delta_time = time.time() - start_time
    print(f"Time: {delta_time}")

    return Q, rewards, delta_time

Effective running of the Q-Learning and saving of the trained Q-Table

In [None]:
q_simple, rewards_q_simple, time_q_simple = q_learning(environment_simple)
save('Trajectory - Simple - Q-Learning.pkl', q_simple)
save('Rewards - Simple - Q-Learning.pkl', rewards_q_simple)
save('Time - Simple - Q-Learning.pkl', time_q_simple)

q_mid, rewards_q_mid, time_q_mid = q_learning(environment_mid)
save('Trajectory - Mid - Q-Learning.pkl', q_mid)
save('Rewards - Mid - Q-Learning.pkl', rewards_q_mid)
save('Time - Mid - Q-Learning.pkl', time_q_mid)

q_hard, rewards_q_hard, time_q_hard = q_learning(environment_hard)
save('Trajectory - Hard - Q-Learning.pkl', q_hard)
save('Rewards - Hard - Q-Learning.pkl', rewards_q_hard)
save('Time - Hard - Q-Learning.pkl', time_q_hard)



Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: 0.04600000000003276, Epsilon: 0.7710127752407178
Episode: 2000, Mean reward: 0.8229999999999797, Epsilon: 0.60046535311555
Episode: 3000, Mean reward: 0.9215999999999895, Epsilon: 0.467642887213655
Episode: 4000, Mean reward: 0.9551999999999933, Epsilon: 0.36420064675978037
Episode: 5000, Mean reward: 0.9702999999999958, Epsilon: 0.28363974889163973
Episode: 6000, Mean reward: 0.979499999999997, Epsilon: 0.22089885854699348
Episode: 7000, Mean reward: 0.9872999999999982, Epsilon: 0.1720362040159844
Episode: 8000, Mean reward: 0.9910999999999985, Epsilon: 0.1339819304042855
Episode: 9000, Mean reward: 0.9934999999999989, Epsilon: 0.10434523231627953
Episode: 10000, Mean reward: 0.9943999999999992, Epsilon: 0.09997847803039572
Time: 17.49876308441162
Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: -1.3165999999993647, Epsilon: 0.7710127752407178
Episode: 2000, Mean reward: 0.4241999999999832, 

In [None]:
from google.colab import files

files.download('Trajectory - Simple - Q-Learning.pkl')
files.download('Trajectory - Mid - Q-Learning.pkl')
files.download('Trajectory - Hard - Q-Learning.pkl')

files.download('Rewards - Simple - Q-Learning.pkl')
files.download('Rewards - Mid - Q-Learning.pkl')
files.download('Rewards - Hard - Q-Learning.pkl')

files.download('Time - Simple - Q-Learning.pkl')
files.download('Time - Mid - Q-Learning.pkl')
files.download('Time - Hard - Q-Learning.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Loading Q-Learning trained Q-Table and checking if successful

In [None]:
trained_q_simple = load('Trajectorty - Simple - Q-Learning.pkl')
rewards_q_simple = load('Rewards - Simple - Q-Learning.pkl')
time_q_simple = load('Time - Simple - Q-Learning.pkl')
#print('simple', trained_q_simple)
print('rewards simple', len(rewards_q_simple))
print('time simple', time_q_simple)

#trained_q_mid = load('Mid - Q-Learning.pkl')
#print('mid', trained_q_mid)
rewards_q_mid = load('Rewards - Mid - Q-Learning.pkl')
time_q_mid = load('Time - Mid - Q-Learning.pkl')

print('rewards mid', len(rewards_q_mid))
print('time mid', time_q_mid)

#trained_q_hard = load('Hard - Q-Learning.pkl')
#print('hard', trained_q_hard)
rewards_q_hard = load('Rewards - Hard - Q-Learning.pkl')
time_q_hard = load('Time - Hard - Q-Learning.pkl')

print('rewards hard', len(rewards_q_hard))
print('time hard', time_q_hard)

rewards simple 15017
time simple 21.7232449054718
rewards mid 15017
time mid 22.858484029769897
rewards hard 15017
time hard 35.07642364501953


---

# SARSA

---

In [None]:
def sarsa(env, alpha=0.9, gamma=0.9,  epsilon=1, epsilon_decay=0.00025, episodes = 10001, max_iter_episode = 500):
    start_time = time.time()
    Q = np.zeros((env.grid_size[0]*env.grid_size[1], len(env.action_space)), dtype=np.float32) #Initialize the Q table to all 0s
    rewards = []
    mean_reward_for_1k_episode = 0

    for e in range(episodes): #Run 1k training runs

        state, _ = env.reset() #Part of OpenAI where you need to reset at the start of each run
        total_reward = 0 #Set initial reward to 0
        iteration = 0

        if e % 1000 == 0:
            mean_reward_for_1k_episode = float(mean_reward_for_1k_episode / 1000)
            rewards.append(mean_reward_for_1k_episode)
            print(f"Episode: {e}, Mean reward: {mean_reward_for_1k_episode}, Epsilon: {epsilon}")
            mean_reward_for_1k_episode = 0

        while True: #Loop until done == True
            #IF random number is less than epsilon grab the random action else grab the argument max of Q[state]

            current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0] # Obtain the index of the state

            action = compute_action(current_state_index, Q, epsilon, env) # Compute the action for the current state using Q-Table

            posp1, _, reward, done = env.step(action) # Send the action to the environment and obtain the new position, the reward and the termination flag

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0] # Compute the index of the state at t+1
            action_tp1 = compute_action(state_tp1_index, Q, epsilon, env) # Compute the action for the next state using Q-Table

            total_reward += reward # Increment the reward
            mean_reward_for_1k_episode += reward

            Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma*Q[state_tp1_index][action_tp1] - Q[current_state_index][action])

             #Make sure to keep random at 10%

            if done:
                #print(f"Episode: {e}, Reward: {total_reward}")
                break

            iteration += 1

            if iteration >= max_iter_episode:
                #print(f"Episode: {e}, Reward: {total_reward}")
                break

        if epsilon > 0.1:
            epsilon *= np.exp(-epsilon_decay)

        rewards.append(total_reward)

    delta_time = time.time() - start_time
    print(f"Time: {delta_time}")
    return Q, rewards, delta_time

Effective running of SARSA and saving of the trained Q-Table

In [None]:
s_simple, rewards_s_simple, time_s_simple = sarsa(environment_simple)
save('Trajectory - Simple - SARSA.pkl', s_simple)
save('Rewards - Simple - SARSA.pkl', rewards_s_simple)
save('Time - Simple - SARSA.pkl', time_s_simple)

s_mid, rewards_s_mid, time_s_mid = sarsa(environment_mid)
save('Trajectory - Mid - SARSA.pkl', s_mid)
save('Rewards - Mid - SARSA.pkl', rewards_s_mid)
save('Time - Mid - SARSA.pkl', time_s_mid)

s_hard, rewards_s_hard, time_s_hard = sarsa(environment_hard)
save('Trajectory - Hard - SARSA.pkl', s_hard)
save('Rewards - Hard - SARSA.pkl', rewards_s_hard)
save('Time - Hard - SARSA.pkl', time_s_hard)

Episode: 0, Mean reward: 0.0, Epsilon: 1
Episode: 1000, Mean reward: -1.0357000000001755, Epsilon: 0.7788007830714335
Episode: 2000, Mean reward: 0.014000000000024236, Epsilon: 0.6065306597126773
Episode: 3000, Mean reward: 0.3898999999999906, Epsilon: 0.4723665527410655
Episode: 4000, Mean reward: 0.6622999999999706, Epsilon: 0.36787944117149507
Episode: 5000, Mean reward: 0.7655999999999777, Epsilon: 0.2865047968602415
Episode: 6000, Mean reward: 0.8072999999999805, Epsilon: 0.22313016014847828
Episode: 7000, Mean reward: 0.8397999999999844, Epsilon: 0.17377394345048924
Episode: 8000, Mean reward: 0.9152999999999925, Epsilon: 0.13533528323665214
Episode: 9000, Mean reward: 0.8784999999999931, Epsilon: 0.10539922456189901
Episode: 10000, Mean reward: 0.8834999999999935, Epsilon: 0.0999835106590795
Time: 72.65182423591614
Episode: 0, Mean reward: 0.0, Epsilon: 1
Episode: 1000, Mean reward: -3.4005999999975582, Epsilon: 0.7788007830714335
Episode: 2000, Mean reward: -1.6532999999993925,

Loading SARSA trained Q-Table and checking if successful

In [None]:
trained_s_simple = load('Trajectory - Simple - SARSA.pkl')
print('simple', trained_s_simple)

trained_s_mid = load('Trajectory - Mid - SARSA.pkl')
print('mid', trained_s_mid)

trained_s_hard = load('Trajectory - Hard - SARSA.pkl')
print('hard',trained_s_hard)

simple [[0.44567177 0.44728616 0.44286552 0.81373316]
 [0.4501721  0.6978284  0.4445712  0.70594054]
 [0.44446135 0.68351907 0.44634077 0.69530284]
 [0.6652588  0.67394763 0.44448787 0.5021277 ]
 [0.504806   0.68046397 0.5083074  0.5097628 ]
 [0.5007196  0.4999843  0.8650847  0.3661518 ]
 [0.         0.         0.         0.        ]
 [0.6634075  0.6457142  0.47332165 0.8003355 ]
 [0.66966087 0.80144984 0.67579293 0.66756135]
 [0.64987856 0.9199013  0.6313867  0.64306194]
 [0.44743183 0.4493168  0.44489297 0.4392594 ]
 [0.70790976 0.39580634 0.44302082 0.43136746]
 [0.68225974 0.40249524 0.44756815 0.5032215 ]
 [0.67955256 0.40019432 0.6793649  0.6803811 ]
 [0.68197215 0.6691524  0.6737774  0.67031896]
 [0.506075   0.6755831  0.67561656 0.6654265 ]
 [0.65725535 0.67022943 0.528631   0.8997505 ]
 [0.47207808 0.85975456 0.6521792  0.65015733]
 [0.682654   0.65441257 0.6669527  0.6702138 ]
 [0.67464066 0.8960379  0.64969885 0.68135995]
 [0.4472489  0.4395988  0.4448456  0.4194471 ]
 [0.39

In [None]:
from google.colab import files

files.download('Trajectory - Simple - SARSA.pkl')
files.download('Trajectory - Mid - SARSA.pkl')
files.download('Trajectory - Hard - SARSA.pkl')

files.download('Rewards - Simple - SARSA.pkl')
files.download('Rewards - Mid - SARSA.pkl')
files.download('Rewards - Hard - SARSA.pkl')

files.download('Time - Simple - SARSA.pkl')
files.download('Time - Mid - SARSA.pkl')
files.download('Time - Hard - SARSA.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

---

# Alternating Q-Learning / SARSA

---

In [None]:
def alternating(env, alpha=1, gamma=0.9,  epsilon=0.99, epsilon_decay=0.00025, episodes = 10001, max_iter_episode = 500):
    start_time = time.time()
    Q = np.zeros((env.grid_size[0]*env.grid_size[1], len(env.action_space)), dtype=np.float32) #Initialize the Q table to all 0s
    rewards = []
    mean_reward_for_1k_episode = 0

    for e in range(episodes): #Run 1k training runs

        state, _ = env.reset() #Part of OpenAI where you need to reset at the start of each run
        total_reward = 0 #Set initial reward to 0
        iteration = 0

        if e % 1000 == 0:
            mean_reward_for_1k_episode = float(mean_reward_for_1k_episode / 1000)
            rewards.append(mean_reward_for_1k_episode)
            print(f"Episode: {e}, Mean reward: {mean_reward_for_1k_episode}, Epsilon: {epsilon}")
            mean_reward_for_1k_episode = 0

        while True: #Loop until done == True

            random_num = random.random() # Generate a random number between 0 and 1

            current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0] # Obtain the index of the state

            action = compute_action(current_state_index, Q, epsilon, env) # Compute the action for the current state using Q-Table

            posp1, _, reward, done = env.step(action) # Send the action to the environment and obtain the new position, the reward and the termination flag

            state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0] # Compute the index of the state at t+1
            action_tp1 = compute_action(state_tp1_index, Q, epsilon, env) # Compute the action for the next state using Q-Table

            total_reward += reward # Increment the reward
            mean_reward_for_1k_episode += reward

            if (random_num <= 0.5): # We use Q-learning
                Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma * np.max(Q[state_tp1_index]) - Q[current_state_index][action])

            else: # We use SARSA
                Q[current_state_index][action] = Q[current_state_index][action] + alpha * (reward + gamma*Q[state_tp1_index][action_tp1] - Q[current_state_index][action])

            if done:
                #print(f"Episode: {e}, Reward: {total_reward}")
                break


            iteration += 1

            if iteration >= max_iter_episode:
                #print(f"Episode: {e}, Reward: {total_reward}")
                break

        if epsilon > 0.1:
            epsilon *= np.exp(-epsilon_decay)


        rewards.append(total_reward)

    delta_time = time.time() - start_time
    print(f"Time: {delta_time}")

    return Q, rewards, delta_time

Effective running of alternating Q-Learning/SARSA and saving of the trained Q-Table

In [None]:
a_simple, rewards_a_simple, time_a_simple = alternating(environment_simple)
save('Trajectory - Simple - Alternating.pkl', a_simple)
save('Rewards - Simple - Alternating.pkl', rewards_a_simple)
save('Time - Simple - Alternating.pkl', time_a_simple)

a_mid, rewards_a_mid, time_a_mid = alternating(environment_mid)
save('Trajectory - Mid - Alternating.pkl', a_mid)
save('Rewards - Mid - Alternating.pkl', rewards_a_mid)
save('Time - Mid - Alternating.pkl', time_a_mid)

a_hard, rewards_a_hard, time_a_hard = alternating(environment_hard)
save('Trajectort - Hard - Alternating.pkl', a_hard)
save('Rewards - Hard - Alternating.pkl', rewards_a_hard)
save('Time - Hard - Alternating.pkl', time_a_hard)

Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: -0.3400000000001813, Epsilon: 0.7710127752407178
Episode: 2000, Mean reward: 0.6193999999999744, Epsilon: 0.60046535311555
Episode: 3000, Mean reward: 0.793299999999979, Epsilon: 0.467642887213655
Episode: 4000, Mean reward: 0.8805999999999828, Epsilon: 0.36420064675978037
Episode: 5000, Mean reward: 0.9251999999999906, Epsilon: 0.28363974889163973
Episode: 6000, Mean reward: 0.9428999999999921, Epsilon: 0.22089885854699348
Episode: 7000, Mean reward: 0.9587999999999939, Epsilon: 0.1720362040159844
Episode: 8000, Mean reward: 0.9723999999999964, Epsilon: 0.1339819304042855
Episode: 9000, Mean reward: 0.9846999999999978, Epsilon: 0.10434523231627953
Episode: 10000, Mean reward: 0.9808999999999964, Epsilon: 0.09997847803039572
Time: 36.44415831565857
Episode: 0, Mean reward: 0.0, Epsilon: 0.99
Episode: 1000, Mean reward: -2.2387999999984456, Epsilon: 0.7710127752407178
Episode: 2000, Mean reward: -0.04959999999995778

Loading alternating trained Q-Table and checking if successful

In [None]:
trained_a_simple = load('Trajectory - Simple - Alternating.pkl')
print(trained_a_simple)

trained_a_mid = load('Trajectory - Mid - Alternating.pkl')
print(trained_a_mid)

trained_a_hard = load('Trajectory - Hard - Alternating.pkl')
print(trained_a_hard)

[[0.06461082 0.0717898  0.06461082 0.0717898 ]
 [0.0717898  0.06461082 0.07976644 0.07976644]
 [0.0717898  0.06461082 0.05814974 0.07976644]
 [0.09847709 0.05233477 0.08862938 0.10941899]
 [0.09847709 0.12157665 0.09847709 0.10941899]
 [0.12157665 0.18530202 0.09847709 0.13508517]
 [0.         0.         0.         0.        ]
 [0.16677181 0.22876792 0.25418657 0.16677181]
 [0.13508517 0.13508517 0.25418657 0.0717898 ]
 [0.12157665 0.28242952 0.07976644 0.0717898 ]
 [0.06461082 0.15009463 0.0717898  0.07976644]
 [0.0717898  0.05814974 0.04710129 0.0717898 ]
 [0.0717898  0.0717898  0.04239116 0.05233477]
 [0.09847709 0.09847709 0.09847709 0.05814974]
 [0.12157665 0.12157665 0.12157665 0.18530202]
 [0.13508517 0.13508517 0.16677181 0.20589113]
 [0.20589113 0.20589113 0.12157665 0.25418657]
 [0.25418657 0.3138106  0.22876792 0.25418657]
 [0.13508517 0.28242952 0.18530202 0.22876792]
 [0.20589113 0.34867844 0.25418657 0.25418657]
 [0.0717898  0.15009463 0.05814974 0.05814974]
 [0.04239116 

In [None]:
from google.colab import files

files.download('Trajectory - Simple - Alternating.pkl')
files.download('Trajectory - Mid - Alternating.pkl')
files.download('Trajectory - Hard - Alternating.pkl')

files.download('Rewards - Simple - Alternating.pkl')
files.download('Rewards - Mid - Alternating.pkl')
files.download('Rewards - Hard - Alternating.pkl')

files.download('Time - Simple - Alternating.pkl')
files.download('Time - Mid - Alternating.pkl')
files.download('Time - Hard - Alternating.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

---

# Deep Q-Learning

---

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

Checking availability of CUDA

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


Defining the neural network

In [12]:
class DeepQNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DeepQNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)

        return x

Defining the exploration-exploit function


In [13]:
def compute_action_torch(environment, epsilon, policy_net):

    if np.random.uniform(0,1) < epsilon:
        return np.random.choice(range(len(environment.action_space))) # Exploration

    else:
        current_state = torch.FloatTensor(environment.grid).unsqueeze(0)
        q = policy_net(current_state)
        return torch.argmax(q).item() # Exploit

Defining the optimizing function


In [38]:
def optimize(memory, policy_net, target_net, gamma, optimizer, batch_size = 100):

    if len(memory) < batch_size:
        return

    batch = random.sample(memory, batch_size)
    state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)

    state_batch = torch.FloatTensor(np.array(state_batch))
    action_batch = torch.LongTensor(np.array(action_batch)).unsqueeze(1)
    reward_batch = torch.FloatTensor(np.array(reward_batch))
    next_state_batch = torch.FloatTensor(np.array(next_state_batch))
    done_batch = torch.FloatTensor(np.array(done_batch))

    # Compute Q-values for current states
    q_values = policy_net(state_batch).gather(1, action_batch).squeeze()

    # Compute target Q-values using the target network
    with torch.no_grad():
        max_next_q_values = target_net(next_state_batch).max(1)[0]
        target_q_values = reward_batch + gamma * max_next_q_values * (1 - done_batch)

    loss = nn.MSELoss()(q_values, target_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Function to save the weights of the policy_net and the target_net

In [15]:
def save_weights(filename, neural_net):
    torch.save(neural_net.state_dict(), filename)

In [16]:
def load_weights(filename, blank_net):
    blank_net.load_state_dict(torch.load(filename))

Defining the Deep Q-Learning function - il faut implémenter le fait que le learning commence au bout d'une certaine itération

In [145]:
def deep_q_learning(env, memory, policy_net, target_net, optimizer, alpha=0.1, gamma=0.9,  epsilon=0.1, epsilon_decay=0.006, target_update_freq = 1000, episodes = 1000):

    steps = 0
    rewards = []
    start_time = time.time()
    #environments_name = ['Simple', 'Mid', 'Hard']

    for e in range(episodes): #Run 1k training runs

        #env = random.choice(environments)
        #print('Environment :', environments_name[environments.index(env)])
        state = env.reset() #Part of OpenAI where you need to reset at the start of each run
        total_reward = 0 #Set initial reward to 0
        step_per_episode = 0
        time_per_episode = time.time()

        while True: #Loop until done == True
            #IF random number is less than epsilon grab the random action else grab the argument max of Q[state]

            #current_state_index = env.current_pos[0] + env.current_pos[1]*env.observation_space[0] # Obtain the index of the state

            current_state = np.copy(env.grid)

            action = compute_action_torch(env, epsilon, policy_net) # Compute the action for the current state in function of the epsilon_greedy

            posp1, new_state, reward, done = env.step(action)

            #state_tp1_index = posp1[0] + posp1[1]*env.observation_space[0]

            memory.append((current_state, action, reward, new_state, done))

            total_reward += reward #Increment your reward

            optimize(memory, policy_net, target_net, gamma, optimizer)

            if steps % target_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())

            if step_per_episode >= 4000:
                done = True

            if done:
                print(f"Episode: {e}, Reward: {total_reward}, Steps in the episode: {step_per_episode}", f"Time: {time.time() - time_per_episode}")
                break

            steps += 1
            step_per_episode += 1


        if epsilon>0.1:
            epsilon *= np.exp(-epsilon_decay)

        rewards.append(total_reward)

    delta_time = start_time - time.time()
    print(f"Time: {delta_time}")

    save_weights('policy_net_weights_simple.pth', policy_net)
    save_weights('target_net_weights_simple.pth', target_net)

    return rewards, delta_time, steps

Initialization of the agent

In [142]:
policy_net = DeepQNetwork(STATE_SIZE[0]*STATE_SIZE[1], 150, ACTION_SIZE)
target_net = DeepQNetwork(STATE_SIZE[0]*STATE_SIZE[1], 150, ACTION_SIZE)
target_net.load_state_dict(policy_net.state_dict())

learning_rate = 1
memory_size = 100000

optimizer = optim.Adam(policy_net.parameters(), lr = learning_rate)
memory = deque(maxlen = memory_size)

environments = [environment_simple, environment_mid, environment_hard]

Running of the training

In [146]:
dql_training, delta_time, _ = deep_q_learning(environment_simple, memory, policy_net, target_net, optimizer)
#dql_mid = deep_q_learning(environment_mid, memory, policy_net, target_net, optimizer)
#dql_hard = deep_q_learning(environment_hard, memory, policy_net, target_net, optimizer)

Episode: 0, Reward: 0.7, Steps in the episode: 67 Time: 0.34408140182495117
Episode: 1, Reward: -0.5000000000000002, Steps in the episode: 252 Time: 1.252807855606079
Episode: 2, Reward: 0.4, Steps in the episode: 128 Time: 0.6090304851531982
Episode: 3, Reward: 0.9, Steps in the episode: 78 Time: 0.41440320014953613
Episode: 4, Reward: -0.30000000000000004, Steps in the episode: 165 Time: 0.768040657043457
Episode: 5, Reward: 0.8, Steps in the episode: 67 Time: 0.32953929901123047
Episode: 6, Reward: 0.10000000000000009, Steps in the episode: 234 Time: 1.1532926559448242
Episode: 7, Reward: 0.20000000000000007, Steps in the episode: 87 Time: 0.464993953704834
Episode: 8, Reward: 0.7, Steps in the episode: 55 Time: 0.2763967514038086
Episode: 9, Reward: 0.6, Steps in the episode: 25 Time: 0.13501954078674316
Episode: 10, Reward: 0.4, Steps in the episode: 131 Time: 0.6071820259094238
Episode: 11, Reward: -0.30000000000000004, Steps in the episode: 121 Time: 0.5975897312164307
Episode: 

KeyboardInterrupt: 

Initializing empty models

In [None]:
policy_net_trained = DeepQNetwork(STATE_SIZE[0]*STATE_SIZE[1], 150, ACTION_SIZE)
target_net_trained = DeepQNetwork(STATE_SIZE[0]*STATE_SIZE[1], 150, ACTION_SIZE)

Loading trained weights

In [None]:
load_weights('policy_net_weights.pth', policy_net_trained)
load_weights('target_net_weights.pth', target_net_trained)

---

# Spatial Computing for Path Planning - SCPP - Personalized algorithm

---

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

Checking availability of CUDA

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


Defining the neural network

In [95]:
class NeuralAgent(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralAgent, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.softmax(x)

        return x

Function to compute the input tensor during training

In [96]:
def calculate_input_tensor(env):
    # Get the dimensions of the grid
    height, width = env.observation_space

    # Get the agent's position
    x, y = env.current_pos

    # Initialize the input tensor with zeros
    input_tensor = np.zeros(6)

    # Calculate the distance to the nearest obstacle in the left direction
    for i in range(x-1, -1, -1):
        if env.grid[y][i] == 1:
            input_tensor[0] = x - i
            break

    # Calculate the distance to the nearest obstacle in the right direction
    for i in range(x+1, height):
        if env.grid[y][i] == 1:
            input_tensor[1] = i - x
            break

    # Calculate the distance to the nearest obstacle in the up direction
    for j in range(y-1, -1, -1):
        if env.grid[j][x] == 1:
            input_tensor[2] = y - j
            break

    # Calculate the distance to the nearest obstacle in the down direction
    for j in range(y+1, width):
        if env.grid[j][x] == 1:
            input_tensor[3] = j - y
            break

    # Calculate the distance to the goal in the horizontal direction
    # Check if the goal exists in the grid before accessing its position
    goal_indices = np.argwhere(env.grid == 2)
    if goal_indices.size > 0:  # Check if goal_indices is not empty
        goal_position = goal_indices[0]  # Access the first goal position if it exists
        input_tensor[4] = goal_position[1] - y
        input_tensor[5] = goal_position[0] - x
    else:
        # Handle the case where the goal is not found (e.g., set distances to a default value)
        input_tensor[4] = 0  # Or some other appropriate default
        input_tensor[5] = 0  # Or some other appropriate default

    # Convert the input tensor to a PyTorch tensor
    input_tensor = torch.tensor(input_tensor, dtype=torch.float32).unsqueeze(0)

    return input_tensor

Epsilon-greedy for SCPP

In [124]:
def compute_action_scpp_torch(environment, epsilon, agent):

    if np.random.uniform(0,1) < epsilon:
        return np.random.choice(range(4)) # Exploration

    else:
        probs = agent(environment)
        return torch.argmax(probs).item() # Exploit

Definition of the training

# DOIT IMPLEMENTER EPSILON-GREEDY

In [121]:
def scpp(agent, env, num_episodes, criterion, optimizer, epsilon = 1, epsilon_decay = 0.003):
    # Train the agent for a specified number of episodes

    start_time = time.time()
    rewards = []

    for episode in range(num_episodes):
        # Reset the environment
        state = env.reset()

        # Initialize the episode reward
        episode_reward = 0
        steps = 0
        # Loop through the episode
        while True:
            # Convert the state to a PyTorch tensor
            state_tensor = calculate_input_tensor(env)

            # Forward pass
            action = compute_action_scpp_torch(state_tensor, epsilon, agent)

            # Sample an action from the action probabilities
           # action = torch.multinomial(action_probs, num_samples=1).item()


            # Take the action and observe the next state and reward
            posp1, next_state, reward, done = env.step(action)

            # Update the episode reward
            episode_reward += reward

            # Convert the next state to a PyTorch tensor
            next_state_tensor = calculate_input_tensor(env)

            # Calculate the target tensor

            with torch.no_grad():
                target = agent(next_state_tensor)

            # Calculate the loss
            loss = criterion(target, torch.tensor([action]))

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update the state
            state = next_state
            if steps > 1000:
              done = True
            # Check if the episode is done
            if done:
                break

            steps +=1

        rewards.append(episode_reward)

        # Print the episode reward every 100 episodes
       # if (episode + 1) % 100 == 0:
            #print(f'Episode [{episode+1}/{num_episodes}], Episode Reward: {episode_reward:.2f}')
        print(f"Episode: {episode}, Reward: {episode_reward}")

    save_weights('scpp_weights_simple.pth', policy_net)

    return rewards

Initializing an agent

In [122]:
scpp_agent = NeuralAgent(6, 128, 4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(scpp_agent.parameters(), lr=0.001)

Running of the training of the SCPP Agent

In [125]:
scpp(scpp_agent, environment_simple, 1000, criterion, optimizer)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Loading of empty agent

Loading of trained weights