In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/RL/RL

Mounted at /content/drive
/content/drive/MyDrive/RL/RL


In [None]:
!pip install gymnasium

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gymnasium
  Downloading gymnasium-0.28.1-py3-none-any.whl (925 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.5/925.5 kB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
Collecting jax-jumpy>=1.0.0 (from gymnasium)
  Downloading jax_jumpy-1.0.0-py3-none-any.whl (20 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, jax-jumpy, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.28.1 jax-jumpy-1.0.0


In [None]:
!pip install -q swig
!pip install -q gymnasium[box2d]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone


In [None]:
import os
import sys
import torch
import gymnasium as gym
import time
import math
sys.path.append(os.path.abspath("../RL/agent/"))
sys.path.append(os.path.abspath("../RL/Network/"))
from agent import *

In [None]:
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs on a runtime without an accelerator instead of failing on the first
# tensor allocation.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Tensors and modules created without an explicit device now land on `device`.
torch.set_default_device(device)

In [None]:
import os
import sys
import copy as cp
from torch.distributions.uniform import Uniform
import numpy as np

class Agent:
    """Minimal agent interface bound to an environment.

    Subclasses override `act` (choose an action) and `store` (record a
    transition). The base versions do nothing and return None.
    """

    def __init__(self, env):
        # Keep a handle on the environment so subclasses can query it.
        self.env = env

    def act(self, obs):
        """Choose an action for `obs`; the base implementation returns None."""
        return None

    def store(self, obs, action, new_obs, reward):
        """Record a transition; the base implementation ignores it."""
        return None

class AgentRandom(Agent):
    """Agent that ignores the observation and samples from `env.action_space`."""

    def __init__(self, env):
        Agent.__init__(self, env)

    def act(self, obs):
        """Return `env.action_space.sample()`; `obs` is not used."""
        action_space = self.env.action_space
        return action_space.sample()

    def store(self, obs, action, new_obs, reward):
        """Nothing is recorded: this agent does not learn."""
        return None

class AgentPolicy(Agent):
    """Agent that follows a fixed policy `pi`, looked up by observation."""

    def __init__(self, env, pi):
        Agent.__init__(self, env)
        # `pi` must support `pi[obs]` (e.g. a dict or an array indexed by state).
        self.pi = pi

    def act(self, obs):
        """Return the action the policy associates with `obs`."""
        chosen_action = self.pi[obs]
        return chosen_action

    def store(self, obs, action, new_obs, reward):
        """Nothing is recorded: the policy is fixed."""
        return None

class AgentDQN(Agent):
    """Epsilon-greedy Deep Q-Network agent.

    Reads the state size from `env.observation_space.shape[0]` and the number
    of actions from `env.action_space.n`, keeps transitions in a `Buffer` of
    capacity `5 * batch`, and trains a single `DQN` one sample at a time.
    """

    def __init__(self,env, gamma=0.99, batch=32):
        """
        Args:
            env: environment exposing `observation_space.shape` and
                `action_space.n`.
            gamma: discount factor applied to the bootstrapped next-state value.
            batch: number of transitions sampled per `replay` call.
        """
        super().__init__(env)
        self.taille_state = self.env.observation_space.shape[0] # Box observation
        self.taille_action = self.env.action_space.n
        self.batch_size = batch
        self.buffer = Buffer(5*self.batch_size)
        # Q-network
        self.dqn = DQN(self.taille_state, self.taille_action)
        # Hyper-parameters
        self.explore = 1.0           # current probability of a random action
        self.explore_min = 0.01      # decay stops once `explore` is at or below this
        self.explore_decay = 0.995   # multiplicative decay applied in `replay`
        self.gamma = gamma

    def act(self, state):
        """Return a random action with probability `explore`, else the greedy one."""
        if torch.rand(1).item() < self.explore: # epsilon-greedy exploration
            return torch.randint(0, self.taille_action, (1,)).item()
        return self.act_opt(state)

    def act_opt(self, state):
        """Return the greedy action (argmax of the Q-network output) as an int."""
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            return torch.argmax(self.dqn(state)).item()

    def setDQN(self, path):
        """Replace the Q-network with the object stored at `path`."""
        # NOTE(security): torch.load unpickles arbitrary objects; only load
        # checkpoints that come from a trusted source.
        self.dqn = torch.load(path)

    def store(self, state, reward, action, done,  state_suivant):
        """Append the transition [state, reward, action, done, state_suivant] to the buffer."""
        self.buffer.add([state, reward, action, done, state_suivant])

    def replay(self, batch_seuil, decay):
        """Sample `batch_size` transitions and fit the Q-network on each of them.

        Args:
            batch_seuil: minimum number of buffered transitions before learning starts.
            decay: when truthy, multiply `explore` by `explore_decay` after the
                updates (as long as `explore` is above `explore_min`).
        """
        # Also wait for at least `batch_size` transitions: sampling more items
        # than the buffer holds raises ValueError in random.sample.
        if self.buffer.getLen() < max(batch_seuil, self.batch_size):
            return
        mini_batch = self.buffer.sampleState(self.batch_size)
        for state, reward, action, done, state_suivant in mini_batch:
            # The regression target is a constant: build it without autograd so
            # the fit does not back-propagate through a second forward pass.
            with torch.no_grad():
                if done:
                    # Terminal transition: there is no future value to bootstrap.
                    y_action = reward
                else:
                    y_action = reward + self.gamma*torch.max(self.dqn(state_suivant)).item()
                # Start from the current predictions so only the taken action
                # contributes to the loss. Indexing `y[0]` relies on `state`
                # carrying a leading batch axis of size 1.
                y = self.dqn(state)
                y[0][action] = y_action
            self.dqn.fit(state, y, epoch=1)
        if decay and self.explore > self.explore_min:
            self.explore *= self.explore_decay

class AgentDQN_TargetNetwork(AgentDQN):
    """DQN agent whose bootstrap value comes from a periodically synced target network."""

    def __init__(self, env, gamma=0.99, batch=32, K=32):
        """
        Args:
            env, gamma, batch: forwarded to `AgentDQN.__init__`.
            K: number of Q-network updates between two target-network syncs.
        """
        super().__init__(env, gamma, batch)
        self.K = K # number of updates between target-network syncs
        self.counterK = 0
        self.dqnTarget = cp.deepcopy(self.dqn)

    def setDQN(self, path):
        """Load the Q-network from `path` and re-sync the target network with it."""
        super().setDQN(path)
        # Without this the target would keep the weights of the discarded network.
        self.dqnTarget = cp.deepcopy(self.dqn)

    def replay(self, batch_seuil, decay):
        """Sample `batch_size` transitions and fit the online network on each.

        The bootstrapped value `max_a Q(s', a)` is read from the target
        network; every `K` updates the target copies the online parameters.

        Args:
            batch_seuil: minimum number of buffered transitions before learning starts.
            decay: when truthy, multiply `explore` by `explore_decay` after the
                updates (as long as `explore` is above `explore_min`).
        """
        # Also wait for at least `batch_size` transitions: sampling more items
        # than the buffer holds raises ValueError in random.sample.
        if self.buffer.getLen() < max(batch_seuil, self.batch_size):
            return
        mini_batch = self.buffer.sampleState(self.batch_size)
        for state, reward, action, done, state_suivant in mini_batch:
            # Targets are constants: no autograd graph, so no gradient is
            # accumulated in the target network either.
            with torch.no_grad():
                if done:
                    # Terminal transition: no future value.
                    y_action = reward
                else:
                    # Bootstrap from the target network.
                    y_action = reward + self.gamma*torch.max(self.dqnTarget(state_suivant)).item()
                # Base the target vector on the ONLINE predictions so that only
                # the taken action produces a loss; using the target network
                # here would also pull every other Q-value toward stale values.
                # Indexing `y[0]` relies on a leading batch axis of size 1.
                y = self.dqn(state)
                y[0][action] = y_action
            # Update the online Q-network.
            self.dqn.fit(state, y, epoch=1)
            self.counterK += 1
            # Sync the target network every K updates.
            if self.counterK >= self.K:
                self.dqnTarget.updateParam(self.dqn)
                self.counterK = 0
        if decay and self.explore > self.explore_min:
            self.explore *= self.explore_decay



class AgentDDPG(Agent):
    """DDPG-style actor-critic agent for a continuous-action environment.

    Holds an actor (`NNActor`), a critic (`DQN` taking the concatenated
    state and action and producing one value) and a target copy of each.
    The critic is updated one transition at a time, the actor once per
    `replay` call on the sampled states, and the targets by Polyak averaging.
    """

    LR_CRITIC = 0.0002
    LR_ACTOR = 0.0003

    def __init__(self, env, gamma=0.99, batch=64, tau=0.001):
        """
        Args:
            env: environment with `continuous == True` and Box observation /
                action spaces (their `shape[0]` gives the network sizes).
            gamma: discount factor of the TD target.
            batch: number of transitions sampled per `replay` call.
            tau: Polyak coefficient used by `updateTargetDDPG`.
        """
        super().__init__(env)
        # continuous environment
        assert env.continuous == True
        # agent dimensions
        self.taille_state = self.env.observation_space.shape[0] # Box
        self.taille_action = self.env.action_space.shape[0] # Box
        self.batch_size = batch
        self.buffer = Buffer(100000)
        # Neural networks
        self.dqnCritic = DQN(self.taille_state + self.taille_action, 1)
        self.dqnTargetCritic = cp.deepcopy(self.dqnCritic)
        self.actor = NNActor(self.taille_state , self.taille_action) # a single actor
        self.actorTarget = cp.deepcopy(self.actor)
        # Optimizers. The two target optimizers are never stepped (targets are
        # updated by Polyak averaging); they are kept so the attributes exist.
        self.optQ = torch.optim.Adam(self.dqnCritic.parameters(), lr = self.LR_CRITIC)
        self.optTargetQ = torch.optim.Adam(self.dqnTargetCritic.parameters(), lr = self.LR_CRITIC)
        self.optActor = torch.optim.Adam(self.actor.parameters(), lr = self.LR_ACTOR)
        self.optActorTarget = torch.optim.Adam(self.actorTarget.parameters(), lr = self.LR_ACTOR)
        # Loss
        self.f_loss = torch.nn.MSELoss()
        # Hyper-parameters
        self.explore = 1.0           # probability of a uniformly random action
        self.explore_min = 0.01
        self.explore_decay = 0.995
        self.gamma = gamma
        self.tau = tau               # Polyak coefficient for the target networks
        self.sigma = 0.1             # scale of the Gaussian exploration noise
        self.step_count = 0          # number of `act` calls so far
        self.start_learning = 10000  # buffered transitions required before learning

    def act(self, state):
        """Return an exploratory action as a numpy array.

        With probability `explore` (and only when the environment id is
        "LunarLander-v2") the action is drawn uniformly in [-1, 1]; otherwise
        it is the actor output plus Gaussian noise, clipped to [-1, 1].
        """
        self.step_count += 1
        if torch.rand(1).item() < self.explore: # epsilon-greedy style exploration
            if self.env.unwrapped.spec.id == "LunarLander-v2":
                return np.random.uniform(-1,1,(self.taille_action,)) # np.array([main, lateral])
        if self.step_count>0 and self.step_count%5000==0: # decay the noise every 5000 steps
            self.sigma *= 0.95
        a = self.act_opt(state)
        a = addGaussianNoise(a, self.sigma)
        # The actor ends with tanh, so valid actions live in [-1, 1]; clip the
        # noisy action so the stored action matches that range.
        return np.clip(a, -1.0, 1.0)

    def act_opt(self, state): # single actor
        """Return the deterministic actor output for `state` as a numpy array."""
        with torch.no_grad():
            return self.actor(state)[0].detach().cpu().numpy()

    def setActor(self, path):
        """Replace the actor with the object stored at `path`.

        The actor optimizer and the target actor are rebuilt from the loaded
        network; otherwise further training would keep updating the discarded
        actor.
        """
        # NOTE(security): torch.load unpickles arbitrary objects; only load
        # checkpoints that come from a trusted source.
        self.actor = torch.load(path)
        self.actorTarget = cp.deepcopy(self.actor)
        self.optActor = torch.optim.Adam(self.actor.parameters(), lr = self.LR_ACTOR)
        self.optActorTarget = torch.optim.Adam(self.actorTarget.parameters(), lr = self.LR_ACTOR)

    def store(self, state, reward, action, done,  state_suivant):
        """Append the transition [state, reward, action, done, state_suivant] to the buffer."""
        self.buffer.add([state, reward, action, done, state_suivant])

    def updateTargetDDPG(self):
        """Polyak-average the online networks into their targets.

        For every parameter: target <- tau * online + (1 - tau) * target.
        """
        tau = self.tau
        # critic
        for param, target_param in zip(self.dqnCritic.parameters(), self.dqnTargetCritic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        # actor
        for param, target_param in zip(self.actor.parameters(), self.actorTarget.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def updateNetworks(self, state, reward, action, done, state_suivant):
        """Run one critic update on a single transition.

        The TD target is `reward` for a terminal transition and otherwise
        `reward + gamma * Q_target(s', actor_target(s'))`.
        """
        # The target is a constant: compute it without autograd so that no
        # gradient is accumulated in the target networks and the tensors kept
        # in the replay buffer are left untouched.
        with torch.no_grad():
            if done:
                y = torch.tensor(reward, dtype=torch.float32).view(1,-1)
            else:
                outActor_suivant = self.actorTarget(state_suivant) # pi_target(s')
                input_Q_suivant = torch.hstack((state_suivant, outActor_suivant))
                y = reward + self.gamma*self.dqnTargetCritic(input_Q_suivant)
        # Critic input: state and action side by side, cast to float32.
        input_Q = torch.hstack((state, action)).to(torch.float32)
        Q = self.dqnCritic(input_Q) # current estimate
        # Critic, one gradient step
        loss = self.f_loss(Q, y)
        self.optQ.zero_grad()
        loss.backward()
        self.optQ.step()

    def addReward(self, reward):
        """Record an episode score in the buffer and return the latest running mean."""
        self.buffer.addReward(reward)
        return self.buffer.getLastMean()

    def saveNet(self, path):
        """Save the actor object to `path` with `torch.save`."""
        torch.save(self.actor, path)

    def loadNet(self, path):
        """Load the actor from `path`; same behaviour as `setActor`."""
        self.setActor(path)

    def replay(self, decay):
        """Sample a mini-batch, update critic, actor and target networks.

        Does nothing until the buffer holds at least `start_learning`
        transitions.

        Args:
            decay: when truthy, multiply `explore` by `explore_decay` after the
                updates (as long as `explore` is above `explore_min`).
        """
        # Also require `batch_size` transitions: sampling more items than the
        # buffer holds raises ValueError in random.sample.
        if self.buffer.getLen() < max(self.start_learning, self.batch_size):
            return
        mini_batch = self.buffer.sampleState(self.batch_size)
        batch_state = []
        for state, reward, action, done, state_suivant in mini_batch:
            batch_state.append(state.squeeze(0))
            self.updateNetworks(state, reward, action, done, state_suivant)
        batch_state = torch.stack(batch_state)
        # Actor: maximise the critic's value of the actor's own actions,
        # i.e. minimise its negative mean over the sampled states.
        outActor = self.actor(batch_state)
        inputQ_actor_update = torch.hstack((batch_state, outActor))
        lossActor = -self.dqnCritic(inputQ_actor_update).mean()
        self.optActor.zero_grad()
        lossActor.backward()
        self.optActor.step()
        # Targets
        self.updateTargetDDPG()
        if decay and self.explore > self.explore_min:
            self.explore *= self.explore_decay






In [None]:
import torch
from random import sample
from torch.distributions import Normal

class DQN(torch.nn.Module):
    """Three-layer ReLU MLP with its own Adam optimizer and MSE loss.

    Maps an input of size `taille_state` to `taille_action` outputs.
    """

    def __init__(self, taille_state, taille_action, lr = 0.000025, hidden = 256):
        """
        Args:
            taille_state: number of input features.
            taille_action: number of outputs.
            lr: learning rate of the internal Adam optimizer.
            hidden: width of the two hidden layers.
        """
        super().__init__()
        self.taille_state = taille_state
        self.taille_action = taille_action
        self.lr = lr
        self.hidden = hidden
        # Layer widths follow `hidden` (the default keeps the former 256 units).
        self.net = torch.nn.Sequential(
                        torch.nn.Linear(self.taille_state, self.hidden),
                        torch.nn.ReLU(),
                        torch.nn.Linear(self.hidden, self.hidden),
                        torch.nn.ReLU(),
                        torch.nn.Linear(self.hidden, self.taille_action)
        )
        self.optim = torch.optim.Adam(self.parameters(), lr = self.lr)
        self.f_loss = torch.nn.MSELoss()

    def forward(self, x):
        """Return the network output for `x`."""
        return self.net(x)

    def fit(self, x, y, epoch=10):
        """Run `epoch` Adam steps minimising the MSE between `forward(x)` and `y`.

        `y` is treated as a constant: it is detached, so no gradient flows
        into whatever produced it.
        """
        target = y.detach()
        for _ in range(epoch):
            # Clear first so gradients left by an earlier backward() elsewhere
            # cannot leak into this step.
            self.optim.zero_grad()
            y_hat = self.forward(x)
            loss = self.f_loss(y_hat, target)
            loss.backward()
            self.optim.step()
        # Leave the gradients cleared on exit.
        self.optim.zero_grad()

    def updateParam(self, dqn):
        """
            For a target network: copy the parameters of another DQN.
        """
        self.load_state_dict(dqn.state_dict())

    def getNet(self):
        """Return the underlying `torch.nn.Sequential`."""
        return self.net

class NNActor(torch.nn.Module):
    """Three-layer ReLU MLP with a final tanh, so every output lies in [-1, 1]."""

    def __init__(self, taille_state, taille_action, lr = 0.00025, hidden = 256):
        """
        Args:
            taille_state: number of input features.
            taille_action: number of action components produced.
            lr: learning rate of the internal Adam optimizer.
            hidden: width of the two hidden layers.
        """
        super().__init__()
        self.taille_state = taille_state
        self.taille_action = taille_action
        self.lr = lr
        self.hidden = hidden
        # Layer widths follow `hidden` (the default keeps the former 256 units).
        self.net = torch.nn.Sequential(
                        torch.nn.Linear(self.taille_state, self.hidden),
                        torch.nn.ReLU(),
                        torch.nn.Linear(self.hidden, self.hidden),
                        torch.nn.ReLU(),
                        torch.nn.Linear(self.hidden, self.taille_action),
                        torch.nn.Tanh() # squashes actions to [-1, 1] (LunarLander range)
        )
        self.optim = torch.optim.Adam(self.parameters(), lr = self.lr)
        self.f_loss = torch.nn.MSELoss()

    def forward(self, x):
        """Return the action(s) for the input `x`."""
        return self.net(x)

    def getNet(self):
        """Return the underlying `torch.nn.Sequential`."""
        return self.net

class Buffer:
    """Fixed-capacity replay memory plus a running mean of the last 100 scores.

    `memoire` is used as a ring buffer: once it holds `taille` elements, each
    new element overwrites the oldest one in place, so adding is O(1). As a
    consequence `memoire` is not kept in insertion order once it is full.
    """

    def __init__(self, taille_max):
        """
        Args:
            taille_max: maximum number of elements kept in `memoire`.

        Raises:
            ValueError: if `taille_max` is not strictly positive.
        """
        if taille_max <= 0:
            raise ValueError("Buffer capacity must be strictly positive")
        self.taille = taille_max
        self.memoire = []
        self._oldest = 0    # index of the oldest element once the buffer is full
        self.last100 = []   # most recent scores, at most 100 of them
        self.mean = []      # mean of `last100` after each `addReward` call

    def getLen(self):
        """Return the number of stored elements."""
        return len(self.memoire)

    def getLastMean(self):
        """Return the most recent running mean.

        Raises:
            IndexError: if `addReward` has never been called.
        """
        return self.mean[-1]

    def add(self, element):
        """Store `element`, overwriting the oldest one when the buffer is full."""
        if len(self.memoire) < self.taille:
            self.memoire.append(element)
            return
        self.memoire[self._oldest] = element
        self._oldest = (self._oldest + 1) % self.taille

    def sampleState(self, taille_sample):
        """Return `taille_sample` distinct stored elements chosen at random.

        Raises:
            ValueError: if `taille_sample` exceeds the number of stored elements.
        """
        return sample(self.memoire, taille_sample)

    def addReward(self, reward):
        """Record `reward` and append the mean of the last (up to 100) rewards to `mean`."""
        if len(self.last100) >= 100:
            del self.last100[0]
        self.last100.append(reward)
        self.mean.append(sum(self.last100)/len(self.last100))
    
        
    
def addGaussianNoise(action, sigma=0.1):
    """Return one sample of a Normal distribution centred on `action`.

    Args:
        action: array-like mean of the distribution (converted with `torch.tensor`).
        sigma: scale (standard deviation) of the noise.

    Returns:
        The sampled values as a numpy array.
    """
    centre = torch.tensor(action)
    noisy = Normal(loc=centre, scale=sigma).sample()
    return noisy.detach().cpu().numpy()

In [None]:
# --- DDPG training on continuous LunarLander --------------------------------
# Tunable settings of the run.
TAILLE_BATCH = 64
gamma = 0.99
nb_episode = 500
tau = 0.001
MAX_FRAMES = 1200           # hard cap on the number of steps per episode
EXPLORE_DECAY_START = 70    # first episode index at which `explore` decays
# NOTE: despite its name this file receives the actor as it is at the END of
# training, not the best-scoring one.
path_best = 'best.pt'

env = gym.make("LunarLander-v2",continuous = True)
agent = AgentDDPG(env, gamma, TAILLE_BATCH, tau)
score_max = -math.inf
try:
    # One line per finished episode: "<episode score> <mean of last 100 scores>".
    with open('score', 'w') as f:
        for i in range(nb_episode):
            state, _ = env.reset()
            state = torch.as_tensor(state).view(1,-1) # to tensor, shape (1, state_dim)
            cum_reward = 0
            for frame in range(MAX_FRAMES):
                action = agent.act(state)
                # Gymnasium returns two end-of-episode flags: `terminated`
                # (true terminal state) and `truncated` (e.g. time limit).
                state_suivant, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                state_suivant = torch.as_tensor(state_suivant).view(1,-1)
                # store the action as a (1, action_dim) tensor
                action = torch.as_tensor(action).view(1,-1)

                # Only a real terminal state stops bootstrapping in the TD
                # target; a truncated episode still has a future value.
                agent.store(state, reward, action, terminated, state_suivant)
                agent.replay(i >= EXPLORE_DECAY_START)
                cum_reward += reward
                if done:
                    if i > 0 and i % 100 == 0:
                        agent.saveNet(f"net{i}.pt")
                    score_max = max(score_max, cum_reward)
                    last_mean = agent.addReward(cum_reward)
                    print(f"Episode {i}/{nb_episode}, fini à {frame} frame, explore {agent.explore},score: {cum_reward}, last 100: {last_mean}")
                    f.write(f"{cum_reward} {last_mean}\n")
                    # Push each line out immediately so scores survive a crash.
                    f.flush()
                    break
                # Independent copy of the next state; `torch.tensor(tensor)`
                # did the same but emits a UserWarning.
                state = state_suivant.detach().clone()
        # Save the trained actor BEFORE the log file is closed: a failure while
        # closing the log must not cost the model.
        torch.save(agent.actor, path_best)
finally:
    env.close()
print("max:", score_max)

  state = torch.tensor(state_suivant)


Episode 0/500, fini à 126 frame, explore 1.0,score: -404.68077915795556, last 100: -404.68077915795556
Episode 1/500, fini à 157 frame, explore 1.0,score: -319.633422721074, last 100: -362.15710093951475
Episode 2/500, fini à 106 frame, explore 1.0,score: -221.76442104400678, last 100: -315.3595409743454
Episode 3/500, fini à 142 frame, explore 1.0,score: -157.39308288991447, last 100: -275.8679264532377
Episode 4/500, fini à 84 frame, explore 1.0,score: -253.77638018663345, last 100: -271.4496171999169
Episode 5/500, fini à 145 frame, explore 1.0,score: -233.4077216269602, last 100: -265.10930127109077
Episode 6/500, fini à 70 frame, explore 1.0,score: -63.77792306601084, last 100: -236.3476758132222
Episode 7/500, fini à 98 frame, explore 1.0,score: -344.8074108219017, last 100: -249.90514268930713
Episode 8/500, fini à 99 frame, explore 1.0,score: -174.28545723328145, last 100: -241.50295541641538
Episode 9/500, fini à 72 frame, explore 1.0,score: -52.97761264703472, last 100: -222.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Episode 499/500, fini à 159 frame, explore 0.00998645168764533,score: 230.5586429304826, last 100: 220.3600349639212
OSError: [Errno 107] Transport endpoint is not connected

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-17-528f46f51569>", line 37, in <cell line: 1>
    f.close()
OSError: [Errno 107] Transport endpoint is not connected

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'OSError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call