<a href="https://colab.research.google.com/github/martinkopecky98/Checkpoint-1/blob/master/DQN_LunarLander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gym
!pip install Box2D
!pip install torch
!pip install numpy

Collecting Box2D
  Downloading https://files.pythonhosted.org/packages/a9/0b/d48d42dd9e19ce83a3fb4eee074e785b6c6ea612a2244dc2ef69427d338b/Box2D-2.3.10-cp36-cp36m-manylinux1_x86_64.whl (1.3MB)
     |████████████████████████████████| 1.3MB 4.3MB/s 
Installing collected packages: Box2D
Successfully installed Box2D-2.3.10


In [None]:

import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40)
import numpy as np
import torch
import copy
import numpy as np
from random import randrange
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import glob
import io
import base64


In [None]:

class ExperienceReplay():
    """Fixed-capacity ring buffer of transitions for off-policy learning.

    Each transition is ``(state, action, reward, next_state, terminal)``.
    Once ``size`` transitions have been stored, new ones overwrite the
    oldest slots.

    Attributes:
        index: Total number of transitions ever stored (not wrapped).
        size: Capacity of the buffer.
        state_dim: Shape of a single state, as passed by the caller.
    """

    def __init__(self, size, state_dim):
        """Allocate zero-filled storage.

        Args:
            size: Maximum number of transitions kept.
            state_dim: Shape of one state as a sequence of ints,
                e.g. ``(8,)``.
        """
        self.index = 0
        self.size = size
        self.state_dim = state_dim
        # tuple() lets callers pass a list or torch.Size as well as a tuple.
        dim = (size, ) + tuple(state_dim)

        self.states = torch.zeros(dim)
        # int64 rather than int8: int8 silently wraps for action ids > 127,
        # and int64 is the dtype torch expects for index tensors.
        self.actions = torch.zeros((self.size), dtype=torch.int64)
        self.rewards = torch.zeros(self.size)
        self.states_ = torch.zeros(dim)
        self.terminals = torch.zeros(self.size)

    def store(self, state, action, reward, state_, terminal):
        """Write one transition into the next slot, wrapping when full.

        Args:
            state: State before the action.
            action: Integer action id.
            reward: Scalar reward.
            state_: State after the action.
            terminal: Truthy if the episode ended with this transition;
                stored as 1.0 / 0.0.
        """
        index = self.index % self.size

        self.states[index] = state
        self.actions[index] = action
        self.rewards[index] = reward
        self.states_[index] = state_
        self.terminals[index] = int(terminal)

        self.index += 1

    def sample(self, batch_size):
        """Return a random batch drawn with replacement from the filled slots.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, states_, terminal)`` of
            tensors whose first dimension is ``batch_size``.

        Raises:
            ValueError: If nothing has been stored yet.
        """
        # Only the slots written so far are valid to draw from.
        length = min(self.size, self.index)
        if length == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        batch = np.random.choice(length, batch_size)
        states = self.states[batch]
        actions = self.actions[batch]
        rewards = self.rewards[batch]
        states_ = self.states_[batch]
        terminal = self.terminals[batch]

        return states, actions, rewards, states_, terminal

class AgentDQN:
    """DQN agent with an online network and a periodically synced target copy.

    Actions are chosen epsilon-greedily. ``learn`` performs one optimisation
    step on a batch sampled from the replay buffer and linearly decays
    epsilon down to ``epsilon_min``.
    """

    def __init__(self, gamma, actions_count, model, experience_replay, lr,
                 update_steps = 1000, batch_size = 64,
                 epsilon=1.0, epsilon_dec = 1e-4, epsilon_min = 0.01):
        """Set up networks, optimiser and exploration schedule.

        Args:
            gamma: Discount factor applied to the bootstrapped next value.
            actions_count: Number of discrete actions.
            model: ``nn.Module`` mapping a batch of states to per-action
                values; it is moved to the selected device in place.
            experience_replay: Buffer exposing ``index``, ``store(...)`` and
                ``sample(batch_size)``.
            lr: Learning rate for Adam.
            update_steps: Number of ``learn`` steps between target syncs.
            batch_size: Transitions sampled per ``learn`` step.
            epsilon: Initial probability of taking a random action.
            epsilon_dec: Amount subtracted from epsilon per ``learn`` step.
            epsilon_min: Lower bound for epsilon.
        """
        self.gamma = gamma
        self.actions_count = actions_count
        self.online_model = model
        self.target_model = copy.deepcopy(model)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('device: ', self.device)
        self.online_model.to(self.device)
        self.target_model.to(self.device)
        # The target network is only ever overwritten, never trained.
        for param in self.target_model.parameters():
            param.requires_grad = False
        self.mse = nn.MSELoss()
        self.experience_replay = experience_replay
        self.optimizer = optim.Adam(self.online_model.parameters(), lr=lr)
        self.update_steps = update_steps
        self.current_steps = 0
        self.batch_size = batch_size
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec

    def choose_action(self, state):
        """Pick an action epsilon-greedily for a single state.

        Args:
            state: Tensor holding one state (no batch dimension).

        Returns:
            Integer action id: uniformly random with probability
            ``epsilon``, otherwise the argmax of the online network output.
        """
        # Explore with probability epsilon. The comparison was previously
        # inverted, which made the agent greedy while epsilon was high.
        if np.random.random() < self.epsilon:
            return randrange(self.actions_count)

        state = state.unsqueeze(0).to(self.device).float()
        with torch.no_grad():
            q_values = self.online_model(state)
        return torch.argmax(q_values).item()

    def store(self, state, action, reward, state_, terminal):
        """Forward one transition to the replay buffer."""
        self.experience_replay.store(state, action, reward, state_, terminal)

    def learn(self):
        """Run one training step; no-op until 100 transitions are stored.

        The target equals the current prediction everywhere except at the
        action actually taken, where it is
        ``reward + gamma * max(target_net(next_state)) * (1 - terminal)``.
        After the step the target network is synced every ``update_steps``
        calls and epsilon is decayed.
        """
        if self.experience_replay.index < 100:
            return

        states, actions, rewards, states_, terminals = self.experience_replay.sample(self.batch_size)
        states = states.to(self.device)
        states_ = states_.to(self.device)
        actions = actions.to(self.device).long()
        rewards = rewards.to(self.device)
        terminals = terminals.to(self.device)

        self.optimizer.zero_grad()
        q_y = self.online_model(states)

        with torch.no_grad():
            q_next = torch.max(self.target_model(states_), dim=1)[0]
            td_target = rewards + self.gamma * q_next * (1 - terminals)
            # clone() is essential: detach() alone shares storage with q_y,
            # so writing the targets in place would also overwrite the
            # predictions and make the loss identically zero.
            q_target = q_y.detach().clone()
            rows = torch.arange(q_target.shape[0], device=self.device)
            q_target[rows, actions] = td_target

        loss = self.mse(q_y, q_target)
        loss.backward()
        self.optimizer.step()

        self.current_steps += 1
        if self.current_steps == self.update_steps:
            self.target_model.load_state_dict(self.online_model.state_dict())
            self.current_steps = 0

        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)

import torch.nn.functional as F

class Net(nn.Module):
    """Three-layer fully connected network producing one value per action.

    Two ReLU hidden layers of equal width are followed by a linear output
    layer. The defaults reproduce the original fixed 8 -> 8 -> 8 -> 4
    architecture.
    """

    def __init__(self, state_size=8, hidden_size=8, actions_count=4):
        """Create the layers.

        Args:
            state_size: Number of input features per state.
            hidden_size: Width of both hidden layers.
            actions_count: Number of outputs (one per action).
        """
        super(Net, self).__init__()

        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, actions_count)

    def forward(self, x):
        """Map a batch of states to per-action values (no output activation)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Environment and problem dimensions: 4 discrete actions, 8-dim state vector.
env = gym.make('LunarLander-v2')
actions = 4
state_dim = (8, )

# Alternative settings left over in the notebook. NOTE(review): the
# environment id below does not match these dimensions; presumably it was
# meant for a different environment - confirm before enabling.
# env = gym.make('LunarLander-v2')
# actions = 3
# state_dim = (2, )


games_count = 500
scores = []

experience_replay = ExperienceReplay(10000, state_dim)
agent = AgentDQN(0.99, actions, Net(), experience_replay, 0.001)

# try/finally so the environment is released even when training is
# interrupted from the keyboard.
try:
    for i in range(0, games_count):

        score = 0
        terminal = False
        state = env.reset()
        # float32 matches the replay buffer storage and the network weights.
        state = torch.from_numpy(state).float()

        while not terminal:
            action = agent.choose_action(state)
            state_, reward, terminal, _ = env.step(action)
            state_ = torch.from_numpy(state_).float()
            agent.store(state, action, reward, state_, terminal)

            # One learning step per environment step.
            agent.learn()
            state = state_
            score += reward

        scores.append(score)

        # Report every 5th episode with the mean over the last 100 episodes.
        if i % 5 == 0:
            print('episode: ', i, '\t\tscore: ', score, '\t\taverage score:' , np.average(scores[-100:]), '\t\tepsilon: ', agent.epsilon)
finally:
    env.close()


device:  cpu
episode:  0 		score:  -156.8139133509145 		average score: -156.8139133509145 		epsilon:  1.0
episode:  5 		score:  -158.04975927851325 		average score: -137.1479602037306 		epsilon:  0.9665000000000037
episode:  10 		score:  -155.62877738714585 		average score: -136.22209597337903 		epsilon:  0.927400000000008
episode:  15 		score:  -100.62229401649964 		average score: -133.62795004489533 		epsilon:  0.8889000000000122
episode:  20 		score:  -139.07277212711907 		average score: -140.52502157930397 		epsilon:  0.8515000000000164
episode:  25 		score:  -115.6785673148405 		average score: -139.6296186052805 		epsilon:  0.8120000000000207
episode:  30 		score:  -114.1737207201602 		average score: -142.14894925888353 		epsilon:  0.773400000000025
episode:  35 		score:  -127.71987916706266 		average score: -141.2273965676333 		epsilon:  0.7343000000000293
episode:  40 		score:  -110.14686592881245 		average score: -140.9771847881115 		epsilon:  0.6953000000000336
episode:  45 		

KeyboardInterrupt: ignored