In [1]:
import gym
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import random

In [2]:
class NN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers followed by a linear head.

    Given a state tensor whose last dimension is ``state_dim``, ``forward``
    returns a tensor whose last dimension is ``action_n`` (one value per action).
    """

    def __init__(self, state_dim, action_n):
        super().__init__()
        hidden_dim = 32  # width shared by both hidden layers
        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, action_n)
        self.relu = nn.ReLU()

    def forward(self, state):
        """Return the Q-values predicted for ``state``."""
        first_hidden = self.relu(self.linear1(state))
        second_hidden = self.relu(self.linear2(first_hidden))
        # No activation on the head: Q-values are unbounded real numbers.
        return self.linear3(second_hidden)

In [3]:
class DQN():
    """Deep Q-Network agent with an epsilon-greedy policy and a replay memory.

    ``kind`` selects how the bootstrap value of the next state is computed:

    * ``'default'`` - from the online network itself (no target network).
    * ``'hard'``    - from a frozen copy of the network that is overwritten
      with the online weights every ``target_update_period`` gradient steps.
    * ``'soft'``    - from a frozen copy that follows the online weights by
      Polyak averaging with rate ``tau`` after every gradient step.
    * ``'double'``  - Double DQN: the online network picks the greedy next
      action and the frozen copy evaluates it; the copy is refreshed like in
      ``'hard'``.

    Args:
        action_n: number of discrete actions.
        model: ``torch.nn.Module`` mapping a batch of states to Q-values of
            shape ``(batch, action_n)``.
        batch_size: number of transitions sampled per gradient step.
        gamma: discount factor.
        lr: Adam learning rate.
        trajectory_n: epsilon is lowered by ``1 / trajectory_n`` after every
            gradient step (not once per trajectory) until it reaches 0.
        kind: one of ``DQN.KINDS``.
        target_update_period: gradient steps between hard target syncs
            (used by ``'hard'`` and ``'double'``).
        tau: Polyak averaging rate (used by ``'soft'``).

    Raises:
        ValueError: if ``kind`` is not one of ``DQN.KINDS``.
    """

    KINDS = ('default', 'hard', 'soft', 'double')

    def __init__(self, action_n, model, batch_size, gamma, lr, trajectory_n, kind='default',
                 target_update_period=100, tau=0.01):
        if kind not in self.KINDS:
            raise ValueError(f"kind must be one of {self.KINDS}, got {kind!r}")

        self.kind = kind
        self.action_n = action_n
        self.model = model
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.target_update_period = target_update_period
        self.tau = tau

        self.epsilon = 1
        self.epsilon_decrease = 1 / trajectory_n
        self.memory = []
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.fit_steps = 0  # gradient steps taken so far; drives the hard sync

        if self.kind == 'default':
            # No separate target network: bootstrap from the online network.
            self.target_model = self.model
        else:
            import copy  # local import: the notebook's import cell does not provide it

            # "Freezing" the parameters = keeping an independent copy that the
            # optimizer never sees and that never receives gradients.
            self.target_model = copy.deepcopy(self.model)
            for parameter in self.target_model.parameters():
                parameter.requires_grad_(False)

    def get_action(self, state):
        """Sample an action epsilon-greedily with respect to the online network."""
        with torch.no_grad():
            qvalues = self.model(torch.as_tensor(state, dtype=torch.float32)).numpy()
        # Every action gets epsilon / action_n; the greedy one gets the rest.
        prob = np.full(self.action_n, self.epsilon / self.action_n)
        prob[np.argmax(qvalues)] += 1 - self.epsilon
        return np.random.choice(self.action_n, p=prob)

    def get_batch(self):
        """Sample ``batch_size`` transitions uniformly from the replay memory.

        Returns:
            ``(states, actions, rewards, dones, next_states)`` where ``states``
            and ``next_states`` are float32 tensors and the rest are lists.
        """
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)
        # Stack into one ndarray first: building a tensor directly from a list
        # of ndarrays is very slow and triggers a PyTorch warning.
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
        return states, list(actions), list(rewards), list(dones), next_states

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, once enough are stored, take one gradient step."""
        self.memory.append([state, action, reward, done, next_state])

        if len(self.memory) <= self.batch_size:
            return

        states, actions, rewards, dones, next_states = self.get_batch()
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64)).unsqueeze(1)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        not_done = 1.0 - torch.as_tensor(np.asarray(dones, dtype=np.float32))

        qvalues = self.model(states)
        taken_qvalues = qvalues.gather(1, actions).squeeze(1)

        # Targets are constants for the optimizer, so build them without autograd.
        with torch.no_grad():
            target_next_qvalues = self.target_model(next_states)
            if self.kind == 'double':
                greedy_actions = self.model(next_states).argmax(dim=1, keepdim=True)
                next_values = target_next_qvalues.gather(1, greedy_actions).squeeze(1)
            else:
                next_values = target_next_qvalues.max(dim=1).values
            targets = rewards + not_done * self.gamma * next_values

        # Same scaling as a mean over the full (batch, action_n) matrix in
        # which only the entries of the taken actions are non-zero.
        loss = torch.sum((targets - taken_qvalues) ** 2) / qvalues.numel()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self._update_target()
        self.epsilon = max(0, self.epsilon - self.epsilon_decrease)

    def _update_target(self):
        """Move the frozen target network towards the online network."""
        if self.kind == 'default':
            return

        self.fit_steps += 1
        if self.kind == 'soft':
            # Polyak averaging over parameters only; module buffers (if any)
            # are not blended.
            with torch.no_grad():
                for target_parameter, parameter in zip(self.target_model.parameters(),
                                                       self.model.parameters()):
                    target_parameter.mul_(1 - self.tau).add_(parameter, alpha=self.tau)
        elif self.fit_steps % self.target_update_period == 0:
            self.target_model.load_state_dict(self.model.state_dict())

In [4]:
# Create the Acrobot environment and read the sizes the Q-network is built from.
env = gym.make('Acrobot-v1')
action_n = env.action_space.n  # size of the action space
state_dim = env.observation_space.shape[0]  # first dimension of the observation shape

  deprecation(
  deprecation(
