Создадим кастомное окружение по образу и подобию тех, что лежат в ```gym/envs```

# 1. Проверка работоспособности нашего окружения

In [None]:
import numpy as np
from math import *
import gym
from IPython.display import clear_output
from time import time

In [None]:
from aircombat2d import Missile, Target, LineOfSight, AirCombat2D

In [None]:
def target_autopilot(*args):
    t, = args
#     if t < 3:
#         beta = 0
#     elif t < 5:
#         beta = 10
#     elif t < 7:
#         beta = -10
#     else:
#         beta = 0

    beta = -20

    return np.radians(beta)

In [None]:
opts = {'los': {
            'r0': 10e3, 
            'chi0': np.radians(0)
        }, 
        'missile':{
            'initial_state': {
                'vel0': 400,
                'psi0': np.radians(180)
            },
            'energetics': {
                't_act': 5,
                'omega_act': 45
            },
            'aerodynamics': {
                'Ms': np.array([0.1, 0.3, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1., 1.05, 
                                1.1, 1.15, 1.5, 1.9, 2.3, 2.7, 3.1, 3.5, 3.9, 4.3, 4.7]),
                'Cx0_arr': np.array([0.2958, 0.2998, 0.3078, 0.3198, 0.3234, 0.3273, 0.3314, 
                                     0.3358, 0.3404, 0.3453, 0.3504, 0.3558, 0.3614, 0.4426, 
                                     0.3598, 0.2885, 0.2376, 0.2009, 0.1741, 0.1533, 0.1377, 0.1255]),
                'Cya_arr': np.array([0.0571, 0.0577, 0.0591, 0.0615, 0.0623, 0.0633, 0.0643, 
                                     0.0655, 0.0671, 0.0708, 0.0794, 0.0814, 0.0802, 0.0664,
                                     0.0543, 0.0465, 0.042, 0.0387, 0.0358, 0.0331, 0.0312, 0.0291]),
                'Cyb_arr': np.array([0.0571, 0.0577, 0.0591, 0.0615, 0.0623, 0.0633, 0.0643, 
                                     0.0655, 0.0671, 0.0708, 0.0794, 0.0814, 0.0802, 0.0664, 
                                     0.0543, 0.0465, 0.042, 0.0387, 0.0358, 0.0331, 0.0312, 0.0291])
            }
        }, 
        'target': {
            'vel0': 400,
            'psi0': np.radians(0)            
        }}

In [None]:
env = AirCombat2D(opts,altitude=15e3,target_autopilot=target_autopilot)
print("Action space:",env.action_space)
print("Observation space:", env.observation_space)

In [None]:
env = gym.wrappers.TimeLimit(AirCombat2D(opts,altitude=15e3,target_autopilot=target_autopilot).unwrapped, max_episode_steps=200)
# проводим инициализацию и запоминаем начальное состояние
s = env.reset()

done = False
it = 1
start = time()

while not done:
    it += 1
    timeit = time()
    
    # выполняем действие, получаем s, r, done
    s, r, done, _ = env.step(-1)
    
    # визуализируем окружение
    env.render()

#     print(f"Время выполнения {it} шага: {round(time() - timeit, 3)} с", end="\r")

clear_output(True)
print(f"Общее время: {round(time() - start, 3)} с;\nКол-во итераций: {it}.")

env.close()
if s[0] < 10:
    print("Успех!")
else:
    raise NotImplementedError("""
    Исправьте функцию выбора действия!""")

# 2. Создание DQN сети

In [1]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import deque
from IPython.display import clear_output
import numpy as np
import matplotlib.pyplot as plt

# если видеокарта доступна, то будем ее использовать
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda:0


## 2.1 Определяем класс Q-Network

In [2]:
class QNetwork(nn.Module):
    def __init__(self, n_hid_n):
        """
        определение сети
        """
        super().__init__()
        
        self.fc1 = nn.Linear(8, n_hid_n)
        self.fc2 = nn.Linear(n_hid_n, n_hid_n)
        self.fc3 = nn.Linear(n_hid_n, 3)

    def forward(self, x):
        """
        определение графа вычислений
        :param x: вход
        :return: 
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def sample_action(self, obs, epsilon):
        """
        сэмплирование действия
        :param obs: 
        :param epsilon: 
        :return: 
        """
        out = self.forward(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0, 2)
        else:
            return out.argmax().item()

## 2.2 Реализуем методы класса ReplayBuffer

In [3]:
class ReplayBuffer():

    def __init__(self, max_size):
        """
        создаем структуру для хранения данных
        """
        self.dq = deque(maxlen=max_size)
        
    def put(self, transition):
        """
        помещаем данные в replay buffer
        :param transition: (s, a, r, next_s, done_mask)
        :return:
        """
        self.dq.append(transition)

    def sample(self, n):
        """
        сэмплируем батч заданного размера
        :param n: размер мини-батча
        :return:
        """
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

        # сэмплируем случайный батч и заполняем s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst
        for _ in range(n):
            index = random.randint(0, len(self.dq)-1)
            s_lst.append(self.dq[index][0])
            a_lst.append([self.dq[index][1]]) 
            r_lst.append([self.dq[index][2]]) 
            s_prime_lst.append(self.dq[index][3]) 
            done_mask_lst.append([self.dq[index][4]])

        return torch.tensor(np.array(s_lst), dtype=torch.float), torch.tensor(np.array(a_lst), dtype=torch.int64), \
               torch.tensor(np.array(r_lst)), torch.tensor(np.array(s_prime_lst), dtype=torch.float), \
               torch.tensor(np.array(done_mask_lst))

    def __len__(self):
        """
        возвращает размер replay buffer'а
        :return: len(replay_buffer)
        """
        return len(self.dq)

## 2.3 Реализуем функцию тренировки нашей сети

In [4]:
def train(q, q_target, replay_buffer, optimizer, batch_size, gamma, updates_number=10):
    """
    тренируем нашу архитектуру
    :param q: policy сеть
    :param q_target: target сеть
    :param replay_buffer:
    :param optimizer:
    :param batch_size: размер мини-батча
    :param gamma: дисконтирующий множитель
    :param updates_number: количество обновлений, которые необходимо выполнить
    :return:
    """
    for i in range(updates_number):
        # сэмплируем мини-батч из replay buffer'а
        s, a, r, s_prime, done_mask = replay_buffer.sample(batch_size)

        # получаем полезность, для выбранного действия q сети
        q_out = q(s)
        q_a = q_out.gather(1, a)

        # получаем значение max_q target сети и считаем значение target
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask

        # определяем loss функцию, для q!
        loss = F.smooth_l1_loss(q_a, target.detach())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

## 2.4 Реализуем основную функцию, осуществляющую моделирование и обучение 

In [5]:
def run(q, 
        q_target,
        learning_rate, 
        gamma, 
        buffer_max_size, 
        batch_size, 
        target_update_interval,
        replay_buffer_start_size, 
        print_interval=20, 
        n_episodes=2000, 
        n_env_steps=1000, 
        render=False, 
        reward_history_plot=False,
        epsilon0=0.1):

    # копируем веса q в target_q
    q_target.load_state_dict(q.state_dict())

    # создаем replay buffer
    replay_buffer = ReplayBuffer(max_size=buffer_max_size)

    score = 0.0
    
    infos = []

    # инициализируем оптимизатор, полученным lr
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    for n_epi in range(n_episodes):   
        
        # создаем окружение
        env = AirCombat2D(opts, altitude=15e3, target_autopilot=target_autopilot)

        # постепенно изменяем eps с 8% до 1%
        epsilon = max(0.01, 0.1 - 0.01 * (n_epi / 200))
         
        s = env.reset()
        
        # выполянем n_env_steps шагов в окружении и сохраняем, полученные данные
        for t in range(n_env_steps):

            # получаем действие, используя сеть q
            a = q.sample_action(torch.from_numpy(s).float(), epsilon)

            # выполняем действие в окружении
            s_prime, r, done, info = env.step(a)
            
            # добавляем данные в replay buffer
            done_mask = 0.0 if done else 1.0
            
            # сжимаем вознаграждения и добавляем в replay buffer
            replay_buffer.put((s, a, r / 100.0, s_prime, done_mask))

            s = s_prime

            score += r
            
            if render:
                env.render()

            if done:
                infos.append(info)
                break

        if len(replay_buffer) > replay_buffer_start_size:
            train(q, q_target, replay_buffer, optimizer, batch_size, gamma)

        if n_epi % target_update_interval == 0 and n_epi != 0:
            q_target.load_state_dict(q.state_dict())

        if n_epi % print_interval == 0 and n_epi != 0:
#             clear_output(True)
            c = collections.Counter(infos)
            print("# of episode :{}, avg score : {:.1f}, buffer size : {}, epsilon : {:.1f}%, {}".format(
                n_epi, score / print_interval, len(replay_buffer), epsilon * 100, c))
            infos = []
            score = 0.0
    env.close()
    
    return q

# 3. Определяем начальные параметры среды и подбираем гиперпараметры для обучения сети

In [6]:
from aircombat2d import Missile, Target, LineOfSight, AirCombat2D

In [7]:
def target_autopilot(*args):
    t, = args
    beta = 0
    return np.radians(beta)

In [8]:
opts = {'los': {
            'r0': 10e3, 
            'chi0': np.radians(0)
        }, 
        'missile':{
            'initial_state': {
                'vel0': 400,
                'psi0': np.radians(180)
            },
            'energetics': {
                't_act': 5,
                'omega_act': 45
            },
            'aerodynamics': {
                'Ms': np.array([0.1, 0.3, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1., 1.05, 
                                1.1, 1.15, 1.5, 1.9, 2.3, 2.7, 3.1, 3.5, 3.9, 4.3, 4.7]),
                'Cx0_arr': np.array([0.2958, 0.2998, 0.3078, 0.3198, 0.3234, 0.3273, 0.3314, 
                                     0.3358, 0.3404, 0.3453, 0.3504, 0.3558, 0.3614, 0.4426, 
                                     0.3598, 0.2885, 0.2376, 0.2009, 0.1741, 0.1533, 0.1377, 0.1255]) * 1.5,
                'Cya_arr': np.array([0.0571, 0.0577, 0.0591, 0.0615, 0.0623, 0.0633, 0.0643, 
                                     0.0655, 0.0671, 0.0708, 0.0794, 0.0814, 0.0802, 0.0664,
                                     0.0543, 0.0465, 0.042, 0.0387, 0.0358, 0.0331, 0.0312, 0.0291]),
                'Cyb_arr': np.array([0.0571, 0.0577, 0.0591, 0.0615, 0.0623, 0.0633, 0.0643, 
                                     0.0655, 0.0671, 0.0708, 0.0794, 0.0814, 0.0802, 0.0664, 
                                     0.0543, 0.0465, 0.042, 0.0387, 0.0358, 0.0331, 0.0312, 0.0291])
            }
        }, 
        'target': {
            'vel0': 400,
            'psi0': np.radians(0)            
        }}

In [16]:
q = QNetwork(1024)
q_target = q
q_new = run(q,
            q_target,
            learning_rate=0.01,
            gamma=0.98,
            buffer_max_size=500000,
            batch_size=16,
            target_update_interval=10,
            replay_buffer_start_size=20000,
            print_interval=10,
            n_episodes=3000,
            n_env_steps=500)

# of episode :10, avg score : 31.8, buffer size : 816, epsilon : 10.0%, Counter({'out of range': 11})
# of episode :20, avg score : 28.9, buffer size : 1559, epsilon : 9.9%, Counter({'out of range': 10})
# of episode :30, avg score : 28.5, buffer size : 2296, epsilon : 9.8%, Counter({'out of range': 10})
# of episode :40, avg score : 28.8, buffer size : 3036, epsilon : 9.8%, Counter({'out of range': 10})
# of episode :50, avg score : 28.5, buffer size : 3772, epsilon : 9.8%, Counter({'out of range': 10})
# of episode :60, avg score : 28.4, buffer size : 4507, epsilon : 9.7%, Counter({'out of range': 10})
# of episode :70, avg score : 28.7, buffer size : 5245, epsilon : 9.7%, Counter({'out of range': 10})
# of episode :80, avg score : 28.4, buffer size : 5981, epsilon : 9.6%, Counter({'out of range': 10})
# of episode :90, avg score : 29.1, buffer size : 6723, epsilon : 9.6%, Counter({'out of range': 10})
# of episode :100, avg score : 27.8, buffer size : 7452, epsilon : 9.5%, Counter({

# of episode :780, avg score : 45.8, buffer size : 72603, epsilon : 6.1%, Counter({'out of range': 9, 'hit': 1})
# of episode :790, avg score : 46.9, buffer size : 73532, epsilon : 6.1%, Counter({'out of range': 10})
# of episode :800, avg score : 84.3, buffer size : 74760, epsilon : 6.0%, Counter({'out of range': 10})
# of episode :810, avg score : 54.2, buffer size : 75863, epsilon : 6.0%, Counter({'out of range': 10})
# of episode :820, avg score : 47.6, buffer size : 76824, epsilon : 5.9%, Counter({'out of range': 10})
# of episode :830, avg score : 54.0, buffer size : 77811, epsilon : 5.9%, Counter({'out of range': 9, 'hit': 1})
# of episode :840, avg score : 39.2, buffer size : 78649, epsilon : 5.8%, Counter({'out of range': 10})
# of episode :850, avg score : 45.7, buffer size : 79575, epsilon : 5.8%, Counter({'out of range': 10})
# of episode :860, avg score : 53.8, buffer size : 80529, epsilon : 5.7%, Counter({'out of range': 9, 'hit': 1})
# of episode :870, avg score : 42.9, 

# of episode :1540, avg score : 38.7, buffer size : 147653, epsilon : 2.3%, Counter({'out of range': 10})
# of episode :1550, avg score : 82.5, buffer size : 149064, epsilon : 2.3%, Counter({'out of range': 10})
# of episode :1560, avg score : 34.7, buffer size : 149878, epsilon : 2.2%, Counter({'out of range': 10})
# of episode :1570, avg score : 61.8, buffer size : 151115, epsilon : 2.2%, Counter({'out of range': 10})
# of episode :1580, avg score : 70.6, buffer size : 152435, epsilon : 2.1%, Counter({'out of range': 10})
# of episode :1590, avg score : 44.5, buffer size : 153348, epsilon : 2.1%, Counter({'out of range': 10})
# of episode :1600, avg score : 66.0, buffer size : 154518, epsilon : 2.0%, Counter({'out of range': 10})
# of episode :1610, avg score : 40.1, buffer size : 155395, epsilon : 2.0%, Counter({'out of range': 10})
# of episode :1620, avg score : 36.1, buffer size : 156213, epsilon : 1.9%, Counter({'out of range': 10})
# of episode :1630, avg score : 104.2, buffer 

# of episode :2270, avg score : 72.7, buffer size : 234948, epsilon : 1.0%, Counter({'out of range': 7, 'hit': 3})
# of episode :2280, avg score : 69.1, buffer size : 235992, epsilon : 1.0%, Counter({'out of range': 9, 'hit': 1})
# of episode :2290, avg score : 86.3, buffer size : 237122, epsilon : 1.0%, Counter({'hit': 5, 'out of range': 5})
# of episode :2300, avg score : 85.6, buffer size : 238253, epsilon : 1.0%, Counter({'out of range': 5, 'hit': 5})
# of episode :2310, avg score : 95.0, buffer size : 239463, epsilon : 1.0%, Counter({'out of range': 6, 'hit': 4})
# of episode :2320, avg score : 85.6, buffer size : 240595, epsilon : 1.0%, Counter({'out of range': 5, 'hit': 5})
# of episode :2330, avg score : 104.7, buffer size : 241886, epsilon : 1.0%, Counter({'out of range': 6, 'hit': 4})
# of episode :2340, avg score : 59.7, buffer size : 242850, epsilon : 1.0%, Counter({'out of range': 9, 'hit': 1})
# of episode :2350, avg score : 70.2, buffer size : 243896, epsilon : 1.0%, Cou

# of episode :2990, avg score : 109.4, buffer size : 316979, epsilon : 1.0%, Counter({'hit': 7, 'out of range': 3})


In [None]:
q_new.state_dict()

In [17]:
run(q,
    q_target,
    learning_rate=0.05,
    gamma=0.99,
    buffer_max_size=500000,
    batch_size=16,
    target_update_interval=10,
    replay_buffer_start_size=50000,
    print_interval=10,
    n_episodes=1,
    n_env_steps=500,
    render=True, 
    epsilon0 = 0.01)

QNetwork(
  (fc1): Linear(in_features=8, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=3, bias=True)
)

In [None]:
env = AirCombat2D(opts)
print("Action space:",env.action_space)
print("Observation space:", env.observation_space.low)

In [None]:
env = gym.wrappers.TimeLimit(AirCombat2D(opts,altitude=15e3,target_autopilot=target_autopilot).unwrapped, max_episode_steps=500)
# проводим инициализацию и запоминаем начальное состояние
s = env.reset()

done = False

while not done:
    
    a = q_new.sample_action(torch.from_numpy(s).float(), 0.01)
    
    # выполняем действие, получаем s, r, done
    s, r, done, info = env.step(a)
    
    # визуализируем окружение
    env.render()

#     print(f"Время выполнения {it} шага: {round(time() - timeit, 3)} с", end="\r")

clear_output(True)

env.close()
if s[0] < 10:
    print("Успех!")
else:
    print(f"Не успех: {info}")