In [None]:
import os
import sys

%load_ext autoreload
%autoreload 2

# Put the parent of the current working directory at the front of the import
# search path so that the project-local `src` package imported below resolves.
module_path = os.path.abspath(os.pardir)
sys.path.insert(0, module_path)

from src import Interception2D
from src import make_escape_scenarios, plot_scenarios
import numpy as np
import os
import yaml
import matplotlib.pyplot as plt
from pathlib import Path
from copy import deepcopy

# Locate the repository root: the nearest directory named 'SmartInterception'
# among the current working directory and its ancestors.
BASE_PATH = Path(os.getcwd())
for _candidate in (BASE_PATH, *BASE_PATH.parents):
    if _candidate.name == 'SmartInterception':
        BASE_PATH = _candidate
        break
else:
    # Previously the search ran off the filesystem root and died with an
    # uninformative IndexError; fail with an actionable message instead.
    raise FileNotFoundError(
        f"No 'SmartInterception' directory found at or above {BASE_PATH}; "
        "start the notebook from inside the repository."
    )
# TensorBoard logs are written under <repo root>/files/logs.
LOG_PATH = os.path.join(BASE_PATH, 'files', 'logs')

# Set before importing torch. Presumably a workaround for the
# "duplicate OpenMP runtime" abort seen with some MKL/conda builds -- TODO confirm.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import torch
# Show the GPU name when CUDA is usable. `get_device_name()` raises on CPU-only
# machines, which would abort the cell even though the SAC model below runs on CPU.
torch.cuda.get_device_name() if torch.cuda.is_available() else 'CUDA is not available'

In [None]:
from stable_baselines3 import DDPG, SAC, TD3
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise, NormalActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, SubprocVecEnv
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback

### 1. Настройка вывода TensorBoard

In [None]:
class SubprocVecEnvCallback(BaseCallback):
    """Log the positive/negative reward-step ratio of every sub-environment.

    On each callback step the ``reward`` attribute of every sub-environment is
    read through ``env.get_attr('reward')``. Per environment, the callback keeps
    cumulative counters of steps with a positive and with a negative reward
    (never reset) and records ``positive / negative`` to the logger under
    ``reward_relation/env_<i>``.

    Args:
        env: Vectorized environment to inspect. It must provide ``get_attr`` and
            either ``num_envs`` or ``remotes``.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, env, verbose=0):
        super().__init__(verbose)

        self.env = env
        self.buffer = {}

        # Prefer the public `num_envs` attribute so the callback also works with
        # vectorized envs that have no `remotes`; fall back to counting the
        # worker pipes as the original implementation did.
        n_envs = getattr(self.env, 'num_envs', None)
        if n_envs is None:
            n_envs = len(self.env.remotes)
        # Not read anywhere in this class; kept for backward compatibility.
        self.relations = np.repeat(0, n_envs)

        for i in range(n_envs):
            self.buffer[f'env_{i}'] = {
                'positive': 0,
                'negative': 0,
                'relation': 0
            }

    def _on_step(self) -> bool:
        """Update the reward counters and log the ratio for every sub-environment.

        Returns:
            Always ``True``, so training is never stopped by this callback.
        """
        rewards = self.env.get_attr('reward')
        for i, value in enumerate(rewards):
            stats = self.buffer[f'env_{i}']
            if value > 0:
                stats['positive'] += 1
            elif value < 0:
                stats['negative'] += 1
            # Zero rewards change neither counter.

            if stats['negative']:
                stats['relation'] = stats['positive'] / stats['negative']
            else:
                # No negative step seen yet: report the raw positive count
                # instead of dividing by zero.
                stats['relation'] = stats['positive']
            self.logger.record(f'reward_relation/env_{i}', stats['relation'])
        return True

### 2. Сценарии обучения

In [None]:
# Generation settings for the escape scenarios, collected in one place so they
# are easy to tweak. The meaning and units of the d / q / eps ranges are defined
# by `make_escape_scenarios` (not visible in this notebook).
# NOTE(review): seed=None presumably yields different scenarios on every run --
# confirm, and pin a seed if reproducibility matters.
scenario_settings = dict(
    n=10,
    seed=None,
    target_centered=True,
    d_min=15000.5,
    d_max=40000,
    q_min=0,
    q_max=50,
    eps_min=-20,
    eps_max=20,
)
scenarios, params = make_escape_scenarios(**scenario_settings)
plot_scenarios(scenarios, params)

### 3. Векторизованное окружение

In [None]:
# Number of worker processes for the vectorized environment.
nproc = 6

# Template environment. It is built with an empty scenario list; each worker
# receives its own deep copy of it.
env = Interception2D(
    agent='target',
    bounds='bounds.yaml',
    scenarios=[],
)

def make_env(env, seed):
    """Return a zero-argument factory producing a seeded copy of ``env``.

    Args:
        env: Template environment; it is deep-copied and never modified.
        seed: Value passed to the copy's ``seed`` method.

    Returns:
        A callable that, each time it is invoked, deep-copies ``env``, seeds the
        copy with ``seed`` and returns it.
    """
    def _build():
        # Copy lazily, inside the factory, so every call yields an independent env.
        clone = deepcopy(env)
        clone.seed(seed)
        return clone

    return _build

# One factory per worker, seeded 0 .. nproc-1.
env_factories = [make_env(env, worker_seed) for worker_seed in range(nproc)]
# Each factory is run in its own process started with the 'spawn' method.
envs = SubprocVecEnv(env_factories, start_method='spawn')

# check_env(envs)

In [None]:
# Actor ("pi") and critic ("qf") networks: two hidden layers of 64 units, ELU.
policy_kwargs = {
    'activation_fn': torch.nn.ELU,
    'net_arch': {'pi': [64, 64], 'qf': [64, 64]},
}

# SAC hyper-parameters, kept together for easy tuning.
# NOTE(review): train_freq=(1000, 'step') is used with the default
# `gradient_steps`; confirm that the resulting update-to-data ratio is intended.
sac_kwargs = dict(
    verbose=1,
    tensorboard_log=LOG_PATH,
    device='cpu',
    buffer_size=10_000_000,
    learning_starts=30_000,
    batch_size=512,
    policy_kwargs=policy_kwargs,
    tau=0.2,
    train_freq=(1000, 'step'),
)

model = SAC('MlpPolicy', envs, **sac_kwargs)

In [None]:
# Assign the generated scenarios to the `scenarios` attribute of the
# environments wrapped by the model's vectorized env.
# NOTE(review): the local template `env` was created with scenarios=[] and is
# not touched by this call -- confirm that is intended for the cells using `env`.
model.env.set_attr('scenarios', scenarios)

In [None]:
# Train SAC while logging per-environment reward ratios. The worker processes
# are shut down even if training raises or is interrupted from the notebook;
# otherwise they would be left running.
try:
    model.learn(total_timesteps=10_000_000, callback=SubprocVecEnvCallback(model.env))
finally:
    envs.close()

In [None]:
# Roll out a single episode on the local (non-vectorized) environment with the
# current model, then render the trajectory and print the final status.
# NOTE(review): `predict` is called with its default arguments; pass
# deterministic=True if a deterministic evaluation is wanted -- confirm.
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, _ = env.step(action)
    if done:
        break
env.post_render(renderer='notebook')
print(env.status)

In [None]:
# Gaussian exploration noise: zero mean, sigma 0.1 for every action dimension.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.1),
)

# Actor ("pi") and critic ("qf") networks: three hidden layers of 16 units, ReLU.
policy_kwargs = {
    'activation_fn': torch.nn.ReLU,
    'net_arch': {'pi': [16, 16, 16], 'qf': [16, 16, 16]},
}

# DDPG hyper-parameters; this model is trained on the single local `env`.
# NOTE(review): verbose=4 and buffer_size=100_000_000 look unusually large --
# confirm both against the stable-baselines3 documentation and available memory.
ddpg_kwargs = dict(
    verbose=4,
    action_noise=action_noise,
    tensorboard_log=LOG_PATH,
    device='cuda',
    buffer_size=100_000_000,
    learning_starts=30_000,
    batch_size=64,
    policy_kwargs=policy_kwargs,
    tau=0.1,
)

model = DDPG('MlpPolicy', env, **ddpg_kwargs)

In [None]:
# Train the current model for one million environment steps.
model.learn(total_timesteps=1_000_000)

In [None]:
def _make_modelname(name_const):
    filename = name_const + '.zip'
    i = 1
    while os.path.exists(os.path.join(BASE_PATH, 'models', filename)):
        filename = name_const + f'_{i}' + '.zip'
        i += 1
    return os.path.join(BASE_PATH, 'models', filename)

# Name the archive after the algorithm of the model actually held in `model`
# (the name was hard-coded to 'SAC' even when `model` is the DDPG instance).
# `_make_modelname` already returns the full path, so it is not joined again.
model.save(_make_modelname(type(model).__name__))

In [None]:
# Load a DDPG model stored under <BASE_PATH>/models with the base name 'DDPG'
# and attach the local `env` to it; this replaces the current `model`.
model = DDPG.load(os.path.join(BASE_PATH, 'models', 'DDPG'), env=env)

In [None]:
# Print the `info` attribute of the local environment.
print(env.info)