In [1]:
from modules.utils import TrainConfig, Logger, paint, get_env, simulate_episode
from modules.DQN import DQN
from modules.reward import Reward
from modules.preprocess import preprocess

import numpy as np
import pickle
from tqdm.auto import trange, tqdm

import torch
import torch.nn as nn
import math

import random

import warnings
warnings.filterwarnings('ignore')

import os

# 1 right
# 2 left
# 3 up
# 4 down

# general settings
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
N_ACTIONS = 5
N_PREDATORS = 5 # change to 5 !!!!!
N_MASKS = 5 # after preprocessing
MAP_SIZE = 40

# train settings
cfg = TrainConfig(
    description='some description',    
    max_steps_for_episode=300, # change to 300 !!!!!
    gamma = 0.9, # maybe better to set less !!!!!
    initial_steps=1000, # change to 100000 !!!!!
    steps=100_000,
    steps_per_update=3,
    steps_per_paint=250,
    steps_per_gif=500,
    buffer_size=10_000,
    batch_size=64,
    learning_rate=1e-3,
    eps_start=0.9, 
    eps_end=0.05,
    eps_decay=1000,
    tau=0.01, # the update rate of the target network, was 0.005
    reward_weights=dict(
        w_kill_prey=1.,
        w_kill_enemy=2.3,
        w_kill_bonus=1.3, 
        gamma_for_bonus_count=0.5,
        w_dummy_step=-0.23
    ),
    seed=1234 
)

  from .autonotebook import tqdm as notebook_tqdm


# TRAIN

In [4]:
from modules.reward import get_state_value

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


class RewardBasedModel:
    def __init__(self, reward_weights):
        self.reward_weights = reward_weights
        self.rewards = dict()

    def get_actions(self, processed_state, info):
        # initial_sv = get_state_value(cfg.reward_weights, processed_state)
        rewards = []
        for name, x, y in [("right", 21, 20), ("left", 19, 20), ("up", 20, 19), ("down", 20, 21)]:
            sv = get_state_value(cfg.reward_weights, processed_state, info, (x, y))
            reward = sv + self.__get_kills_value(x, y, processed_state)
            reward = reward * self.__is_cell_empty(x, y, processed_state)
            rewards.append(reward)
            self.rewards[name] = reward
        rewards = np.stack(rewards)
        return np.argmax(rewards, axis=0) + 1

    def __get_weight_from_coordinates(self, x, y, processed_state):
        _, preys_mask, enemies_mask, bonuses_mask, _ = processed_state

        if preys_mask[y, x] == 1:
            return self.reward_weights["w_kill_prey"]
        if enemies_mask[y, x] == 1:
            return self.reward_weights["w_kill_enemy"]
        if bonuses_mask[y, x] == 1:
            return self.reward_weights["w_kill_bonus"]

        return 0

    def __get_kills_value(self, x, y, processed_state):
        out = []
        for pr_st in processed_state:
            out.append(self.__get_weight_from_coordinates(x, y, pr_st))
        return np.stack(out)

    def __is_cell_empty(self, x, y, processed_state):
        out = []
        for pr_st in processed_state:
            stones_mask, *_ = pr_st
            out.append(stones_mask[y, x] == 0)
        return np.array(out)


def evaluate(model, n_episodes=3):
    results = []
    for step, d in enumerate(np.linspace(0, 1, n_episodes)):  
        print('\r' + str(step), end='')      
        results.append(simulate_episode(model, d, N_PREDATORS, cfg, '00000.gif', render_gif=False))
    return sum(results) / len(results)

# ys = []
# xs = np.linspace(-0.15, -0.35, 8)
# for i, w_dummy_step in tqdm(enumerate(xs)):
#     weights = cfg.reward_weights.copy()
#     weights["w_dummy_step"] = w_dummy_step
#     model = RewardBasedModel(weights)
#     ys.append(evaluate(model))
print(evaluate(RewardBasedModel(cfg.reward_weights), 1))

00.4974958263772955


In [3]:
from matplotlib import pyplot as plt

plt.plot(xs, ys)
plt.xlabel('w_dummy_step')
plt.ylabel('score / (bot_score + score)')
plt.title('score / (bot_score + score)')
plt.axis(ymin=0, ymax=1)
plt.axhline(0.5, color='red', linestyle='--')
plt.show()

NameError: name 'xs' is not defined

In [None]:
def train():
    model = DQN(
        n_masks=N_MASKS,
        n_actions=N_ACTIONS,
        n_predators=N_PREDATORS,
        map_size=MAP_SIZE,
        device=DEVICE,
        config=cfg
    ).to(DEVICE).train()

    logger = Logger(cfg)

    # INITIAL STEPS
    env = get_env(n_predators=N_PREDATORS, difficulty=0, step_limit=cfg.max_steps_for_episode)
    state, info = env.reset()    
    processed_state = preprocess(state, info)
    r = Reward(n_predators=N_PREDATORS, **cfg.reward_weights)
    for _ in trange(cfg.initial_steps):
        actions = model.get_actions(processed_state, random=True)
        next_state, done, next_info = env.step(actions)
        next_processed_state = preprocess(next_state, next_info)
        reward = r(processed_state, info, next_processed_state, next_info)
        model.consume_transition(processed_state, actions, next_processed_state, reward, done)
        state, info = (next_state, next_info) if not done else env.reset()        
        processed_state = preprocess(state, info)
        

    # with open(f'pre_calc_buffer_simple_10000.pkl', 'wb') as handle:
    #     pickle.dump(model.buffer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # TRAINING
    # with open('pre_calc_buffer_simple_10000.pkl', 'rb') as handle:
    #     model.buffer = pickle.load(handle)

    env = get_env(n_predators=N_PREDATORS, difficulty=0, step_limit=cfg.max_steps_for_episode)
    state, info = env.reset()
    processed_state = preprocess(state, info)
    r = Reward(n_predators=N_PREDATORS, **cfg.reward_weights)
    score_difference = None

    try:
        for _ in trange(cfg.steps):
            # ========== step ==========================================================
            eps_threshold = cfg.eps_end + (cfg.eps_start - cfg.eps_end) * \
                math.exp(-1. * model.steps_done / cfg.eps_decay)
            actions = model.get_actions(processed_state, random=(random.random() < eps_threshold))
            next_state, done, next_info = env.step(actions)
            next_processed_state = preprocess(next_state, next_info)
            reward = r(processed_state, info, next_processed_state, next_info)
            model.consume_transition(processed_state, actions, next_processed_state, reward, done)

            if done:
                # not just reset in oreder to implement changes of map
                env = get_env(n_predators=N_PREDATORS, difficulty=model.steps_done/cfg.steps,
                              step_limit=cfg.max_steps_for_episode)
                state, info = env.reset()
            else:
                state, info = next_state, next_info

            processed_state = preprocess(state, info)

            # ========== updates =======================================================
            if model.steps_done % cfg.steps_per_update == 0:
                reward_batch, loss = model.update_policy_network()

            model.soft_update_target_network()  # each step

            if model.steps_done % cfg.steps_per_paint == 0 and model.steps_done > 0:
                paint(logger, [['reward', 'reward_batch', 'loss'], ['score_difference']])

            if model.steps_done % cfg.steps_per_gif == 0 and model.steps_done > 0:
                os.makedirs(logger.curr_subfolder_path + '/gifs', exist_ok=True)
                path = f'{logger.curr_subfolder_path}/gifs/{model.steps_done}_steps.gif'
                score_difference = simulate_episode_and_create_gif(
                    model, model.steps_done/cfg.steps, N_PREDATORS, cfg, path)

            model.steps_done += 1

            # ========== logs ==========================================================
            logger.add('eps', eps_threshold)
            logger.add('reward', reward.mean())
            logger.add('reward_batch', reward_batch)
            logger.add('loss', loss)
            logger.add('score_difference', score_difference)

    except KeyboardInterrupt:
        print('Training interrupted')

    finally:
        logger.save()
        model.save(logger.curr_subfolder_path + f'/model_steps_{model.steps_done}.pt')
        return model


model = train()

TypeError: __init__() got an unexpected keyword argument 'w_kill_prey'

# TODO

0. разобраться с гитом
0. ивалуейт. сделать 5 итераций, степс_пер_криейт_гиф увеличить в 5 раз
0. сделать реворд-2
0. зафорсить оптимальные действия в инишал буффер ??
0. добавить шедулер ??
2. добавить разделение на группы в пейнтр
4. добавить MA to paint
6. если заработает бейзлайн, подумать как добавить возм-ть выучить "бфс"
7. способ ускорить реворд - сохранять последний СВ
8. изменить отрисовку на фиолетовую
9. добавить учёт разницы в счете ?..

# список костылей

2. плохо проработан момент съедения жертвы
4. костыль RewardBasedModel: если идёт в стену, то может получить положительный реворд

# заметки

1. слишком высокий вес енеми заставляет за ним гоняться и это выходит тупо. 
2. в случае если кого-то кушаю, реворд должен учитывать дельтуСВ, но при этом она НЕ должна быть меньше 0
3. отслеживать респавн через инфо, учитывать в реворде
4. сделать чтобы после n-го шага считался более дешевый в вычислении реворд
5. сделать удорожание каждого последующего dummy_step'a
6. (возможно) сделать рассеивание местоположения далёких жертв (что это даст?)

In [None]:
model = DQN(
    n_masks=N_MASKS,
    n_actions=N_ACTIONS,
    n_predators=N_PREDATORS,
    map_size=MAP_SIZE,
    device=DEVICE,
    config=cfg
).to(DEVICE).train()

with open('pre_calculated_buffer_10000.pkl', 'rb') as handle:
    buffer = pickle.load(handle)

FileNotFoundError: [Errno 2] No such file or directory: 'pre_calculated_buffer_10000.pkl'

In [None]:
        #     rews.append(reward)
        # rews = np.stack(rews)
        # sft = softmax(rews)
        # actions = []
        # for i in range(5):
        #     action = np.random.choice(np.arange(4), p=sft[:, i]) + 1
        #     actions.append(action)
        # return actions