# Installs

In [None]:
pip install soccer-twos

In [None]:
pip install protobuf==3.20

In [None]:
!pip install gym==0.19.0
!pip install gym-unity==0.27.0
!pip install mlagents==0.27.0
!pip install mlagents-envs==0.27.0

# Imports e utils

In [None]:
import torch
import gym
import ray
from ray import tune
from soccer_twos import EnvType
import numpy as np
import os
from ray.rllib import MultiAgentEnv
from ray.rllib.agents.ppo import ppo
from ray.tune.logger import pretty_print
import soccer_twos
import collections
import random
from collections import deque
from gym_unity.envs import ActionFlattener
import numpy as np
from tqdm import tqdm

In [None]:
class RLLibWrapper(gym.core.Wrapper, MultiAgentEnv):
    """
    Thin adapter that lets a wrapped gym environment also be a MultiAgentEnv.

    It defines nothing of its own: all behaviour is inherited from
    gym.core.Wrapper and MultiAgentEnv.
    """


def create_rllib_env(env_config: dict = None):
    """
    Creates a RLLib environment and prepares it to be instantiated by Ray workers.

    The caller's ``env_config`` is never modified: all keys are copied into a
    private dict before ``worker_id`` is added.

    Args:
        env_config: configuration for the environment (``None`` means no options).
            You may specify the following keys:
            - variation: one of soccer_twos.EnvType. Defaults to EnvType.multiagent_player.
            - opponent_policy: a Callable for your agent to train against. Defaults to a random policy.
            - multiagent: set to a falsy value to get the raw environment back
              instead of one wrapped in RLLibWrapper.
            - num_envs_per_worker: used only to derive ``worker_id`` (defaults to 1).

    Returns:
        The environment built by ``soccer_twos.make``, wrapped in ``RLLibWrapper``
        unless ``multiagent`` is present and falsy.
    """
    # A None default (instead of a shared mutable `{}`) plus a private copy keeps
    # both the default and the caller's object free of side effects.
    settings = dict(env_config) if env_config is not None else {}

    # Config objects that carry worker_index / vector_index attributes get a
    # worker_id derived from them, distinct per (worker, sub-env) pair.
    if hasattr(env_config, "worker_index"):
        settings["worker_id"] = (
            env_config.worker_index * settings.get("num_envs_per_worker", 1)
            + env_config.vector_index
        )

    env = soccer_twos.make(**settings)

    # Multi-agent is the default; it is only disabled when explicitly set falsy.
    if not settings.get("multiagent", True):
        return env
    return RLLibWrapper(env)

In [None]:
# Register the env factory under the name "Soccer".
tune.registry.register_env("Soccer", create_rllib_env)

# Build a throwaway environment only to read its observation/action spaces.
temp_env = create_rllib_env({"variation": EnvType.multiagent_player})
try:
    obs_space = temp_env.observation_space
    act_space = temp_env.action_space
finally:
    # Always release the temporary environment, even if reading the spaces fails.
    temp_env.close()

In [None]:
def map_policy(agent_id, *args, **kwargs):
    """
    Chooses which policy controls the given agent.

    Agent 0 is always driven by "policy_01". Any other agent gets "policy_01"
    with probability 0.8, otherwise one of "policy_02", "policy_03" or
    "policy_04" with equal probability (0.2 / 3 each).

    Args:
        agent_id: id of the agent being mapped.
        *args, **kwargs: ignored. Accepted so the function can be called either
            with the agent id alone or with extra context arguments, since
            RLlib versions differ in what they pass to a policy mapping function.

    Returns:
        The id of the selected policy.
    """
    if agent_id == 0:
        return "policy_01"

    policy_ids = ["policy_01", "policy_02", "policy_03", "policy_04"]
    probabilities = [.8, .2/3, .2/3, .2/3]
    # choice() returns a length-1 array here; take its only element.
    return np.random.choice(policy_ids, 1, p=probabilities)[0]

# Treinamento

In [None]:
# Start from RLlib's default PPO settings and override only what this experiment changes.
config = ppo.DEFAULT_CONFIG.copy()
# Directory where the training loop writes its periodic checkpoints.
checkpoint = 'resultados/ppo_1'

# NOTE: this dict was previously bound to the misspelled name `confing_params`,
# so the `config.update(config_params)` call below raised NameError.
config_params = {
    # Resources and rollout workers
    "num_gpus": 1,
    "num_workers": 3,
    "num_envs_per_worker": 1,
    "log_level": "INFO",
    "framework": "torch",
    "ignore_worker_failures": True,
    # PPO hyperparameters
    "train_batch_size": 256,
    "lr": 1e-3,
    "lambda": .95,
    "gamma": .998,
    "entropy_coeff": 0.01,
    "kl_coeff": 1.0,
    "clip_param": 0.2,
    "num_sgd_iter": 10,
    "observation_filter": "NoFilter",
    "vf_loss_coeff": 1e-4,
    # Very large value; presumably meant to effectively disable value clipping (TODO confirm).
    "vf_clip_param": 1000000.0,
    # Four policies sharing the same spaces; map_policy decides which one
    # controls each agent.
    "multiagent": {
        "policies": {
            "policy_01": (None, obs_space, act_space, {}),
            "policy_02": (None, obs_space, act_space, {}),
            "policy_03": (None, obs_space, act_space, {}),
            "policy_04": (None, obs_space, act_space, {})
        },
        "policy_mapping_fn": map_policy,
    },
    "env": "Soccer",
    "env_config": {
        # Read by create_rllib_env only to derive worker_id. It differs from
        # the top-level "num_envs_per_worker" above (TODO confirm this is intended).
        "num_envs_per_worker": 3,
        "variation": EnvType.multiagent_player,
    },
}

config.update(config_params)
trainer = ppo.PPOTrainer(
    env="Soccer",
    config=config
)

In [None]:
resultados = []
for i in range(int(1e3)):  # 1000 training iterations
    # Train exactly once per loop step and reuse that single result for both
    # the history and the log. Calling trainer.train() twice here would run two
    # iterations per step and print a different result from the one stored.
    resultado = trainer.train()
    resultados.append(resultado)
    print(pretty_print(resultado))

    if i % 10 == 0:
        # Shift the policy snapshots one slot down: 01 -> 02 -> 03 -> 04.
        # All source weights are read before any are written.
        weights = trainer.get_weights(["policy_01", "policy_02", "policy_03"])
        trainer.set_weights({
            "policy_04": weights["policy_03"],
            "policy_03": weights["policy_02"],
            "policy_02": weights["policy_01"],
        })

    if i % 80 == 0:
        # exist_ok avoids the separate existence check before creating the directory.
        os.makedirs(checkpoint, exist_ok=True)
        trainer.save_checkpoint(checkpoint)

In [None]:
# Sanity check: inspect the first training result to confirm it was recorded correctly.
print(resultados[0])