In [None]:
#Imports

from marketsai.markets.diff_demand import DiffDemand
from marketsai.economies.economies import Economy

#import ray

from ray import tune, shutdown, init
from ray.tune.registry import register_env
from ray.rllib.agents.a3c.a2c import A2CTrainer
from ray.rllib.agents.dqn.dqn import DQNTrainer
from ray.tune.integration.mlflow import MLflowLoggerCallback
from ray.rllib.utils.schedules.exponential_schedule import ExponentialSchedule

import random
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging

In [None]:
# STEP 0: Initialize Ray

# Logical CPU budget handed to Ray; the experiment config below sizes its
# rollout workers from this value (NUM_CPUS - 1).
NUM_CPUS = 14

# Shut down any Ray runtime already attached to this kernel before starting a
# new one, so this cell can be re-run.
shutdown()
init(logging_level=logging.ERROR, num_cpus=NUM_CPUS)

In [None]:
# STEP 1: Register the environment

# Make the Economy environment available to RLlib/Tune under the name "economy".
register_env("economy", Economy)

# Local instance built with the constructor's default arguments; the cells below
# read n_agents and the per-agent observation/action spaces from it.
env = Economy()

# One policy id per agent: "policy_0", "policy_1", ...
policy_ids = [f"policy_{agent_idx}" for agent_idx in range(env.n_agents)]

In [None]:
# STEP 2: Experiment configuration

# Training budget: the stop criterion below ends a run at this many trained steps.
MAX_STEPS = 2000 * 1000

# Price bounds, each widened outward by PRICE_BAND_WIDE.
PRICE_BAND_WIDE = 0.1
LOWER_PRICE = 1.47 - PRICE_BAND_WIDE
HIGHER_PRICE = 1.93 + PRICE_BAND_WIDE

# Decay rates for the exploration schedule.
DEC_RATE = math.e ** (-4 * 10 ** (-6))
# Variant with a 4x larger exponent; defined here but not referenced in this cell.
DEC_RATE_HIGH = math.e ** (-4 * 10 ** (-6) * 4)

# Market settings: the same bounds repeated once per agent.
mkt_config = {
    "lower_price": [LOWER_PRICE] * env.n_agents,
    "higher_price": [HIGHER_PRICE] * env.n_agents,
}

# Two DiffDemand markets that share the one mkt_config dict.
env_config = {
    "markets_dict": {
        f"market_{market_idx}": (DiffDemand, mkt_config) for market_idx in range(2)
    }
}

# Epsilon-greedy exploration: epsilon starts at 1 and follows an exponential
# schedule driven by DEC_RATE.
epsilon_schedule = ExponentialSchedule(
    schedule_timesteps=1,
    framework=None,
    initial_p=1,
    decay_rate=DEC_RATE,
)
exploration_config = {
    "type": "EpsilonGreedy",
    "epsilon_schedule": epsilon_schedule,
}

# One policy entry per agent. Each value is a 4-tuple: None, the agent's
# observation space, the agent's action space, and an empty per-policy config dict.
policies = {}
for agent_idx in range(env.n_agents):
    agent_key = f"agent_{agent_idx}"
    policies[policy_ids[agent_idx]] = (
        None,
        env.observation_space[agent_key],
        env.action_space[agent_key],
        {},
    )

multiagent_config = {
    "policies": policies,
    # "agent_<k>" -> policy_ids[k]: the integer after the first underscore picks the policy.
    "policy_mapping_fn": (lambda agent_id: policy_ids[int(agent_id.split("_")[1])]),
}

config = {
    "gamma": 0.95,
    "lr": 0.15,
    "env": "economy",
    "exploration_config": exploration_config,
    "env_config": env_config,
    # Horizon settings (see the RLlib common config docs for exact semantics).
    "horizon": 100,
    "soft_horizon": True,
    "no_done_at_end": True,
    "multiagent": multiagent_config,
    "framework": "torch",
    # One of the NUM_CPUS is left out of the rollout-worker count.
    "num_workers": NUM_CPUS - 1,
    "num_gpus": 0,
    "log_level": "ERROR",
    #"normalize_actions": False
}

# Stop criterion passed to tune.run in the experiment cells.
stop = {"info/num_steps_trained": MAX_STEPS}

In [None]:
# Sanity check: sum of the entries of agent_0's observation-space `nvec`.
agent0_nvec = env.observation_space["agent_0"].nvec
print(np.sum(agent0_nvec))

In [None]:
# Step 3: Experiments

# PG run. The MLflow callback logs under the same name as the Tune experiment.
exp_name = "econ_PG_April8"
mlflow_logger = MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)

results = tune.run(
    "PG",
    name=exp_name,
    config=config,
    stop=stop,
    # Trials are ranked by the highest mean episode reward.
    metric="episode_reward_mean",
    mode="max",
    checkpoint_freq=250,
    checkpoint_at_end=True,
    callbacks=[mlflow_logger],
)

# Checkpoint Tune reports as best under the metric/mode given above.
best_checkpoint = results.best_checkpoint
print("Best checkpont:", best_checkpoint)

In [None]:
# PPO run. The MLflow callback logs under the same name as the Tune experiment.
exp_name = "econ_PPO_April8"
mlflow_logger = MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)

results = tune.run(
    "PPO",
    name=exp_name,
    config=config,
    stop=stop,
    # Trials are ranked by the highest mean episode reward.
    metric="episode_reward_mean",
    mode="max",
    checkpoint_freq=250,
    checkpoint_at_end=True,
    callbacks=[mlflow_logger],
)

# Checkpoint Tune reports as best under the metric/mode given above.
best_checkpoint = results.best_checkpoint
print("Best checkpont:", best_checkpoint)

In [None]:
# APPO run. The MLflow callback logs under the same name as the Tune experiment.
exp_name = "econ_APPO_April8"
mlflow_logger = MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)

results = tune.run(
    "APPO",
    name=exp_name,
    config=config,
    stop=stop,
    # Trials are ranked by the highest mean episode reward.
    metric="episode_reward_mean",
    mode="max",
    checkpoint_freq=250,
    checkpoint_at_end=True,
    callbacks=[mlflow_logger],
)

# Checkpoint Tune reports as best under the metric/mode given above.
best_checkpoint = results.best_checkpoint
print("Best checkpont:", best_checkpoint)

In [None]:
# IMPALA run. The MLflow callback logs under the same name as the Tune experiment.
exp_name = "econ_IMPALA_April8"
mlflow_logger = MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)

results = tune.run(
    "IMPALA",
    name=exp_name,
    config=config,
    stop=stop,
    # Trials are ranked by the highest mean episode reward.
    metric="episode_reward_mean",
    mode="max",
    checkpoint_freq=250,
    checkpoint_at_end=True,
    callbacks=[mlflow_logger],
)

# Checkpoint Tune reports as best under the metric/mode given above.
best_checkpoint = results.best_checkpoint
print("Best checkpont:", best_checkpoint)

In [None]:
# Step 4: Evaluation
#
# Restore the best checkpoint of the most recent experiment and roll the two
# agents forward greedily, recording prices and profits at every step.

N_EVAL_STEPS = 500  # number of environment steps in the evaluation rollout

config["evaluation_config"] = {"explore": False}

# Build the trainer from the same algorithm that produced `best_checkpoint`.
# The experiments above train PG/PPO/APPO/IMPALA, so a hard-coded DQNTrainer
# would try to load a checkpoint written by a different algorithm.
trainer_cls = results.best_trial.get_trainable_cls()
trained_trainer = trainer_cls(config=config)
trained_trainer.restore(best_checkpoint)

price_agent0_list = []
reward_agent0_list = []
price_agent1_list = []
reward_agent1_list = []

# Seed step with fixed actions to obtain the first observation.
# NOTE(review): `env` is stepped here without a visible reset(), and it was
# constructed with default arguments rather than `env_config` -- confirm both
# match the environment the policies were trained on.
obs, reward, done, info = env.step({"agent_0": 1, "agent_1": 11})

for _ in range(N_EVAL_STEPS):
    # `evaluation_config` only applies to RLlib's own evaluation workers;
    # compute_action() needs explore=False passed explicitly, otherwise the
    # policy's exploration is still applied when picking actions.
    action_agent0 = trained_trainer.compute_action(
        obs["agent_0"], policy_id="policy_0", explore=False
    )
    action_agent1 = trained_trainer.compute_action(
        obs["agent_1"], policy_id="policy_1", explore=False
    )
    obs, reward, done, info = env.step(
        {"agent_0": action_agent0, "agent_1": action_agent1}
    )
    # info[...] is stored as the agent's price -- presumably Economy reports
    # the price there; verify against the environment.
    price_agent0_list.append(info["agent_0"])
    reward_agent0_list.append(reward["agent_0"])
    price_agent1_list.append(info["agent_1"])
    reward_agent1_list.append(reward["agent_1"])

plt.ion()

# One figure per agent: recorded price at each evaluation step.
plt.plot(price_agent0_list)
plt.show()
plt.plot(price_agent1_list)
plt.show()

# Collect the rollout into a table and persist it.
IRresults = {
    "Profits Agent 0": reward_agent0_list,
    "Profits Agent 1": reward_agent1_list,
    "Price Agent 0": price_agent0_list,
    "Price Agent 1": price_agent1_list,
}
df_IR = pd.DataFrame(IRresults)
# NOTE(review): the file name still says "DQN" although the evaluated algorithm
# is whichever experiment ran last; kept unchanged so existing consumers of
# this file keep working.
df_IR.to_csv("collusion_IR_DQN.csv")