In [1]:
#Imports

import logging
import math
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import ray
from ray import tune, shutdown, init
from ray.rllib.agents.a3c.a2c import A2CTrainer
from ray.rllib.agents.dqn.dqn import DQNTrainer
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.utils.schedules.exponential_schedule import ExponentialSchedule
from ray.tune.integration.mlflow import MLflowLoggerCallback
from ray.tune.registry import register_env

from marketsai.economies.economies import Economy
from marketsai.markets.diff_demand import DiffDemandDiscrete

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# STEP 0: Initialize ray

# Number of CPUs handed to Ray; the trainer config below derives
# "num_workers" from this value (NUM_CPUS - 1).
NUM_CPUS = 14
# Shut down any Ray instance that is already running before starting a new
# one, so this cell can be re-run in the same kernel.
shutdown()
# Start Ray with only ERROR-level logging. As the last expression in the cell,
# the value returned by init() is what the notebook displays.
init(num_cpus=NUM_CPUS, logging_level=logging.ERROR)

{'node_ip_address': '192.168.1.202',
 'raylet_ip_address': '192.168.1.202',
 'redis_address': '192.168.1.202:29974',
 'object_store_address': '/tmp/ray/session_2021-04-08_10-58-19_259942_8782/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-04-08_10-58-19_259942_8782/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-04-08_10-58-19_259942_8782',
 'metrics_export_port': 59454,
 'node_id': '0522fa44f4e05c55e0dfc1dc6fd7f94c1ea823d181e6f4e6d15ae775'}

In [3]:
# STEP 1: register environment

# Expose the Economy environment to RLlib/Tune under the id "economy";
# the trainer config refers to it through "env": "economy".
register_env("economy", Economy)

# Local instance built with the environment's default arguments. It is used
# below to read the number of agents and the per-agent spaces.
env = Economy()

# One policy id per agent: "policy_0", "policy_1", ...
policy_ids = [f"policy_{agent_index}" for agent_index in range(env.n_agents)]

In [4]:
# STEP 2: Experiment configuration

# --- Scalar settings ---------------------------------------------------
MAX_STEPS = 10_000  # training stops once this many steps have been trained
PRICE_BAND_WIDE = 0.1  # margin added on both sides of the base price range
LOWER_PRICE = 1.47 - PRICE_BAND_WIDE
HIGHER_PRICE = 1.93 + PRICE_BAND_WIDE
# Per-step decay factors for the exploration schedule.
DEC_RATE = math.e ** (-4 * 10 ** (-6))
DEC_RATE_HIGH = math.e ** (-4 * 10 ** (-6) * 4)  # not referenced in this cell

# --- Environment configuration -----------------------------------------
# Every agent gets the same lower/upper price bound.
mkt_config = {
    "lower_price": [LOWER_PRICE for _ in range(env.n_agents)],
    "higher_price": [HIGHER_PRICE for _ in range(env.n_agents)],
}
# Two markets, both DiffDemandDiscrete and both sharing the same mkt_config.
env_config = {
    "markets_dict": {
        f"market_{market_index}": (DiffDemandDiscrete, mkt_config)
        for market_index in range(2)
    }
}

# --- Exploration -------------------------------------------------------
# Epsilon starts at 1 and is decayed exponentially with rate DEC_RATE.
epsilon_schedule = ExponentialSchedule(
    schedule_timesteps=1,
    framework=None,
    initial_p=1,
    decay_rate=DEC_RATE,
)
exploration_config = {
    "type": "EpsilonGreedy",
    "epsilon_schedule": epsilon_schedule,
}


# --- Multi-agent wiring --------------------------------------------------
def _policy_spec(agent_index):
    """Return the (policy_cls, obs_space, action_space, config) tuple for one agent."""
    agent_key = f"agent_{agent_index}"
    return (
        None,
        env.observation_space[agent_key],
        env.action_space[agent_key],
        {},
    )


def _agent_to_policy(agent_id):
    """Map an agent id such as "agent_1" to the policy id at the same index."""
    return policy_ids[int(agent_id.split("_")[1])]


multiagent_config = {
    "policies": {
        policy_ids[agent_index]: _policy_spec(agent_index)
        for agent_index in range(env.n_agents)
    },
    "policy_mapping_fn": _agent_to_policy,
}

# --- Trainer configuration -----------------------------------------------
config = {
    "gamma": 0.95,
    "lr": 0.15,
    "env": "economy",
    "exploration_config": exploration_config,
    "env_config": env_config,
    "horizon": 100,
    "soft_horizon": True,
    "no_done_at_end": True,
    "multiagent": multiagent_config,
    "framework": "torch",
    "num_workers": NUM_CPUS - 1,
    "num_gpus": 0,
    "log_level": "ERROR",
}

# Stop criterion handed to tune.run.
stop = {"info/num_steps_trained": MAX_STEPS}

In [None]:
# Sanity check: print the sum of the `nvec` entries of agent_0's observation space.
# NOTE(review): presumably a MultiDiscrete-style space where `nvec` holds the
# number of categories per dimension — confirm against Economy.
print(np.sum(env.observation_space["agent_0"].nvec))

In [5]:
#Step 3: Experiments

# Name shared by the Tune results directory and the MLflow experiment.
exp_name = "econ_PPO_TESApril8"

# Train with PPO using the configuration from Step 2. `metric` and `mode` tell
# Tune how to rank checkpoints, which `results.best_checkpoint` relies on below.
results = tune.run(
    "PPO",
    name=exp_name,
    config=config,
    checkpoint_freq=250,
    checkpoint_at_end=True,
    stop=stop,
    metric="episode_reward_mean",
    mode="max",
    callbacks=[MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)],
)

# Checkpoint with the highest episode_reward_mean; restored in the evaluation step.
best_checkpoint = results.best_checkpoint
print("Best checkpoint:", best_checkpoint)

Trial name,status,loc
PPO_economy_e4bbe_00000,RUNNING,


[2m[36m(pid=9957)[0m Instructions for updating:
[2m[36m(pid=9957)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9957)[0m 2021-04-08 10:58:37,964	INFO trainer.py:641 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=9954)[0m Instructions for updating:
[2m[36m(pid=9954)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9950)[0m Instructions for updating:
[2m[36m(pid=9950)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9955)[0m Instructions for updating:
[2m[36m(pid=9955)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9952)[0m Instructions for updating:
[2m[36m(pid=9952)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9956)[0m Instructions for updating:
[2m[36m(pid=9956)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9958

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_economy_e4bbe_00000,RUNNING,192.168.1.202:9957,1,21.2742,5200,108.136,110.42,105.488,100


Result for PPO_economy_e4bbe_00000:
  custom_metrics: {}
  date: 2021-04-08_10-59-29
  done: true
  episode_len_mean: 100.0
  episode_reward_max: 111.10751351376987
  episode_reward_mean: 107.93730209795307
  episode_reward_min: 103.02265823373419
  episodes_this_iter: 52
  episodes_total: 104
  experiment_id: b52ec2d5c34f426a83fb5563d0a0440d
  hostname: Matiass-MBP.fios-router.home
  info:
    learner:
      policy_0:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.30000000000000004
        cur_lr: 0.15000000000000002
        entropy: 0.2422735836993485
        entropy_coeff: 0.0
        kl: .inf
        policy_loss: 0.2848011527846499
        total_loss: .inf
        vf_explained_var: 0.0
        vf_loss: 1.8295596129283673
      policy_1:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.30000000000000004
        cur_lr: 0.15000000000000002
        entropy: 0.04782537410099438
        entropy_coeff: 0.0
        kl: .inf
        policy_loss: 0.2662815288072679
        tot

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_economy_e4bbe_00000,RUNNING,192.168.1.202:9957,2,47.2429,10400,107.937,111.108,103.023,100


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_economy_e4bbe_00000,TERMINATED,,2,47.2429,10400,107.937,111.108,103.023,100


Best checkpont: /Users/matiascovarrubias/ray_results/econ_PPO_TESApril8/PPO_economy_e4bbe_00000_0_2021-04-08_10-58-35/checkpoint_2/checkpoint-2


In [None]:
#Step 4: Evaluation

# The checkpoint was written by the "PPO" Tune run in Step 3, so it has to be
# restored into a PPO trainer; a DQN trainer cannot load PPO state.
config["evaluation_config"] = {"explore": False}
trained_trainer = PPOTrainer(config=config)
trained_trainer.restore(best_checkpoint)

price_agent0_list = []
reward_agent0_list = []
price_agent1_list = []
reward_agent1_list = []

# Initial step with fixed actions to obtain the first observation.
# NOTE(review): env.reset() is never called before this step and `env` was
# built with default arguments rather than `env_config` — confirm both are intended.
obs, reward, done, info = env.step({"agent_0": 1, "agent_1": 11})
for i in range(500):
    # "evaluation_config" only applies to the trainer's evaluation workers, so
    # exploration must be switched off explicitly for direct action queries.
    action_agent0 = trained_trainer.compute_action(
        obs["agent_0"], policy_id="policy_0", explore=False
    )
    action_agent1 = trained_trainer.compute_action(
        obs["agent_1"], policy_id="policy_1", explore=False
    )
    obs, reward, done, info = env.step(
        {"agent_0": action_agent0, "agent_1": action_agent1}
    )
    # The per-agent `info` entry is recorded as that agent's price.
    price_agent0_list.append(info["agent_0"])
    reward_agent0_list.append(reward["agent_0"])
    price_agent1_list.append(info["agent_1"])
    reward_agent1_list.append(reward["agent_1"])

plt.ion()

plt.plot(price_agent0_list)
plt.title("Agent 0 price path")
plt.xlabel("Step")
plt.ylabel("Price")
plt.show()
plt.plot(price_agent1_list)
plt.title("Agent 1 price path")
plt.xlabel("Step")
plt.ylabel("Price")
plt.show()

IRresults = {
    "Profits Agent 0": reward_agent0_list,
    "Profits Agent 1": reward_agent1_list,
    "Price Agent 0": price_agent0_list,
    "Price Agent 1": price_agent1_list,
}
df_IR = pd.DataFrame(IRresults)
# NOTE(review): the file name still says "DQN" although the policies were
# trained with PPO; kept unchanged so existing consumers of the CSV keep working.
df_IR.to_csv("collusion_IR_DQN.csv")