## Setup

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
# Append the parent of the current working directory to the import path.
# The `src` and `examples` packages imported further down are presumably
# located there (i.e. the notebook is run from a sub-directory of the
# repository root) -- confirm if the notebook is moved.
sys.path.append(f'{os.getcwd()}/../')

In [None]:
from typing import Union
import gymnasium as gym
from decimal import Decimal

import numpy as np
import pandas as pd
from numpy.random import Generator
from gymnasium.spaces import Box, Discrete
from pettingzoo import AECEnv
import matplotlib

from src.wrapper import RestrictionWrapper
from src.restrictors import Restrictor, RestrictorActionSpace, IntervalUnionActionSpace
from src.restrictions import BucketSpace, IntervalUnionRestriction

from examples.envs.nfg import NFGEnvironment
from examples.utils import play

## Definition of the Cournot Game

In [None]:
# Game parameters. `maximum_price` doubles as the upper bound of every
# observation and action box created below.
maximum_price = 120
cost = 12


def _make_cournot_utility(player):
    """Build the payoff function of `player` in the two-player game.

    The returned callable takes the joint action mapping (keyed by
    'player_0' / 'player_1') and evaluates
    ``-q_own**2 - q_0 * q_1 + (maximum_price - cost) * q_own``.
    `maximum_price` and `cost` are looked up when the payoff is evaluated,
    not when it is built.
    """
    def utility(actions):
        own = actions[player]
        return -own ** 2 - actions['player_0'] * actions['player_1'] + (maximum_price - cost) * own

    return utility


observation_spaces = {
    'player_0': Box(0, maximum_price),
    'player_1': Box(0, maximum_price),
}
action_spaces = {
    'player_0': Box(0, maximum_price),
    'player_1': Box(0, maximum_price),
}
utilities = {
    'player_0': _make_cournot_utility('player_0'),
    'player_1': _make_cournot_utility('player_1'),
}

env = NFGEnvironment(
    observation_spaces,
    action_spaces,
    utilities,
    number_of_steps=100,
    render_mode='human',
)

In [None]:
# Players always choose the best response to the opponent's action

def unrestricted_agent_policy(observation, maximum_price=120, cost=12):
    """Return the best response to the opponent's last action.

    Args:
        observation: Indexable whose first entry is the opponent's last
            action, or ``None`` when there is nothing to respond to yet.
        maximum_price: Upper bound of the action range; together with `cost`
            it also determines the best-response intercept. The default
            matches the game defined in this notebook.
        cost: Cost parameter of the game. The default matches the game
            defined in this notebook.

    Returns:
        A random integer in ``[0, maximum_price]`` if the opponent's action is
        ``None``; otherwise ``(maximum_price - cost) / 2 - opponent_action / 2``
        clipped to ``[0, maximum_price]``.
    """
    opponent_action = observation[0]
    if opponent_action is None:
        # Nothing to respond to: draw an integer from 0..maximum_price
        # (the upper bound of `randint` is exclusive, hence the +1).
        return np.random.randint(0, int(maximum_price) + 1)

    # With the defaults this is 54 - opponent_action / 2, clipped to [0, 120].
    best_response = (maximum_price - cost) / 2 - opponent_action / 2
    return np.clip(best_response, 0, maximum_price)

## Test: Play without restrictions

In [None]:
# Both players follow the same unrestricted best-response rule.
policies = dict.fromkeys(('player_0', 'player_1'), unrestricted_agent_policy)
# Run the unrestricted game on `env`; `max_iter=20` is passed through to `play`.
play(env, policies, max_iter=20)

## Self-learning restrictions

In [None]:
# Players always choose the best response to the opponent's action, given the restriction
class CournotAgent:
    """Player that responds to the opponent's last action within a restriction.

    The unrestricted target is ``(_lambda - opponent_action) / 2``. If the
    restriction does not contain that target, the agent falls back to a
    boundary of one of the neighbouring allowed intervals.
    """

    def __init__(self, _lambda: float) -> None:
        """Store `_lambda`, the numerator constant of the response rule."""
        self._lambda = _lambda

    def act(self, observation):
        """Pick an action for the current step.

        Args:
            observation: Mapping with the keys 'observation' (indexable whose
                first entry is the opponent's last action, or ``None``) and
                'restriction' (object providing `contains`,
                `last_interval_before_or_within` and
                `first_interval_after_or_within`).

        Returns:
            A uniform random draw from ``[0, maximum_price)`` when the
            opponent's action is ``None``; the unrestricted target when the
            restriction contains it; otherwise a boundary of a neighbouring
            allowed interval, as a float.
        """
        state, restriction = observation['observation'], observation['restriction']
        opponent_action = state[0]

        if opponent_action is None:
            return np.random.uniform(0, maximum_price)

        target = (self._lambda - opponent_action) / 2
        if restriction.contains(target):
            return target

        # Both lookups return an (interval, extra) pair; only the interval
        # bounds are used. A missing neighbour shows up as a `None` lower bound.
        (below_lo, below_hi), _ = restriction.last_interval_before_or_within(target)
        (above_lo, above_hi), _ = restriction.first_interval_after_or_within(target)

        if below_lo is None:
            # No allowed interval below the target: take the start of the one above.
            return float(above_lo)
        if above_lo is None:
            # No allowed interval above the target: take the end of the one below.
            return float(below_hi)

        below_hi, above_lo = float(below_hi), float(above_lo)
        # NOTE(review): the distance to the upper boundary is weighted by 2, so
        # the lower boundary wins unless the upper one is at most half as far
        # away -- confirm that this asymmetry is intended.
        if (target - below_hi) < 2 * (above_lo - target):
            return below_hi
        return above_lo

In [None]:
class CournotRestrictor(Restrictor):
    """Restrictor that removes one interval from the action space, once.

    It watches consecutive observations; the first time two of them agree
    (within a tolerance) it estimates the game constant from the observed
    values and removes an interval derived from that estimate. Afterwards the
    restriction is returned unchanged.
    """

    def __init__(self, observation_space, action_space) -> None:
        """Initialise the restrictor with an untouched restriction object."""
        super().__init__(observation_space, action_space)

        # Restriction built over the base space of the restrictor's action
        # space; presumably it allows the whole base space until `remove`
        # is called -- TODO confirm.
        self.restriction = IntervalUnionRestriction(self.action_space.base_space)
        # Last NaN-free observation seen by `act` (None until the first one).
        self.previous_observation = None
        # Becomes True once the single interval has been removed.
        self.has_restricted = False

    def preprocess_observation(self, env: AECEnv):
        """Return the values of `env.state()` as a float array.

        Because of ``dtype=float``, entries such as ``None`` end up as NaN,
        which is what `act` checks for.
        """
        state_values = list(env.state().values())
        return np.array(state_values, dtype=float)

    def act(self, observation: gym.Space) -> RestrictorActionSpace:
        """Return the current restriction, tightening it once play has settled.

        NOTE(review): `observation` is annotated as `gym.Space`, but the body
        treats it as a numeric array (`np.isnan`, `.sum()`) -- presumably the
        output of `preprocess_observation`.

        Observations containing NaN are ignored entirely. Otherwise, if no
        interval has been removed yet and the observation matches the previous
        one (`np.allclose` with ``atol=0.001``), the open-ended estimate
        ``3 / 2 * observation.sum()`` is used to remove the interval
        ``(estimate / 4, estimate / 2)`` from the restriction.
        """
        if np.isnan(observation).any():
            return self.restriction

        settled = (
            not self.has_restricted
            and self.previous_observation is not None
            and np.allclose(observation, self.previous_observation, atol=0.001)
        )
        if settled:
            estimated_lambda = 3 / 2 * observation.sum()
            self.restriction.remove(estimated_lambda / 4, estimated_lambda / 2)
            self.has_restricted = True

        # Remember every NaN-free observation for the next comparison.
        self.previous_observation = observation
        return self.restriction

In [None]:
# Two separate player instances sharing the same constant `maximum_price - cost`.
agents = [CournotAgent(maximum_price - cost) for _ in range(2)]

# The restrictor gets a 2-dimensional observation box and an interval-union
# action space over the players' action range.
restrictor = CournotRestrictor(
    Box(0, maximum_price, shape=(2, )),
    IntervalUnionActionSpace(Box(0, maximum_price)),
)
wrapper = RestrictionWrapper(env, restrictor)

policies = {
    'player_0': agents[0].act,
    'player_1': agents[1].act,
    'restrictor_0': restrictor.act,
}
trajectory = play(
    wrapper,
    policies,
    max_iter=100,
    render_mode=None,
    record_trajectory=True,
)

In [None]:
# One reward curve per agent. `legend=True` labels each curve with its agent so
# the lines can be told apart, and the title lets the figure stand on its own.
# The trailing semicolon suppresses the textual repr of the returned axes in
# the notebook output.
trajectory.groupby('agent')['reward'].plot(legend=True, title='Reward per agent');