In [1]:
import numpy as np
import torch
import gym
from torch import nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
from game.pricing_env import PricingGameEnv
from data.generator import SimpleDemandGenerator
from gym.wrappers import FlattenObservation

In [2]:
def mish(input):
    """Apply the Mish activation element-wise: ``input * tanh(softplus(input))``."""
    softplus_out = F.softplus(input)
    gate = torch.tanh(softplus_out)
    return input * gate

class Mish(nn.Module):
    """Module wrapper around ``mish`` so the activation can be used as a layer."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``mish(input)``."""
        return mish(input)

In [3]:
def t(x):
    """Convert ``x`` to a float32 torch tensor.

    ``x`` is used as-is when it is already a NumPy array; anything else is
    first converted with ``np.array``.
    """
    if isinstance(x, np.ndarray):
        arr = x
    else:
        arr = np.array(x)
    return torch.from_numpy(arr).float()

In [4]:
# Log-std constant. The active Actor code in this notebook does not read it
# (it initialises its own log-std value).
log_std = 3.0

class Actor(nn.Module):
    """Gaussian policy network.

    An MLP with two hidden layers maps a state to the per-action means of a
    ``Normal`` distribution. The standard deviations come from a learnable,
    state-independent ``logstds`` parameter.
    """

    def __init__(self, state_dim, n_actions, activation=nn.Tanh, hidden_size = 256):
        super().__init__()

        self.n_actions = n_actions

        layers = [
            nn.Linear(state_dim, hidden_size),
            activation(),
            nn.Linear(hidden_size, hidden_size),
            activation(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.model = nn.Sequential(*layers)

        # A previous variant (means = sigmoid(model(X)) * 70, stds clamped to
        # [3, 30], log-std initialised from the global ``log_std``) was
        # commented out in favour of the code below.
        # Assigning an nn.Parameter as an attribute registers it under that name.
        self.logstds = nn.Parameter(torch.full((n_actions,), 0.1))

    def forward(self, X):
        """Return the ``Normal`` action distribution for state(s) ``X``."""
        # Keep the standard deviation inside [1e-3, 50].
        scale = self.logstds.exp().clamp(1e-3, 50)
        loc = self.model(X)
        return torch.distributions.Normal(loc, scale)

In [5]:

# def __init__(self, state_dim, hidden_dim, init_w=3e-3):
#         super(ValueNetwork, self).__init__()
        
#         self.linear1 = nn.Linear(state_dim, hidden_dim)
#         self.linear2 = nn.Linear(hidden_dim, hidden_dim)
#         self.linear3 = nn.Linear(hidden_dim, 1)
        
#         self.linear3.weight.data.uniform_(-init_w, init_w)
#         self.linear3.bias.data.uniform_(-init_w, init_w)
        
#     def forward(self, state):
#         x = F.relu(self.linear1(state))
#         x = F.relu(self.linear2(x))
#         x = self.linear3(x)
#         return x
    
def init_weights(layer, init_w=3e-3):
    """Re-initialise an ``nn.Linear`` layer uniformly in ``[-init_w, init_w]``.

    Intended for ``nn.Module.apply``: layers whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        layer: module visited by ``apply``.
        init_w: half-width of the uniform initialisation interval.
    """
    if type(layer) is not nn.Linear:
        return
    nn.init.uniform_(layer.weight, -init_w, init_w)
    # Linear layers built with bias=False have ``bias is None``; skip them
    # instead of crashing inside nn.init.uniform_.
    if layer.bias is not None:
        nn.init.uniform_(layer.bias, -init_w, init_w)
    

## Critic module
class Critic(nn.Module):
    """State-value network: an MLP with two hidden layers and a scalar output.

    Every layer of the model is passed through ``init_weights`` after
    construction.
    """

    def __init__(self, state_dim, activation=nn.ReLU, hidden_size = 512):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_size),
            activation(),
            nn.Linear(hidden_size, hidden_size),
            activation(),
            nn.Linear(hidden_size, 1),
        ]
        self.model = nn.Sequential(*layers)
        self.model.apply(init_weights)

    def forward(self, X):
        """Return the value estimate(s) for state(s) ``X``."""
        value = self.model(X)
        return value

In [6]:
def discounted_rewards(rewards, dones, gamma):
    """Compute discounted returns for a sequence of rewards.

    Walks the sequences backwards, accumulating
    ``reward + gamma * running_return``; the running return is zeroed
    whenever ``done`` is set, so returns do not leak across episodes.

    Returns:
        A list of returns in the same order as ``rewards``.
    """
    running = 0
    returns = []
    for r, d in zip(rewards[::-1], dones[::-1]):
        running = r + running * gamma * (1 - d)
        returns.append(running)
    returns.reverse()
    return returns

In [7]:
def process_memory(memory, gamma=0.99, discount_rewards=True):
    """Unpack a rollout into batched tensors.

    Args:
        memory: iterable of ``(action, reward, state, next_state, done)`` tuples.
        gamma: discount factor used when ``discount_rewards`` is true.
        discount_rewards: when true, the raw rewards are replaced by the
            discounted returns from ``discounted_rewards``.

    Returns:
        ``(actions, rewards, states, next_states, dones)`` as float tensors.
        ``actions``, ``rewards`` and ``dones`` are reshaped to a single
        column with ``view(-1, 1)``.
    """
    actions, rewards, states, next_states, dones = [], [], [], [], []

    for action, reward, state, next_state, done in memory:
        actions.append(action)
        rewards.append(reward)
        states.append(state)
        next_states.append(next_state)
        dones.append(done)

    # The former bootstrap branch was guarded by ``if False`` and referenced
    # an undefined ``last_value``; it could never run and has been removed.
    if discount_rewards:
        rewards = discounted_rewards(rewards, dones, gamma)

    actions = t(actions).view(-1, 1)
    states = t(states)
    next_states = t(next_states)
    rewards = t(rewards).view(-1, 1)
    dones = t(dones).view(-1, 1)
    return actions, rewards, states, next_states, dones

def clip_grad_norm_(module, max_grad_norm):
    """Clip the gradient norm of every parameter held by an optimizer.

    Despite its name, ``module`` must expose ``param_groups`` (as optimizers
    do); the parameters of all groups are gathered and clipped together.
    """
    params = []
    for group in module.param_groups:
        params.extend(group["params"])
    nn.utils.clip_grad_norm_(params, max_grad_norm)

In [8]:
class A2CLearner():
    """Advantage actor-critic learner that updates an actor and a critic from a rollout.

    Training statistics are logged through a module-level ``writer``, which
    must be defined before ``learn`` is called.
    """

    def __init__(self, actor, critic, gamma=0.9, entropy_beta=0,
                 actor_lr=4e-4, critic_lr=4e-3, max_grad_norm=0.5):
        """Store the networks and hyper-parameters and build one Adam optimizer per network.

        Args:
            actor: policy network returning an action distribution.
            critic: value network.
            gamma: discount factor.
            entropy_beta: weight of the entropy bonus in the actor loss.
            actor_lr: learning rate of the actor optimizer.
            critic_lr: learning rate of the critic optimizer.
            max_grad_norm: gradient-norm clipping threshold for both networks.
        """
        self.gamma = gamma
        self.max_grad_norm = max_grad_norm
        self.actor = actor
        self.critic = critic
        self.entropy_beta = entropy_beta
        self.actor_optim = torch.optim.Adam(actor.parameters(), lr=actor_lr)
        self.critic_optim = torch.optim.Adam(critic.parameters(), lr=critic_lr)

    def learn(self, memory, steps, discount_rewards=True):
        """Run one actor update and one critic update on ``memory``.

        Args:
            memory: iterable of ``(action, reward, state, next_state, done)`` tuples.
            steps: global step attached to every logged record.
            discount_rewards: when true, the targets are the discounted
                returns computed by ``process_memory``; otherwise they are
                one-step bootstrapped targets ``r + gamma * V(s') * (1 - done)``.
        """
        actions, rewards, states, next_states, dones = process_memory(memory, self.gamma, discount_rewards)

        # Use the critic owned by this learner, not whatever global ``critic``
        # happens to be bound in the notebook.
        if discount_rewards:
            td_target = rewards
        else:
            # NOTE(review): the bootstrapped target is not detached, so the
            # critic loss also back-propagates through critic(next_states).
            td_target = rewards + self.gamma*self.critic(next_states)*(1-dones)
        value = self.critic(states)
        advantage = td_target - value

        # actor
        norm_dists = self.actor(states)
        # One row of actions per state.
        actions = actions.reshape(states.shape[0], -1)

        logs_probs = norm_dists.log_prob(actions)
        entropy = norm_dists.entropy().mean()

        # The advantage is detached so the actor loss does not train the critic.
        actor_loss = (-logs_probs*advantage.detach()).mean() - entropy*self.entropy_beta
        self.actor_optim.zero_grad()
        actor_loss.backward()

        clip_grad_norm_(self.actor_optim, self.max_grad_norm)
        # Histograms are recorded after clipping and before the optimizer step.
        writer.add_histogram("gradients/actor",
                             torch.cat([p.grad.view(-1) for p in self.actor.parameters()]), global_step=steps)
        writer.add_histogram("parameters/actor",
                             torch.cat([p.data.view(-1) for p in self.actor.parameters()]), global_step=steps)
        self.actor_optim.step()

        # critic
        critic_loss = F.mse_loss(td_target, value)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic_optim, self.max_grad_norm)
        writer.add_histogram("gradients/critic",
                             torch.cat([p.grad.view(-1) for p in self.critic.parameters()]), global_step=steps)
        writer.add_histogram("parameters/critic",
                             torch.cat([p.data.view(-1) for p in self.critic.parameters()]), global_step=steps)
        self.critic_optim.step()

        # reports
        writer.add_scalar("losses/log_probs", -logs_probs.mean(), global_step=steps)
        writer.add_scalar("losses/entropy", entropy, global_step=steps)
        writer.add_scalar("losses/entropy_beta", self.entropy_beta, global_step=steps)
        writer.add_scalar("losses/actor", actor_loss, global_step=steps)
        writer.add_scalar("losses/advantage", advantage.mean(), global_step=steps)
        writer.add_scalar("losses/critic", critic_loss, global_step=steps)

In [9]:
class Runner():
    """Steps an environment with the current policy and records transitions.

    ``run`` samples actions from the module-level ``actor`` and logs episode
    rewards to the module-level ``writer``; both must be defined before
    ``run`` is called with a positive step count.
    """

    def __init__(self, env):
        """Wrap ``env``; the first call to ``run`` resets it because ``done`` starts true."""
        self.env = env
        self.state = None
        self.done = True
        self.steps = 0
        self.episode_reward = 0
        self.episode_rewards = []

    def reset(self):
        """Start a new episode: clear the reward accumulator and reset the environment."""
        self.episode_reward = 0
        self.done = False
        self.state = self.env.reset()

    def run(self, max_steps, memory=None):
        """Collect ``max_steps`` transitions, resetting the environment between episodes.

        Args:
            max_steps: number of environment steps to take.
            memory: optional list to append transitions to; a new list is
                created only when ``None`` is passed.

        Returns:
            The list of ``(actions, reward, state, next_state, done)`` tuples.
            The stored actions are the unclipped samples.
        """
        # ``is None`` (not falsiness) so a caller-supplied empty list is filled in place.
        if memory is None:
            memory = []

        for i in range(max_steps):
            if self.done: self.reset()
            dists = actor(t(self.state))
            actions = dists.sample().detach().data.numpy()
            # Both bounds come from this runner's own environment.
            actions_clipped = np.clip(
                actions,
                self.env.action_space.low.min(),
                self.env.action_space.high.max(),
            )

            next_state, reward, self.done, info = self.env.step(actions_clipped)
            memory.append((actions, reward, self.state, next_state, self.done))

            self.state = next_state
            self.steps += 1
            self.episode_reward += reward

            if self.done:
                self.episode_rewards.append(self.episode_reward)
                # Print a progress line every 50 finished episodes.
                if len(self.episode_rewards) % 50 == 0:
                    print("episode:", len(self.episode_rewards), ", episode reward:", self.episode_reward)
                writer.add_scalar("episode_reward", self.episode_reward, global_step=self.steps)

        return memory

In [14]:
from data.generator import DemandGenerator
from game.state import GameState
from ast import Dict
import gym
from gym import spaces
import numpy as np
class ReducedPricingGameEnv(PricingGameEnv):
    """``PricingGameEnv`` variant that exposes a reduced observation dictionary.

    Only ``cw``, ``sales``, ``stocks``, ``revenues`` and ``discounts`` are part
    of the observation space; the remaining fields are commented out below.

    NOTE(review): assumes the parent class builds its observations through
    ``_get_observation`` and maintains ``current_cw``, ``sales``, ``stocks``,
    ``revenues`` and ``discounts`` — confirm against ``PricingGameEnv``.
    """

    def __init__(
        self,
        demand_generator: DemandGenerator,
        num_products: int,
        num_weeks: int = 52,
        min_price: float = 10.0,
        max_price: float = 200.0,
        min_cogs: float = 0.6,
        max_cogs: float = 0.9,
        max_initial_stock: int = 2000,
        profit_lack_penalty: float = 10.0,
        target_profit_ratio: float = 0.05,
    ):
        """Forward all arguments to ``PricingGameEnv`` and narrow the observation space."""
        # Arguments are forwarded positionally, in the order of this signature.
        super().__init__(
            demand_generator,
            num_products,
            num_weeks,
            min_price,
            max_price,
            min_cogs,
            max_cogs,
            max_initial_stock,
            profit_lack_penalty,
            target_profit_ratio,
        )
        self.observation_space = spaces.Dict(
            {
                "cw": spaces.Box(low=0, high=200, shape=(1,), dtype=np.int32),
                "sales": spaces.Box(low=0, high=np.inf, shape=(self.num_products,), dtype=np.float32),
                # "black_prices": spaces.Box(low=0, high=np.inf, shape=(self.num_products,), dtype=np.float32),
                # "residual_value": spaces.Box(low=0, high=np.inf, shape=(self.num_products,), dtype=np.float32),
                # "article_season_end": spaces.Box(
                #     low=0, high=self.max_initial_stock, shape=(self.num_products,), dtype=np.int32
                # ),
                "stocks": spaces.Box(low=0, high=self.max_initial_stock, shape=(self.num_products,), dtype=np.int32),
                "revenues": spaces.Box(low=0, high=np.inf, shape=(self.num_products,), dtype=np.float32),
                # "profits": spaces.Box(low=-np.inf, high=np.inf, shape=(self.num_products,), dtype=np.float32),
                "discounts": spaces.Box(low=0, high=100, shape=(self.num_products,), dtype=np.float32),
                # "online_status": spaces.Box(low=0, high=1, shape=(self.num_products,), dtype=np.int32),
            }
        )

    # This used to be defined inside ``__init__`` as a local function, so it
    # never took effect as an override; it is now a real method.
    def _get_observation(self) -> dict:
        """Return the latest values of the fields declared in ``observation_space``."""
        observations = {
            "cw": self.current_cw,
            "sales": self.sales[-1],
            # "black_prices": self.black_prices,
            # "residual_value": self.residual_value,
            # "article_season_end": self.article_season_end,
            "stocks": self.stocks[-1],
            "revenues": self.revenues[-1],
            # "profits": self.profits[-1],
            "discounts": self.discounts[-1],
            # "online_status": self.online_status[-1]
        }

        return observations


In [15]:
# Number of products handed to the pricing environment.
n = 30
# NOTE(review): numpy/torch are not seeded in the visible cells; only the
# evaluation cell passes a seed (to env.reset) — confirm whether training
# is meant to be seeded.
seed = 12315
generator = SimpleDemandGenerator()


In [16]:
# Pricing environment with the reduced observation dict flattened into one vector.
env = FlattenObservation(ReducedPricingGameEnv(generator, n))
# Module-level writer: A2CLearner.learn and Runner.run log through this global.
writer = SummaryWriter("runs/mish_activation",purge_step=0)

# config
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
# The actor uses Mish activations, the critic uses ReLU.
actor = Actor(state_dim, n_actions,activation=Mish, hidden_size=128)
critic = Critic(state_dim,activation=nn.ReLU, hidden_size=128)

# Overrides the learner defaults for gradient clipping, learning rates and gamma.
learner = A2CLearner(actor, critic,max_grad_norm=400,actor_lr=4e-3, critic_lr=4e-2, gamma=0.99)
runner = Runner(env)

In [None]:
# Number of environment steps collected before each learner update.
steps_on_memory = 8000
episodes = 100000
# presumably the number of steps in one pricing episode — TODO confirm against the env
episode_length = 53
# Number of collect-then-learn iterations: total step budget divided by the batch size.
total_steps = (episode_length*episodes)//steps_on_memory

for i in range(total_steps):
    # No memory list is passed in, so every iteration learns from a fresh batch.
    memory = runner.run(steps_on_memory)
    learner.learn(memory, runner.steps, discount_rewards=True)

episode: 50 , episode reward: -24433238.960680235
episode: 100 , episode reward: -15943621.307845956
episode: 150 , episode reward: -20389502.511745665
episode: 200 , episode reward: -15904729.367346194
episode: 250 , episode reward: -16466849.883225795
episode: 300 , episode reward: -15327229.048799472
episode: 350 , episode reward: -13868919.470234677
episode: 400 , episode reward: -11857518.593678057
episode: 450 , episode reward: -11968761.41412711
episode: 500 , episode reward: -14597731.188335042
episode: 550 , episode reward: -14359862.480465943
episode: 600 , episode reward: -13693554.854540357
episode: 650 , episode reward: -7504210.543608316
episode: 700 , episode reward: -17247413.093813695
episode: 750 , episode reward: -14392002.246941712
episode: 800 , episode reward: -10631525.639631767
episode: 850 , episode reward: -7956170.701868766
episode: 900 , episode reward: -11144965.711485121
episode: 950 , episode reward: -15490820.02579205
episode: 1000 , episode reward: -107

episode: 8100 , episode reward: -9248023.9359695
episode: 8150 , episode reward: -3496166.39072299
episode: 8200 , episode reward: -9872031.268796831
episode: 8250 , episode reward: -6929603.62350058
episode: 8300 , episode reward: -13860732.356726613
episode: 8350 , episode reward: -5022033.015546044
episode: 8400 , episode reward: -8306800.783045085
episode: 8450 , episode reward: -7665259.192755082
episode: 8500 , episode reward: -7166881.201509306
episode: 8550 , episode reward: -6047876.608419925
episode: 8600 , episode reward: -14512177.322511178
episode: 8650 , episode reward: -5173809.519209393
episode: 8700 , episode reward: -5883569.844243121
episode: 8750 , episode reward: -15701349.170889378
episode: 8800 , episode reward: -10361670.425032979
episode: 8850 , episode reward: -4306108.252008416
episode: 8900 , episode reward: -10185382.884643227
episode: 8950 , episode reward: -12453285.501542632
episode: 9000 , episode reward: -9921918.574267808
episode: 9050 , episode rewar

episode: 16000 , episode reward: -9656725.744529953
episode: 16050 , episode reward: -1968849.0674479078
episode: 16100 , episode reward: -6228282.3971499195
episode: 16150 , episode reward: -943913.618873114
episode: 16200 , episode reward: -3716435.726165846
episode: 16250 , episode reward: -5274298.721924506
episode: 16300 , episode reward: 78094.82387590385
episode: 16350 , episode reward: -6955311.413567347
episode: 16400 , episode reward: -7096286.263290383
episode: 16450 , episode reward: -1965305.4826220386
episode: 16500 , episode reward: -5178971.690951781
episode: 16550 , episode reward: -4403447.452091985
episode: 16600 , episode reward: -10052553.197239986
episode: 16650 , episode reward: -7856922.384434719
episode: 16700 , episode reward: -7068648.290231143
episode: 16750 , episode reward: -5184537.605912433
episode: 16800 , episode reward: -8194325.148238135
episode: 16850 , episode reward: -3856750.4765754207
episode: 16900 , episode reward: -3948430.6855367813
episode:

episode: 23900 , episode reward: -3255762.643772308
episode: 23950 , episode reward: 1003553.891619347
episode: 24000 , episode reward: 1187399.029482032
episode: 24050 , episode reward: 3766928.119181199
episode: 24100 , episode reward: 4338247.82322317
episode: 24150 , episode reward: 1761587.303926139
episode: 24200 , episode reward: -5078421.675841009
episode: 24250 , episode reward: -3520490.801459039
episode: 24300 , episode reward: -3824053.03169349
episode: 24350 , episode reward: 1925501.8438286695
episode: 24400 , episode reward: -4320344.84312094
episode: 24450 , episode reward: -9406747.85279297
episode: 24500 , episode reward: -910578.1940548634
episode: 24550 , episode reward: 3483073.478099292
episode: 24600 , episode reward: -3191728.819245612
episode: 24650 , episode reward: -676012.0317678486
episode: 24700 , episode reward: 2632010.994993346
episode: 24750 , episode reward: -5493498.991890058
episode: 24800 , episode reward: -105745.50478827604
episode: 24850 , episo

In [None]:
# evaluate
# Roll out one episode with the trained actor, printing actions, critic values
# and rewards at every step, then print the total reward.
seed = 12315
state = None
reward = 0
done = False
state = env.reset(seed=seed)
total_reward = 0
while not done:
    dists = actor(t(state))
    value = critic(t(state))
    # Actions are sampled from the policy distribution (not its mean), so the
    # rollout is stochastic.
    actions = dists.sample().detach().data.numpy()
    actions_clipped = np.clip(actions, env.action_space.low.min(), env.action_space.high.max())
    print(actions, actions_clipped)
    
    state, reward, done, info = env.step(actions_clipped)
    print(f"Value {value}, reward {reward}")

    total_reward+=reward
print(total_reward)

In [None]:
# Render the environment; `extended` is presumably an option of the pricing
# environment's render method — TODO confirm.
env.render(extended=True)

In [None]:
# Display the most recent observation held in `state`.
state

In [None]:
#### Test: run the same A2C setup on Pendulum-v1

In [None]:
# Rebinds the global env/writer/actor/critic/learner/runner for a Pendulum-v1 run.
env = gym.make("Pendulum-v1")
# NOTE(review): this reuses the log directory of the pricing run
# ("runs/mish_activation") — confirm whether the two runs should be logged separately.
writer = SummaryWriter("runs/mish_activation",purge_step=0)

# config
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
# Both networks use Mish activations here.
actor = Actor(state_dim, n_actions, activation=Mish, hidden_size = 64)
critic = Critic(state_dim, activation=Mish, hidden_size = 64)

# The learner is built with its default hyper-parameters.
learner = A2CLearner(actor, critic)
runner = Runner(env)

In [None]:
# Number of environment steps collected before each learner update.
steps_on_memory = 16
episodes = 800
# presumably the step limit of one Pendulum-v1 episode — TODO confirm
episode_length = 200
# Number of collect-then-learn iterations: total step budget divided by the batch size.
total_steps = (episode_length*episodes)//steps_on_memory

for i in range(total_steps):
    memory = runner.run(steps_on_memory)
    # discount_rewards=False selects the one-step bootstrapped targets in A2CLearner.learn.
    learner.learn(memory, runner.steps, discount_rewards=False)