In [None]:
%tensorflow_version 1.x
!pip install stable-baselines[mpi]==2.10.0
!pip install keras-rl2

from gym import Env
from gym.spaces import Discrete,Box
import numpy as np
from typing import Optional
import random
import matplotlib.pyplot as plt

# model parameters
# Module-level constants read by FarmEnv (defined below in this cell).

# Cap on the total head-count after hiring, and the largest value each
# hire/fire component of an action can take.
MAX_ALLOWED_WORKER = 10
# Budget available at the start of every episode; also normalises the cost
# term of the per-step reward.
BUDGET = 200
# Number of plants at the start of every episode; also normalises the
# pruned-plants term of the reward.
PLANTS = 50

# Wage charged per employed worker per step, by skill level
# (beginner / intermediate / advanced).
WAGE_BEG = 15
WAGE_INT = 20
WAGE_ADV = 30

# Mean of the normal distribution (std 1, rounded) from which the plants
# pruned per worker per step are drawn, by skill level.
PRODUCTIVITY_BEG = 5
PRODUCTIVITY_INT = 7
PRODUCTIVITY_ADV = 10 

# One-off cost per worker hired / fired in a step.
HIRE_COST = 20
FIRE_COST = 25
# Initial value of the environment's `prune_len` counter, which is decremented
# after each accepted, non-terminal step.
PRUNE_LENGTH = 10


# NOTE(review): the four constants below are not referenced anywhere in the
# visible notebook -- confirm whether they are still needed.
PRUNE_PROFIT = 3
WORKER_AVAILABILITY_BEG = 10
WORKER_AVAILABILITY_INT =  8
WORKER_AVAILABILITY_ADV =  7


# Number of discrete actions: (MAX_ALLOWED_WORKER + 1) ** 6, i.e. one
# hire count and one fire count (0..MAX_ALLOWED_WORKER) per skill level.
action_size = ((MAX_ALLOWED_WORKER + 1) * (MAX_ALLOWED_WORKER + 1)) ** 3

# Reward weights: alpha scales the pruned-plants term, beta scales the cost
# term and the penalty; M scales the penalty (-M * beta) and the goal bonus.
alpha = 0.5
beta = 0.5
M = 5

class FarmEnv(Env):
    """Gym environment for staffing a pruning season under a fixed budget.

    Observation (length-5 float32 vector):
        [beginner workers, intermediate workers, advanced workers,
         out-of-budget flag (0: False, 1: True), remaining plants]

    Action: one integer in ``Discrete((MAX_ALLOWED_WORKER + 1) ** 6)`` that is
    decoded in C order into
        (hire_beg, fire_beg, hire_int, fire_int, hire_adv, fire_adv),
    each component ranging over ``0..MAX_ALLOWED_WORKER``.

    Args:
        verbose: when True, print a status message for the notable branches of
            ``step``. Defaults to False so that training does not flood the
            notebook output.
    """

    def __init__(self, verbose=False):
        super().__init__()
        self.verbose = verbose

        levels = MAX_ALLOWED_WORKER + 1
        # Shape used to decode a flat action index into six per-skill counts.
        self._action_shape = (levels,) * 6
        self.action_space = Discrete(levels ** 6)
        self.observation_space = Box(
            low=np.zeros(5, dtype=np.float32),
            high=np.array(
                [MAX_ALLOWED_WORKER, MAX_ALLOWED_WORKER, MAX_ALLOWED_WORKER, 1, PLANTS],
                dtype=np.float32,
            ),
            dtype=np.float32,
        )

        self.state = self._make_obs(0, 0, 0, 0, PLANTS)
        self.prune_len = PRUNE_LENGTH
        self.budget = BUDGET

    @staticmethod
    def _make_obs(n_beg, n_int, n_adv, out_of_budget, plants_left):
        """Pack the state components into an ndarray matching the Box dtype."""
        return np.array(
            [n_beg, n_int, n_adv, out_of_budget, plants_left], dtype=np.float32
        )

    def _log(self, message):
        """Print ``message`` only when the environment was created verbose."""
        if self.verbose:
            print(message)

    def step(self, action):
        """Apply one hire/fire decision and simulate one pruning step.

        Args:
            action: flat index into the action space (see class docstring).

        Returns:
            ``(observation, reward, done, info)``. Rejected actions (too many
            hires, or firing more workers than employed) leave the state
            untouched and return the penalty ``-M * beta`` with ``done=False``.
            Exceeding the budget returns the same penalty with ``done=True``.
        """
        done = False
        info = {}
        penalty = -M * beta

        # np.unravel_index gives the same C-order decoding as indexing into
        # tuple(np.ndindex(shape)) but without materialising every one of the
        # (MAX_ALLOWED_WORKER + 1) ** 6 tuples on each call.
        h_b_t, f_b_t, h_i_t, f_i_t, h_a_t, f_a_t = (
            int(v) for v in np.unravel_index(int(action), self._action_shape)
        )

        m_b_t_1, m_i_t_1, m_a_t_1 = (int(v) for v in self.state[:3])
        p_t = int(self.state[4])

        hired = h_b_t + h_i_t + h_a_t
        fired = f_b_t + f_i_t + f_a_t

        # The hiring cap is checked against the previous head-count plus the
        # hires only; workers fired in the same step do not free capacity.
        if (m_b_t_1 + m_i_t_1 + m_a_t_1) + hired > MAX_ALLOWED_WORKER:
            return self.state, penalty, done, info

        if m_b_t_1 < f_b_t or m_i_t_1 < f_i_t or m_a_t_1 < f_a_t:
            return self.state, penalty, done, info

        m_b_t = m_b_t_1 + h_b_t - f_b_t
        m_i_t = m_i_t_1 + h_i_t - f_i_t
        m_a_t = m_a_t_1 + h_a_t - f_a_t

        c_hire = HIRE_COST * hired
        c_fire = FIRE_COST * fired
        c_wage = (WAGE_BEG * m_b_t) + (WAGE_INT * m_i_t) + (WAGE_ADV * m_a_t)

        # One productivity draw per skill level, shared by all workers of that
        # level (draw order: beginner, intermediate, advanced).
        pl_b_t = m_b_t * int(round(np.random.normal(PRODUCTIVITY_BEG, 1)))
        pl_i_t = m_i_t * int(round(np.random.normal(PRODUCTIVITY_INT, 1)))
        pl_a_t = m_a_t * int(round(np.random.normal(PRODUCTIVITY_ADV, 1)))
        pl_t = pl_b_t + pl_i_t + pl_a_t

        if pl_t > p_t:
            # Capacity exceeds the plants that are left: only p_t plants are
            # pruned and wages are pro-rated by the fraction of capacity used.
            # pl_t > p_t >= 0 guarantees the divisor is positive.
            c_t = c_hire + c_fire + c_wage * (p_t / pl_t)

            if c_t > self.budget:
                self._log('I pruned more than available but exceeded the budget!')
                return self.state, penalty, True, info

            self.state = self._make_obs(m_b_t, m_i_t, m_a_t, 0, 0)
            self.budget = self.budget - c_t
            self._log('********************************** I reached the goal ************************************************')
            # Every plant is now pruned, so the pruned fraction is 1.
            r_t = alpha * ((PLANTS - p_t + p_t) / PLANTS) * M * 1000
            return self.state, r_t, True, info

        c_t = c_hire + c_fire + c_wage

        # NOTE(review): the out-of-budget flag (state[3]) is never raised; the
        # previous state is returned unchanged when the budget is exceeded.
        if c_t > self.budget:
            return self.state, penalty, True, info

        # Carry the unpruned plants forward instead of resetting them to 0, so
        # the next step compares capacity against what is actually left.
        remaining = p_t - pl_t

        if self.prune_len <= 0:
            self._log('I reached end of season! ')
            r_t = alpha * ((PLANTS - p_t + pl_t) / PLANTS)
            self.state = self._make_obs(m_b_t, m_i_t, m_a_t, 0, remaining)
            self.budget = self.budget - c_t
            return self.state, r_t, True, info

        r_t = (alpha * pl_t / PLANTS) - (beta * c_t / BUDGET)
        self._log('I am still making decision')
        self.state = self._make_obs(m_b_t, m_i_t, m_a_t, 0, remaining)
        # NOTE(review): only accepted steps consume season time, so a policy
        # that keeps repeating a rejected action never ends the episode.
        self.prune_len -= 1
        self.budget = self.budget - c_t

        return self.state, r_t, done, info

    def render(self, mode='human'):
        """No visualisation is implemented."""
        pass

    def reset(self):
        """Start a new episode: no workers, full plant count, full budget."""
        self.state = self._make_obs(0, 0, 0, 0, PLANTS)
        self.prune_len = PRUNE_LENGTH
        self.budget = BUDGET
        return self.state


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Instantiate the environment and look at the sizes of its spaces.
env = FarmEnv()
# Number of discrete actions the agent can choose from.
actions = env.action_space.n
# Shape of the observation vector.
states = env.observation_space.shape
# Draw one random action index as a quick smoke test; the bare name on the
# last line displays it as the cell output.
random_action = env.action_space.sample()
random_action



133114

In [None]:
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines import results_plotter
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the model whenever the mean training reward reaches a new best.

    Every ``check_freq`` calls, the results found in ``log_dir`` are loaded
    and the mean reward over the last 100 episodes is compared with the best
    value seen so far.

    Args:
        check_freq: number of callback calls between two checks.
        log_dir: directory the training results are read from; the model is
            saved under ``<log_dir>/best_model``.
        verbose: values above 0 print progress messages.
    """

    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        """Make sure the save location exists."""
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Check the training reward on schedule; always lets training go on."""
        if self.n_calls % self.check_freq != 0:
            return True

        timesteps, episode_rewards = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(timesteps) == 0:
            # No results logged yet.
            return True

        # Mean training reward over the last 100 episodes.
        mean_reward = np.mean(episode_rewards[-100:])
        chatty = self.verbose > 0
        if chatty:
            print("Num timesteps: {}".format(self.num_timesteps))
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

        if mean_reward > self.best_mean_reward:
            # New best: remember it and save the model.
            self.best_mean_reward = mean_reward
            if chatty:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        return True

In [None]:
from stable_baselines.common.env_checker import check_env
from stable_baselines import DQN, PPO2, A2C, ACKTR
import os
from stable_baselines.bench import Monitor

# Make sure the environment follows the Gym interface. check_env reports
# problems through warnings/assertions and returns None, so its result is not
# printed.
check_env(env, warn=True)

# Track rewards: Monitor logs training results into log_dir, which the
# save-on-best callback reads back during training.
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)
env = Monitor(env, log_dir)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

###
###
###
###
###
###
###
###
###
###
###
None


In [None]:
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
# Total number of environment steps to train for.
time_steps = 30000
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=1.4e+03)
eval_callback = EvalCallback(env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=500, deterministic=True, render=False, callback_on_new_best=callback_on_best)
model = DQN('MlpPolicy', env, exploration_fraction=0.5, exploration_final_eps=0.02, exploration_initial_eps=1.0,verbose=1, tensorboard_log="./dqn_farm/")

# Attach the save-on-best callback created with the Monitor wrapper; without
# the `callback` argument nothing is checkpointed during training.
# NOTE(review): eval_callback is deliberately left unattached because it
# evaluates on the same env object that is being trained on, which presumably
# disturbs the running training episode -- give it a separate FarmEnv instance
# before adding it to the callback list.
model.learn(total_timesteps=time_steps, callback=callback)


###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
I pruned more than available but exceeded the budget!

###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
###
#