In [1]:
import mesa
import seaborn as sns
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
import statistics

In [None]:
# Development of an agent-based simulation model in combination with reinforcement learning in Python using Mesa library



# - At the beginning of an episode, 10 plants (as agents) are planted

# - Plants must grow for 10 days (steps) before they can be harvested.

# - Each plant has a 10% chance of dying every day.

# - A new (fresh) plant can be bought every day (cost $10) to be planted

# - The aim is to harvest 10 plants that each grew for 10 days. When the goal is reached, there is a reward of $20 per plant harvested and the episode ends

# - Each day of the episode costs $5

# - Reinforcement learning is now used to find a strategy when to plant new trees that minimizes total costs.


# UPDATES

# Just 2 things:



# 1) May I ask you to include the possibility to run the simulation environment without the RL optimization. Following variables should be possible to be defined:
# - a daily probability for the decision variable to buy or not to buy.
# - number of episodes, the simulation will run
# At the end of each episode, I'd like to see a table with the days (horizontal) and the plants (vertical) showing the different states of each plant on each day.
# At the end of the entire simulation, I'd like to see a table showing important values of each episode, such as:
# - the total costs
# - the number of days needed



# 2) May I also ask you to precisely comment the code, in order I can follow and understand it?

In [94]:
class PlantAgent(mesa.Agent):

    def __init__(self, unique_id, model):
        super().__init__(unique_id, model)

        self.alive = True
        self.age = 0
        self.harvested = False

    def step(self):
        if self.alive and self.age != 10:
            self.age += 1

        if self.age == 10:
            self.harvested = True


        if not self.alive or self.harvested:
            return


        death_outcome = random.choices([False, True], weights=[0.1, 0.9], k=1)[0]
        self.alive = death_outcome


In [416]:
class PlantModel(mesa.Model):

    def __init__(self, N, render_mode="not human"):
        super().__init__()

        self.num_agents = N
        self.number_of_days = 0
        self.number_of_plants_harvested = 0

        self.complete_data = []
        self.total_cost = 0
        self.render_mode = render_mode

        self.schedule = mesa.time.RandomActivation(self)

        for i in range(self.num_agents):
            a = PlantAgent(i, self)
            self.schedule.add(a)

    def get_state(self):
        agents = [a for a in self.schedule.agents]
        state = []

        number_of_plants_harvested = 0

        for a in agents:

            if a.alive and not a.harvested:
                state.append(a.age)

            if a.harvested:
                number_of_plants_harvested += 1

        return number_of_plants_harvested, tuple(sorted(state))

    def create_plant_data_string(self, plant_data):
        plant_data_string = ""

        if plant_data[1] == True:
            plant_data_string += "Alive"
        else:
            plant_data_string += "Dead"

        plant_data_string += "   Age:"
        plant_data_string += str(plant_data[2])
        plant_data_string += "   "

        if plant_data[3] == True:
            plant_data_string += "Harvested"
        else:
            plant_data_string += "Not Harvested"

        return plant_data_string

    def get_agent_data(self):

        agents = [a for a in self.schedule.agents]

        agent_data = []

        for a in agents:

            agent_data.append((a.unique_id, a.alive, a.age, a.harvested))

        sorted_agent_data = sorted(agent_data, key=lambda x: x[0])

        return sorted_agent_data

    def prepare_data(self):
        agent_data = self.get_agent_data()

        data = [self.create_plant_data_string(d) for d in agent_data]
        return data

    def transform_complete_data(self):
        columns = ["Day " + str(i+1) for i in range(self.number_of_days)]


        df = pd.DataFrame(np.array(self.complete_data).T, columns=columns)
        return df

    def check_terminated(self, observation):
        return list(map(lambda x: x[1] == 10, observation)).count(True)

    def step(self, action):


        self.number_of_days += 1
        terminated = 0
        truncated = False
        reward = -5
        info = []

        if action == 1:
            self.num_agents += 1
            a = PlantAgent(self.num_agents - 1, self)
            self.schedule.add(a)

            reward -= 10

            if self.render_mode == "human":
                for d in self.complete_data:
                    d.append("Not Available")

        self.schedule.step()

        observation = self.get_state()

        difference = observation[0] - self.number_of_plants_harvested
        reward += difference * 20

        self.number_of_plants_harvested = observation[0]

        if self.number_of_days == 500:
            truncated = True

        if self.number_of_plants_harvested == 10:
            terminated = 1

        if self.render_mode == "human":

            current_data = self.prepare_data()
            self.complete_data.append(current_data)

            info = self.transform_complete_data()

            self.total_cost +=  -1* reward

        return observation, reward, terminated, truncated, info

In [431]:
model = PlantModel(10, render_mode="human")

In [370]:
def probabilistic_action():
    BUY_PROB = 0.9
    NOT_BUY_PROB = 0.1
    action = random.choices([0, 1], weights=[NOT_BUY_PROB, BUY_PROB], k=1)[0]
    return action

In [451]:
action = probabilistic_action()
observation, reward, terminated, truncated, info = model.step(action)

if terminated or truncated:
    print("Done")
    print("Total Cost is ", model.total_cost)
    print("Total number of days is ", model.number_of_days)

Done
Total Cost is  100
Total number of days is  20


Thera are a few things to consider

Terminal State: This would be achieved when we have 10 plants harvested.  
Truncation: The environment would truncate in 500 steps.  
Reward: Positive for harvesting and negative for buying a plant and on every step.  
Observation: This would be the current state. The state would be the number of plants harvested and the age of each plant.  
Info: This is the complete data. It includes the number of plants on the vertical and the number of days on the horizontal.  

In [4]:
# DO NOT RUN THIS - THIS is JUST TO ESTIMATE THE STATE SPACE

def get_state_space(number_of_iterations = 100000):

    state_space = set()

    truncated = 0
    model = PlantModel(10)
    initial_state = model.get_state()
    state_space.add(initial_state)


    for _ in tqdm(range(number_of_iterations)):

        model = PlantModel(10)
        observation = []
        terminated = 0



        while not terminated and not truncated:

            action = random.randint(0, 1)
            observation, _, terminated, truncated = model.step(action)
            state_space.add(observation)

    state_space = list(state_space)
    state_space = sorted(state_space, key=lambda x: (x[0], x[1]))

    return state_space

In [18]:
# DO NOT RUN
# state_space = get_state_space(10)

In [19]:
# DO NOT RUN THIS - THIS IS JUST TO STORE THE STATE SPACE


# with open("state_space.pkl", "wb") as file:
#     pickle.dump(state_space, file)

In [5]:
with open("state_space.pkl", "rb") as file:
    state_space = pickle.load(file)

In [409]:
def initialize_Q_Table(state_space):
    Q_Table = {}

    for s in state_space:
        Q_Table[s] = [0, 0]

    return Q_Table

In [410]:
Q_Table = initialize_Q_Table(state_space)

In [411]:
def generate_trajectory(policy):

    trajectory = []

    state = (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
    terminated = 0
    truncated = 0
    next_state = (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))

    model = PlantModel(10)

    while not terminated and not truncated:
        state = next_state
        action = policy[state]

        observation, reward, terminated, truncated, _ = model.step(action)

        next_state = observation
        experience_tuple = (state, action, reward, next_state)

        trajectory.append(experience_tuple)

    return trajectory

In [482]:
class QLearning:
    def __init__(self, Q_Table, gamma=1, number_of_iterations=100000):
        self.number_of_iterations = number_of_iterations

        self.Q = Q_Table

        self.ε = self.get_parameters_exponential_decay(decay_rate=0.999999)
        self.α = self.get_parameters_exponential_decay(decay_rate=0.999999)

        self.trajectories = [[]]

    def epsilon_greedy_exponential(self, iteration, s):
        ε = self.ε[iteration]

        a = 0

        if s not in self.Q:
            self.Q[s] = [0, 0]

        if np.random.random() > ε:
            a = np.argmax(self.Q[s])
        else:
            a = np.random.randint(len(self.Q[s]))

        return a

    def get_parameters_exponential_decay(
        self, initial_value=1, min_value=0.01, decay_rate=0.99
    ):
        num_points = self.number_of_iterations

        exponential_decay_parameters = initial_value * (
            decay_rate ** np.arange(num_points)
        )
        exponential_decay_parameters = np.where(
            exponential_decay_parameters < min_value,
            min_value,
            exponential_decay_parameters,
        )

        return exponential_decay_parameters

    def do_one_qlearning_iteration(self, iteration, γ=0.99):

        s = (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
        terminated = 0
        truncated = 0
        s_prime = (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0))

        trajectory = []

        model = PlantModel(10)
        while not terminated and not truncated:
            a = self.epsilon_greedy_exponential(iteration, s)
            s_prime, R, terminated, truncated, _ = model.step(a)

            if s_prime not in self.Q:
                self.Q[s_prime] = [0, 0]

            self.Q[s][a] = self.Q[s][a] + self.α[iteration] * (
                R + γ * max(self.Q[s_prime]) - self.Q[s][a]
            )

            trajectory.append((s, a, R, s_prime))

            s = s_prime

        return trajectory

    def do_qlearning(self):
        for i in tqdm(range(2908024, self.number_of_iterations)):
            trajectory = self.do_one_qlearning_iteration(i)
            self.trajectories.append(trajectory)

In [483]:
QL = QLearning(Q_Table, number_of_iterations=5000000)

In [489]:
2259264 + 648760

2908024

In [485]:
QL.do_qlearning()

 24%|██▎       | 648760/2740736 [2:09:35<2:07:31, 273.40it/s]  Exception ignored in: <generator object tqdm.__iter__ at 0x0000020BF55962B0>
Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\tqdm\std.py", line 1197, in __iter__
    self.close()
  File "c:\Python311\Lib\site-packages\tqdm\std.py", line 1291, in close
    fp_write('')
  File "c:\Python311\Lib\site-packages\tqdm\std.py", line 1288, in fp_write
    self.fp.write(str(s))
  File "c:\Python311\Lib\site-packages\tqdm\utils.py", line 195, in inner
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MuhammadMahenMughal\AppData\Roaming\Python\Python311\site-packages\ipykernel\iostream.py", line 648, in write
    self._schedule_flush()
  File "C:\Users\MuhammadMahenMughal\AppData\Roaming\Python\Python311\site-packages\ipykernel\iostream.py", line 545, in _schedule_flush
    self.pub_thread.schedule(_schedule_in_thread)
  File "C:\Users\MuhammadMahenMughal\AppData\Roaming\Python

KeyboardInterrupt: 

In [486]:
Q_Table = QL.Q

In [487]:
Q_Table

{(0, ()): [-245.71482983775334, -242.9745475179352],
 (0, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)): [-117.03617003244456,
  -113.6425710957123],
 (0, (1,)): [-232.79287439648792, -228.76114515884774],
 (0, (1, 1, 1, 1)): [-188.31349309056185, -186.8459798020303],
 (0, (1, 1, 1, 1, 1)): [-178.29506727455473, -169.43860187566477],
 (0, (1, 1, 1, 1, 1, 1)): [-164.75310544896018, -156.88082326463535],
 (0, (1, 1, 1, 1, 1, 1, 1)): [-145.96252394506064, -138.34575239948933],
 (0, (1, 1, 1, 1, 1, 1, 1, 1)): [-133.7046583716885, -121.94206618138641],
 (0, (1, 1, 1, 1, 1, 1, 1, 1, 1)): [-117.25071599504007, -113.81263191128804],
 (0, (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)): [-97.97940058744095, -95.01597526131921],
 (0, (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)): [-88.88665889462081,
  -79.85652026027701],
 (0, (1, 2)): [-219.52412454246797, -213.2999258325612],
 (0, (1, 2, 2)): [-184.48973227095945, -184.59858665844453],
 (0, (1, 2, 2, 2)): [-186.14620541336, -181.53489243179644],
 (0, (1, 2, 2, 2, 2)): [-172.69075503

In [488]:
# DO NOT RUN THIS CELL
with open("Q_Table2.pkl", "wb") as file:
    pickle.dump(Q_Table, file)

In [10]:
with open("Q_Table.pkl", "rb") as file:
    loaded_Q_Table = pickle.load(file)

In [11]:
def get_policy(Q_Table):

    policy = {}

    for state in Q_Table.keys():
        optimal_action = np.argmax(loaded_Q_Table[state])
        policy[state] = optimal_action

    return policy

In [12]:
policy = get_policy(loaded_Q_Table)

In [24]:
# # DO NOT RUN THIS CELL
# with open("Policy.pkl", "wb") as file:
#     pickle.dump(policy, file)

In [25]:
with open("Policy.pkl", "rb") as file:
    policy = pickle.load(file)

In [5]:
def get_sum_of_reward_from_multiple_trajectories(policy, number_of_trajectories=1000):

    sum_of_rewards = []

    for _ in tqdm(range(number_of_trajectories)):

        trajectory = generate_trajectory(policy)

        rewards = list(map(lambda x: x[2], trajectory))
        sum_of_rewards.append(sum(rewards))

    return sum_of_rewards

In [21]:
def get_statistics_of_total_reward(policy):
    sum_of_rewards = get_sum_of_reward_from_multiple_trajectories(policy, 1000)
    mean_reward = statistics.mean(sum_of_rewards)
    std_dev = statistics.stdev(sum_of_rewards)

    return mean_reward, std_dev

In [23]:
get_statistics_of_total_reward(policy)

100%|██████████| 1000/1000 [00:02<00:00, 447.43it/s]


(-109.715, 107.80823897732209)