In [2]:
import numpy as np


In [3]:
class Agent:
    def __init__(self, agent_name, agent_index):
        self.index = agent_index
        self.agent = agent_name
        self.health = None
        self.isHit = False
        self.move = True
        self.movement_speed = 1.00
        self.previous_position = np.array([0, 0], dtype=np.float32)
        self.current_position = None
        self.same_position = False
        self.current_step = 0
        self.action = None
        pass

    def agent_action(self, action):

        pass

    def agent_reset(self, width, height):
        padding = 30
        self.current_position = np.array(
            [np.random.uniform(30, width - padding), np.random.uniform(30, height - padding)], dtype=np.float32)

    def step_update(self, action, range_x, range_y):

        if action == 0:
            self.current_position[0] -= self.movement_speed
        elif action == 1:
            self.current_position[0] += self.movement_speed
        elif action == 2:
            self.current_position[1] -= self.movement_speed
        elif action == 3:
            self.current_position[1] += self.movement_speed
        
        self.current_position[0] = np.clip(self.current_position[0], 0, range_x)
        self.current_position[1] = np.clip(self.current_position[1], 0, range_y)

In [4]:
from gymnasium.spaces import Discrete, Box, MultiDiscrete
from gymnasium import Env
import numpy as np
import pygame
import time

In [15]:
env.close()

In [41]:
start = time.time()
time.sleep(1)
end = time.time()

print(end - start)

1.0012321472167969


In [13]:
class GameEnv(Env):
    def __init__(self, screen_width=400, screen_height=400, render_mode='human'):
        super(GameEnv, self).__init__()

        # defining the screen dimension for render purpose
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.render_mode = render_mode

        # defining the observation and action spaces for all the agents
        
        self.observation_space = Box(low=np.array([0, 0, 0, 0], dtype=np.float32),
                                    high=np.array([self.screen_width, self.screen_height, self.screen_width, self.screen_height], dtype=np.float32),
                                    dtype=np.float32)

        # the pygame window should be initialized in the render function

        # setting the total number of agent
        
        self.number_of_prey = 1
        self.number_of_predator = 1
        self.prey_agent = None
        self.predator_agent = None
        self.predator_i_position = None
        self.initial_distance = None
        self.current_distance = None
        self.predator_total_reward = 0
        self.number_of_agents = self.number_of_prey + self.number_of_prey

        # defining the action space based on total number of predator and prey
        self.action_space = Discrete(4)


        # setting the total number of obstacles
        self.total_obstacles = None

        # keeping a counter to save the total steps
        self.total_steps = 0

        # initializing the pygame
        pygame.init()

        # setting the screen size
        self.screen = pygame.display.set_mode((self.screen_width, self.screen_height))
        pygame.display.set_caption('Multi Agent Environment(simple)')
        
        # keep the track of time of the rendering
        self.clock = pygame.time.Clock()

        # *it is set in milisec format (time is sec is time/1000)
        self.total_running_time = 10

        # start the tick timer
        self.start_time = 0
        # print(f'start  time: {self.start_time}')

        # initializing the font
        pygame.font.init()
        self.font = pygame.font.Font(None, 18)

    # this function rerturns the value of the action into 2 digits 
    # if the action_space.sample() gives 1 digit number
    # * if  the number is 3 it will return 03 
    # * if  the number is 14 then it will return 14
    def expand_action_digit(self, action):

        # this basically checks the number if it has 1 then fills the rest with 0
        # if the number is 2 digits then it stays the same
        action = str(action).zfill(2)
        prey_action = int(action[0]) % 4
        predator_action = int(action[1]) % 4
        return prey_action, predator_action
        

    # this method will initialize the number of agents
    # ! this must be called from outside
    def agent_init(self):

        prey_agents = Agent('prey', 0)

        predator_agents = Agent('predator', 0)

        self.prey_agent = prey_agents
        self.predator_agent = predator_agents
        

    # this function is used to explicitly set the number of agents
    # ! this needs to be called from outside
    def set_agent_number(self, prey_number, predator_number):
        self.number_of_predator = predator_number
        self.number_of_prey = prey_number

    # the usual reset function
    def reset(self, seed=0):
        self.start_time = time.time()
        
        self.agent_init()

        self.total_steps = 0
        self.predator_total_reward = 0

        prey = self.prey_agent
        predator = self.predator_agent

        # for prey in self.prey_agents:
        prey.agent_reset(width=self.screen_width, height=self.screen_height)
        # observation.append([prey.index, prey.agent, prey.current_position])

        # for predator in self.predator_agents:
        predator.agent_reset(width=self.screen_width, height=self.screen_height)
        # observation.append([predator.index, predator.agent, predator.current_position])
        
        # setting the predator and prey to their initial position
        self.prey_agent = prey
        self.predator_agent = predator

        # setting the initial position of predator for reward
        self.predator_i_position = self.predator_agent.current_position
        
        # calculating the initial distance of 2 agents
        direction = self.predator_agent.current_position - self.prey_agent.current_position
        self.initial_distance = np.linalg.norm(direction)

        # observation :
        # all the variable values inside the obsercation space needs to be sent inside the observation variable
        observation = np.concatenate([self.prey_agent.current_position, self.predator_agent.current_position])
        return observation, seed

    # the step function
    # this function is called for every timesteps
    # this function updates the actions or states of agents in the env
    # this function is called default by the algorithms of all sorts
    # * it returns observation, reward, done, truncated, info
    # * any game policy change can be done here
    # * reward must be set here
    def step(self, action):
        
        # initializing the return variables
        done = False
        reward = 0
        truncated = False
        info = {}
        current_time = time.time()
        # print(f'current time: {current_time}')
        # when ever the step is starting set the start time
        # if self.total_steps == 0:
        #     self.start_time = pygame.time.get_ticks()
        # else:
        #     current_time = pygame.time.get_ticks()

        elapsed_time = current_time - self.start_time
        # print(f'elapsed time: {elapsed_time}')
        # handles the pygame window event when closing
        # !if the window still crashes pygame.event needs to be managed properly
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                done = True
                pygame.quit()
        
        prey_action, predator_action = action
        # print(f'action: {action}')
        # *the actions are split as required
        # prey_action, predator_action = self.expand_action_digit(action)
        # predator_action = action

        self.prey_agent.movement_speed = 2
        prey = self.prey_agent
        prey.step_update(action=prey_action, range_x=self.screen_width - 10, range_y=self.screen_height - 10)
        
        predator = self.predator_agent
        # print(f'prey: {prey_action}, predator: {predator_action}')
                

        # print(f'predator_{predator.index} = action:{action} current_position: {predator.current_position}')
        predator.step_update(action=predator_action, range_x=self.screen_width - 10, range_y=self.screen_height - 10)
        # print(f'predator_{predator.index}: new_position: {predator.current_position}')
            
        # !observation.append({'index': predator.index, 'name': predator.agent, 'position': predator.current_position})
        # observation = self.predator_agent.current_position
        observation = np.concatenate([self.prey_agent.current_position, self.predator_agent.current_position])

        # print(f'observation: {self.predator_agent.current_position}')
        self.total_steps += 1

        direction = self.predator_agent.current_position - self.prey_agent.current_position

        # Calculate the distance between the centers of the two dots
        distance_between_centers = np.linalg.norm(direction)
        self.current_distance = distance_between_centers

        # check everystep if the distance of two agents are:
        # greater than  initial distance?  
        # yes: -reward
        # no: +reward

        # if distance_between_centers > self.initial_distance:
        #     reward -= 0.06
        # else:
        #     reward += 0.01

        reward = 2 * np.exp(-0.04 * (distance_between_centers - 10)) - 0.001
        
        # if distance_between_centers > self.initial_distance:
        #     reward -= 2 * np.exp(-distance_between_centers * 0.5) - 0.5
        # else:
        #     reward += 2 * np.exp(-distance_between_centers * 0.5) - 0.5
            

        # Check if there is a collision (distance <= sum of radii)
        if elapsed_time <= self.total_running_time:
            if distance_between_centers <= 20:
                reward += 40
                done = True
                    # pygame.quit()
                    # self.close()
            # if self.total_steps == 30000:
            #     done = True
        else:
            done = True
            reward -= 20
                # pygame.quit()
                # self.close()

        # print(self.total_steps)
        self.render()
        # it will update the total reward everystep
        self.predator_total_reward = reward

        return observation, reward, done, _, info
        

    def render(self):
        if self.render_mode == 'human':
            screen = self.screen

            # clear screen
            screen.fill((255, 255, 255))
            prey = self.prey_agent
            pos_x, pos_y = prey.current_position
            prey_radius = 10
            pygame.draw.circle(screen, (0, 0, 255), (int(pos_x), int(pos_y)), prey_radius)

            predator = self.predator_agent
            pos_x, pos_y = predator.current_position
            predator_radius = 10

            pygame.draw.circle(screen, (255, 0, 0), (int(pos_x), int(pos_y)), predator_radius)

            text_surface = self.font.render(f"Reward: {self.predator_total_reward: .5f} initial distance: {self.initial_distance: .2f} current_distance:{self.current_distance: .2f}", True, (0, 0, 0))

            text_rect = text_surface.get_rect()

            text_rect.center = (self.screen_width - 200, 10)

            self.screen.blit(text_surface, text_rect)

            pygame.display.update()

    def close(self):
        pygame.quit()

In [43]:
env = GameEnv()
env.reset()
env.step(1)
env.render()

In [62]:
env.close()


In [9]:
model = PPO.load(baseline_path)

environment testing

In [63]:
env = GameEnv()
for i in range(0, 5):
    done = False

    # env.agent_init()
    env.reset()
    total_reward = 0
    while not done:
        action = env.action_space.sample()
        obs, reward, done, _, _ = env.step(action)
        total_reward += reward
    print(f'total reward: {total_reward}')
    print(f'Number of steps: {env.total_steps}')
        # env.render()

total reward: 2371.137709488271
Number of steps: 44837
total reward: 270.78877201832427
Number of steps: 48150
total reward: -24.856743040008954
Number of steps: 47807
total reward: 2788.3566122644024
Number of steps: 48706
total reward: -53.88667462468824
Number of steps: 36450


Different state entirely

In [16]:
env = GameEnv()
model.set_env(env)
for i in range(0, 20):
    done = False

    # env.agent_init()
    obs, _ = env.reset()
    total_reward = 0
    while not done:
        predator_action = model.predict(obs)
        prey_action = env.action_space.sample()
        action = [prey_action, predator_action[0]]
        obs, reward, done, _, _ = env.step(action)
        total_reward += reward
    print(f'total reward: {total_reward}')
    print(f'Number of steps: {env.total_steps}')

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
total reward: 136.5336338945745
Number of steps: 702
total reward: 256.76516026671976
Number of steps: 756
total reward: 200.29870578978216
Number of steps: 557
total reward: 114.55980908904057
Number of steps: 319
total reward: 142.41522866537312
Number of steps: 201
total reward: 127.4443911633665
Number of steps: 516
total reward: 226.4683703587203
Number of steps: 584
total reward: 180.3386732613892
Number of steps: 357
total reward: 243.08326077363853
Number of steps: 881
total reward: 292.28427531610976
Number of steps: 1440
total reward: 180.5108202291097
Number of steps: 746
total reward: 140.89895069722968
Number of steps: 586
total reward: 130.70734748396654
Number of steps: 245
total reward: 93.52842262016442
Number of steps: 358
total reward: 176.94984735459792
Number of steps: 432
total reward: 235.98976015613724
Number of steps: 451
total reward: 160.9417844398782
Number of steps: 272
total rewar

In [6]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os

In [47]:
# from stable_baselines3.common.callbacks import BaseCallback
# import os

# class SaveOnBestTrainingRewardCallback(BaseCallback):
#     def __init__(self, check_freq, log_dir, verbose=1):
#         super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
#         self.check_freq = check_freq  # How often to check for best reward
#         self.log_dir = log_dir  # Directory to save the best model
#         self.best_mean_reward = -float("inf")

#     def _init_callback(self) -> None:
#         # Create the log directory if it doesn't exist
#         os.makedirs(self.log_dir, exist_ok=True)

#     def _on_step(self) -> bool:
#         if self.n_calls % self.check_freq == 0:
#             # Evaluate the model's performance
#             mean_reward = self.eval_model()
#             if mean_reward > self.best_mean_reward:
#                 # If the current performance is better, save the model
#                 self.best_mean_reward = mean_reward
#                 self.model.save(os.path.join(self.log_dir, "best_model"))

#     def eval_model(self):
#         # Perform evaluation and return the mean reward
#         # You can adapt this part based on how you evaluate the model
#         # In this example, it assumes you have an environment and a trained model
#         mean_reward = 0
#         num_episodes = 10
#         for _ in range(num_episodes):
#             obs = self.eval_env.reset()
#             episode_reward = 0
#             done = False
#             while not done:
#                 action, _ = self.model.predict(obs, deterministic=True)
#                 obs, reward, done, _ = self.eval_env.step(action)
#                 episode_reward += reward
#             mean_reward += episode_reward
#         mean_reward /= num_episodes
#         return mean_reward


In [48]:
# from stable_baselines3.common.callbacks import EvalCallback
import os

log_dir = ".\Training\Logs"  # Change this to your desired directory
# os.makedirs(log_dir, exist_ok=True)

# # Create a callback to log data to TensorBoard
# callback = EvalCallback(
#     eval_env=env,
#     callback_on_new_best=log_dir,
#     n_eval_episodes=10,  # Adjust as needed
#     best_model_save_path=log_dir,
#     log_path=log_dir,
# )
os.makedirs(log_dir, exist_ok=True)

In [49]:
env.close()

In [50]:
env = GameEnv()

env.reset()

(array([219.15613,  76.15038, 141.71185, 129.18074], dtype=float32), 0)

In [7]:
log_path = os.path.join('Training', 'Logs')

In [52]:
env = GameEnv()

env.reset()
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=1000000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_14


-----------------------------
| time/              |      |
|    fps             | 707  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 587         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.019338187 |
|    clip_fraction        | 0.305       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | 0.909       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0542     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0244     |
|    value_loss           | 0.0014      |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1f415a188d0>

In [57]:
env.close()

In [8]:
baseline_path = os.path.join('Training', 'Models', 'new_rew_baseline')

In [54]:
model.save(baseline_path)

In [56]:
del model

NameError: name 'model' is not defined

In [59]:
model = PPO.load(baseline_path)
model.set_env(env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [58]:
env = GameEnv()

In [60]:
# env = GameEnv()
evaluate_policy(model, env, n_eval_episodes=10)



(5557.276460103627, 7704.656771243825)

In [142]:
print(mean_r)
print(mean_d)

-12.010203404847072
18.135100954954762


In [143]:
env.close()

In [None]:
def piecewise(x):
    k = 0.0957
    y = 0.0092 * np.exp(-k * (x - 10)) - 0.0002
    return y

In [35]:
%tensorboard

UsageError: Line magic function `%tensorboard` not found.
