In [1]:
import numpy as np

In [2]:
class Agent:
    def __init__(self, agent_name, agent_index):
        self.index = agent_index
        self.agent = agent_name
        self.health = None
        self.isHit = False
        self.move = True
        self.movement_speed = 1.00
        self.previous_position = np.array([0, 0], dtype=np.float32)
        self.current_position = None
        self.same_position = False
        self.current_step = 0
        self.action = None
        pass

    def agent_action(self, action):

        pass

    def agent_reset(self, width, height):
        padding = 30
        self.current_position = np.array(
            [np.random.uniform(30, width - padding), np.random.uniform(30, width - padding)], dtype=np.float32)

    def step_update(self, action, range_x, range_y):

        if action == 0:
            self.current_position[0] -= self.movement_speed
        elif action == 1:
            self.current_position[0] += self.movement_speed
        elif action == 2:
            self.current_position[1] -= self.movement_speed
        elif action == 3:
            self.current_position[1] += self.movement_speed
        
        self.current_position[0] = np.clip(self.current_position[0], 0, range_x)
        self.current_position[1] = np.clip(self.current_position[1], 0, range_y)

In [3]:

from gymnasium.spaces import Discrete, Box, MultiDiscrete
from gymnasium import Env
import numpy as np
import pygame
import time

In [127]:
start = time.time()
time.sleep(1)
end = time.time()

print(end - start)

1.0012104511260986


In [4]:
class GameEnv(Env):
    def __init__(self, screen_width=400, screen_height=400, render_mode='human'):
        super(GameEnv, self).__init__()

        # defining the screen dimension for render purpose
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.render_mode = render_mode

        # defining the observation and action spaces for all the agents
        
        self.observation_space = Box(low=np.array([0, 0, 0, 0], dtype=np.float32),
                                    high=np.array([self.screen_width, self.screen_height, self.screen_width, self.screen_height], dtype=np.float32),
                                    dtype=np.float32)

        # the pygame window should be initialized in the render function

        # setting the total number of agent
        
        self.number_of_prey = 1
        self.number_of_predator = 1
        self.prey_agent = None
        self.predator_agent = None
        self.predator_i_position = None
        self.initial_distance = None
        self.current_distance = None
        self.predator_total_reward = 0
        self.number_of_agents = self.number_of_prey + self.number_of_prey

        # defining the action space based on total number of predator and prey
        self.action_space = Discrete(4)


        # setting the total number of obstacles
        self.total_obstacles = None

        # keeping a counter to save the total steps
        self.total_steps = 0

        # initializing the pygame
        pygame.init()

        # setting the screen size
        self.screen = pygame.display.set_mode((self.screen_width, self.screen_height))
        pygame.display.set_caption('Multi Agent Environment(simple)')
        
        # keep the track of time of the rendering
        self.clock = pygame.time.Clock()

        # *it is set in milisec format (time is sec is time/1000)
        self.total_running_time = 10

        # start the tick timer
        self.start_time = 0
        # print(f'start  time: {self.start_time}')

        # initializing the font
        pygame.font.init()
        self.font = pygame.font.Font(None, 18)

    # this function rerturns the value of the action into 2 digits 
    # if the action_space.sample() gives 1 digit number
    # * if  the number is 3 it will return 03 
    # * if  the number is 14 then it will return 14
    def expand_action_digit(self, action):

        # this basically checks the number if it has 1 then fills the rest with 0
        # if the number is 2 digits then it stays the same
        action = str(action).zfill(2)
        prey_action = int(action[0]) % 4
        predator_action = int(action[1]) % 4
        return prey_action, predator_action
        

    # this method will initialize the number of agents
    # ! this must be called from outside
    def agent_init(self):

        prey_agents = Agent('prey', 0)

        predator_agents = Agent('predator', 0)

        self.prey_agent = prey_agents
        self.predator_agent = predator_agents
        

    # this function is used to explicitly set the number of agents
    # ! this needs to be called from outside
    def set_agent_number(self, prey_number, predator_number):
        self.number_of_predator = predator_number
        self.number_of_prey = prey_number

    # the usual reset function
    def reset(self, seed=0):
        self.start_time = time.time()
        
        self.agent_init()

        self.total_steps = 0
        self.predator_total_reward = 0

        prey = self.prey_agent
        predator = self.predator_agent

        # for prey in self.prey_agents:
        prey.agent_reset(width=self.screen_width, height=self.screen_height)
        # observation.append([prey.index, prey.agent, prey.current_position])

        # for predator in self.predator_agents:
        predator.agent_reset(width=self.screen_width, height=self.screen_height)
        # observation.append([predator.index, predator.agent, predator.current_position])
        
        # setting the predator and prey to their initial position
        self.prey_agent = prey
        self.predator_agent = predator

        # setting the initial position of predator for reward
        self.predator_i_position = self.predator_agent.current_position
        
        # calculating the initial distance of 2 agents
        direction = self.predator_agent.current_position - self.prey_agent.current_position
        self.initial_distance = np.linalg.norm(direction)

        # observation :
        # all the variable values inside the obsercation space needs to be sent inside the observation variable
        observation = np.concatenate([self.prey_agent.current_position, self.predator_agent.current_position])
        return observation, seed

    # the step function
    # this function is called for every timesteps
    # this function updates the actions or states of agents in the env
    # this function is called default by the algorithms of all sorts
    # * it returns observation, reward, done, truncated, info
    # * any game policy change can be done here
    # * reward must be set here
    def step(self, action):
        
        # initializing the return variables
        done = False
        reward = 0
        truncated = False
        info = {}
        current_time = time.time()
        # print(f'current time: {current_time}')
        # when ever the step is starting set the start time
        # if self.total_steps == 0:
        #     self.start_time = pygame.time.get_ticks()
        # else:
        #     current_time = pygame.time.get_ticks()

        elapsed_time = current_time - self.start_time
        # print(f'elapsed time: {elapsed_time}')
        # handles the pygame window event when closing
        # !if the window still crashes pygame.event needs to be managed properly
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                done = True
                pygame.quit()
        
        # print(f'action: {action}')
        # *the actions are split as required
        # prey_action, predator_action = self.expand_action_digit(action)
        predator_action = action
        predator = self.predator_agent
        # print(f'prey: {prey_action}, predator: {predator_action}')
                

        # print(f'predator_{predator.index} = action:{action} current_position: {predator.current_position}')
        predator.step_update(action=predator_action, range_x=self.screen_width - 10, range_y=self.screen_height - 10)
        # print(f'predator_{predator.index}: new_position: {predator.current_position}')
            
        # !observation.append({'index': predator.index, 'name': predator.agent, 'position': predator.current_position})
        # observation = self.predator_agent.current_position
        observation = np.concatenate([self.prey_agent.current_position, self.predator_agent.current_position])

        # print(f'observation: {self.predator_agent.current_position}')
        self.total_steps += 1

        direction = self.predator_agent.current_position - self.prey_agent.current_position

        # Calculate the distance between the centers of the two dots
        distance_between_centers = np.linalg.norm(direction)
        self.current_distance = distance_between_centers

        # check everystep if the distance of two agents are:
        # greater than  initial distance?  
        # yes: -reward
        # no: +reward

        if distance_between_centers > self.initial_distance:
            reward -= 0.06
        else:
            reward += 0.01
            
        # Check if there is a collision (distance <= sum of radii)
        if elapsed_time <= self.total_running_time:
            if distance_between_centers <= 20:
                reward += 40
                done = True
                    # pygame.quit()
                    # self.close()
            # if self.total_steps == 30000:
            #     done = True
        else:
            done = True
            reward -= 20
                # pygame.quit()
                # self.close()

        # print(self.total_steps)
        self.render()
        # it will update the total reward everystep
        self.predator_total_reward = reward

        return observation, reward, done, _, info
        

    def render(self):
        if self.render_mode == 'human':
            screen = self.screen

            # clear screen
            screen.fill((255, 255, 255))
            prey = self.prey_agent
            pos_x, pos_y = prey.current_position
            prey_radius = 10
            pygame.draw.circle(screen, (0, 0, 255), (int(pos_x), int(pos_y)), prey_radius)

            predator = self.predator_agent
            pos_x, pos_y = predator.current_position
            predator_radius = 10

            pygame.draw.circle(screen, (255, 0, 0), (int(pos_x), int(pos_y)), predator_radius)

            text_surface = self.font.render(f"Reward: {self.predator_total_reward: .5f} initial distance: {self.initial_distance: .2f} current_distance:{self.current_distance: .2f}", True, (0, 0, 0))

            text_rect = text_surface.get_rect()

            text_rect.center = (self.screen_width - 200, 10)

            self.screen.blit(text_surface, text_rect)

            pygame.display.update()

    def close(self):
        pygame.quit()


In [87]:
env = GameEnv()
env.reset()
env.step(1)
env.render()

In [13]:
env.close()

Testing the environment

In [14]:
env = GameEnv()
for i in range(0, 5):
    done = False

    # env.agent_init()
    env.reset()
    total_reward = 0
    while not done:
        action = env.action_space.sample()
        obs, reward, done, _, _ = env.step(action)
        # total_reward += reward
    print(f'total reward: {reward}')
    print(f'Number of steps: {env.total_steps}')
        # env.render()

total reward: -19.99
Number of steps: 30604
total reward: -20.06
Number of steps: 32743
total reward: 40.01
Number of steps: 622
total reward: -20.06
Number of steps: 35572
total reward: -20.06
Number of steps: 33556


In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os

In [15]:
env.close()

In [6]:
env = GameEnv()

env.reset()


(array([283.04483 , 302.16772 , 125.367615,  72.67946 ], dtype=float32), 0)

In [None]:
# env = DummyVecEnv([lambda: env])

In [48]:
log_path = os.path.join('Training', 'Logs')

In [177]:
env = GameEnv()

env.reset()
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=100000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_16


-----------------------------
| time/              |      |
|    fps             | 353  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.61e+03    |
|    ep_rew_mean          | -142        |
| time/                   |             |
|    fps                  | 289         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009640449 |
|    clip_fraction        | 0.0765      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | 0.251       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0167      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.004

<stable_baselines3.ppo.ppo.PPO at 0x245fab38110>

In [179]:
model.learn(total_timesteps=300000)

Logging to Training\Logs\PPO_18


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 155      |
|    ep_rew_mean     | 41.5     |
| time/              |          |
|    fps             | 303      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.25e+03    |
|    ep_rew_mean          | -59.3       |
| time/                   |             |
|    fps                  | 252         |
|    iterations           | 2           |
|    time_elapsed         | 16          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008069878 |
|    clip_fraction        | 0.0587      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.25       |
|    explained_variance   | -0.112      |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x245fab38110>

In [180]:
model.learn(total_timesteps=300000)

Logging to Training\Logs\PPO_19
-----------------------------
| time/              |      |
|    fps             | 343  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.57e+03    |
|    ep_rew_mean          | -166        |
| time/                   |             |
|    fps                  | 276         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.023100622 |
|    clip_fraction        | 0.425       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.27       |
|    explained_variance   | 0.968       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0367     |
|    n_updates            | 2460        |
| 

<stable_baselines3.ppo.ppo.PPO at 0x245fab38110>

In [182]:
model.learn(total_timesteps=600000)


Logging to Training\Logs\PPO_21


-----------------------------
| time/              |      |
|    fps             | 346  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.62e+03    |
|    ep_rew_mean          | 5.41        |
| time/                   |             |
|    fps                  | 285         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012137771 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.33       |
|    explained_variance   | 0.873       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0115      |
|    n_updates            | 5400        |
|    policy_gradient_loss | -0.012

<stable_baselines3.ppo.ppo.PPO at 0x245fab38110>

In [183]:
env.close()

In [8]:
baseline_path = os.path.join('Training', 'Models', 'test_baseline')

In [None]:
model.save(baseline_path)

In [None]:
model.learn(total_timesteps=10000)

In [None]:
model.learn(total_timesteps=1000000)

In [9]:
model = PPO.load(baseline_path)

In [20]:
env = GameEnv()

env.reset()
model.set_env(env=env)
model.learn(total_timesteps=100000)


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_25
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 98       |
|    ep_rew_mean     | 41       |
| time/              |          |
|    fps             | 359      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.39e+03    |
|    ep_rew_mean          | 23.9        |
| time/                   |             |
|    fps                  | 294         |
|    iterations           | 2           |
|    time_elapsed         | 13          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.022528637 |
|    clip_fraction        | 0.327       |
|    clip_range           | 0.2         |
|    

<stable_baselines3.ppo.ppo.PPO at 0x1913e0d1ad0>

In [17]:
env = GameEnv()
mean_r, mean_d = evaluate_policy(model, env, n_eval_episodes=10)



In [18]:
print(mean_r)
print(mean_d)

42.912998653016984
1.337789709207067


In [7]:
env.close()

In [None]:
def piecewise(x):
  y = np.zeros_like(x)
  y[x <= 10] = 0.009
  y[x > 10] = 0.0092 * np.exp(-0.0957 * (x - 10)) - 0.0002
  return y

In [None]:
%tensorboard