In [5]:
# Cargar los datos reales
import numpy as np
distances = np.loadtxt("data/distances.csv", delimiter=";")
cities = np.loadtxt("data/cities.csv", delimiter=";", dtype=str)

print(f"Distancias: {distances.shape}, Ciudades: {len(cities)}")

Distancias: (14, 14), Ciudades: 14


In [5]:
distance_matrix = np.array([
    [0, 1, 5],
    [1, 0, 2],
    [5, 2, 0]
], dtype=np.float32)

In [12]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class MultiCarTSPEnv(gym.Env):
    def __init__(self, distance_matrix, num_vehicles=2):
        self.distance_matrix = distance_matrix
        self.N = distance_matrix.shape[0] - 1  # number of cities
        self.num_vehicles = num_vehicles
        self.action_space = spaces.Discrete(self.N + 1)
        
        # visited (N+1), current city per vehicle (num_vehicles), active vehicle one-hot (num_vehicles)
        self.observation_space = spaces.Box(
            low=0.0,
            high=1.0,
            shape=(self.N + 1 + self.num_vehicles + self.num_vehicles,),
            dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.visited = np.zeros(self.N + 1, dtype=np.float32)
        self.vehicle_positions = np.random.choice(self.N + 1, self.num_vehicles, replace=True)
        self.vehicle_rewards = np.zeros(self.num_vehicles, dtype=np.float32)
        self.vehicle_routes = [[] for _ in range(self.num_vehicles)]
        self.active_vehicle = 0
        for pos in self.vehicle_positions:
            self.visited[pos] = 1
        return self._get_obs(), {}

    def step(self, action):
        vehicle = self.active_vehicle
        current_city = self.vehicle_positions[vehicle]
        reward = 0.0
        done = False

        if action == current_city:
            return self._get_obs(), -100, False, False, {}

        if action == current_city or self.visited[action]:
            reward = -100.0
            done = False
        else:
            distance = self.distance_matrix[current_city][action]
            reward = -distance
            self.vehicle_positions[vehicle] = action
            self.visited[action] = 1
            self.vehicle_routes[vehicle].append(action)
            self.vehicle_rewards[vehicle] += reward

        # Check if all cities have been visited
        if all(self.visited):
            done = True
            reward += 100.0  # global bonus

        # Switch turn
        self.active_vehicle = (self.active_vehicle + 1) % self.num_vehicles
        return self._get_obs(), reward, done, False, {}

    def _get_obs(self):
        active_vehicle_one_hot = np.zeros(self.num_vehicles, dtype=np.float32)
        active_vehicle_one_hot[self.active_vehicle] = 1.0

        current_positions = np.zeros(self.num_vehicles, dtype=np.float32)
        for i, pos in enumerate(self.vehicle_positions):
            current_positions[i] = pos / self.N  # normalize

        return np.concatenate([
            self.visited,
            current_positions,
            active_vehicle_one_hot
        ]).astype(np.float32)

In [17]:
# Example
distance_matrix = np.random.randint(1, 100, size=(6, 6)).astype(np.float32)
np.fill_diagonal(distance_matrix, 0)

env = MultiCarTSPEnv(distance_matrix, num_vehicles=2)

obs, _ = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    obs, reward, done, _, info = env.step(action)
    print(f"Reward: {reward}")

Reward: -27.0
Reward: -48.0
Reward: -49.0
Reward: -100.0
Reward: -100.0
Reward: -100.0
Reward: -100.0
Reward: -100.0
Reward: -100
Reward: -100.0
Reward: -100.0
Reward: -100.0
Reward: -100.0
Reward: -100.0
Reward: -100.0
Reward: -100.0
Reward: 68.0


In [18]:
# ========= Generar matriz aleatoria de distancias =========
def generate_random_distance_matrix(n_cities, seed=None):
    if seed is not None:
        np.random.seed(seed)
    coords = np.random.rand(n_cities, 2) * 100  # coordenadas en un mapa 100x100
    dist_matrix = np.linalg.norm(coords[:, None, :] - coords[None, :, :], axis=-1)
    return dist_matrix, coords

In [19]:
from stable_baselines3 import DQN

In [20]:
n_cities = 6  # puedes cambiarlo a 5, 7, 10...
distance_matrix, coords = generate_random_distance_matrix(n_cities, seed=42)

In [21]:


env = MultiCarTSPEnv(distance_matrix)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=10_000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

model.learn(total_timesteps=250_000)

<stable_baselines3.dqn.dqn.DQN at 0x108896a50>

In [22]:
obs, _ = env.reset(options = {"start_city":1})
done = False
print(obs)
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"OBS: {obs}")
    print(f"REWARD: {reward}")

[0.  0.  1.  0.  0.  0.  0.4 0.4 1.  0. ]
ACTION: 4
OBS: [0.  0.  1.  0.  1.  0.  0.8 0.4 0.  1. ]
REWARD: -70.91551038198544
ACTION: 3
OBS: [0.  0.  1.  1.  1.  0.  0.8 0.6 1.  0. ]
REWARD: -71.69025114155171
ACTION: 1
OBS: [0.  1.  1.  1.  1.  0.  0.2 0.6 0.  1. ]
REWARD: -17.05893848218244
ACTION: 5
OBS: [0.  1.  1.  1.  1.  1.  0.2 1.  1.  0. ]
REWARD: -11.030351618150954
ACTION: 0
OBS: [1. 1. 1. 1. 1. 1. 0. 1. 0. 1.]
REWARD: 49.82863989230551


In [13]:
# ========= Generar matriz aleatoria de distancias =========
def generate_random_distance_matrix(n_cities, seed=None):
    if seed is not None:
        np.random.seed(seed)
    coords = np.random.rand(n_cities, 2) * 100  # coordenadas en un mapa 100x100
    dist_matrix = np.linalg.norm(coords[:, None, :] - coords[None, :, :], axis=-1)
    return dist_matrix, coords

In [14]:
n_cities = 6  # puedes cambiarlo a 5, 7, 10...
distance_matrix, coords = generate_random_distance_matrix(n_cities, seed=42)

In [15]:
env = TSPEnv(distance_matrix)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=10000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

model.learn(total_timesteps=150_000)

<stable_baselines3.dqn.dqn.DQN at 0x319356390>

In [26]:
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"REWARD: {reward}")

ACTION: 1
REWARD: -50.17136010769449
ACTION: 4
REWARD: -17.05893848218244
ACTION: 0
REWARD: -33.19807081074011
ACTION: 5
REWARD: -35.44757440621362
ACTION: 0
REWARD: -35.44757440621362
ACTION: 2
REWARD: -82.4215490573505
ACTION: 0
REWARD: -82.4215490573505
ACTION: 3
REWARD: -32.75536921225664
ACTION: 0
REWARD: 67.24463078774336


In [13]:
env.observation_space

Box(0.0, 1.0, (12,), float32)