In [5]:
# Cargar los datos reales
import numpy as np
distances = np.loadtxt("data/distances.csv", delimiter=";")
cities = np.loadtxt("data/cities.csv", delimiter=";", dtype=str)

print(f"Distancias: {distances.shape}, Ciudades: {len(cities)}")

Distancias: (14, 14), Ciudades: 14


In [5]:
distance_matrix = np.array([
    [0, 1, 5],
    [1, 0, 2],
    [5, 2, 0]
], dtype=np.float32)

In [8]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class TSPEnv(gym.Env):
    def __init__(self, distance_matrix):
        self.distance_matrix = distance_matrix
        self.N = distance_matrix.shape[0] - 1  # número de ciudades
        self.action_space = spaces.Discrete(self.N + 1)
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(2 * (self.N + 1),), dtype=np.float32
        )
        self.cum_reward = 0

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if options and "start_city" in options:
            self.start_city = options["start_city"]
        else:
            self.start_city = np.random.randint(0, self.N + 1)

        self.current_city = self.start_city
        self.visited = np.zeros(self.N + 1, dtype=np.float32)
        self.visited[self.start_city] = 1
        self.total_distance = 0
        self.count = 0
        self.cum_reward = 0
        return self._get_obs(), {}

    def step(self, action):
        done = False

        if action == self.current_city or (self.visited[action] and action != self.start_city):
            return self._get_obs(), -100.0, True, False, {}

        distance = self.distance_matrix[self.current_city][action]
        reward = -distance
        self.total_distance += distance
        self.current_city = action
        self.cum_reward += reward

        if action != self.start_city:
            self.visited[action] = 1

        self.count += 1

        if all(self.visited) and self.current_city == self.start_city:
            reward += - self.cum_reward
            done = True

        return self._get_obs(), reward, done, False, {}

    def _get_obs(self):
        one_hot_current = np.zeros(self.N + 1, dtype=np.float32)
        one_hot_current[self.current_city] = 1
        return np.concatenate([self.visited, one_hot_current])

In [11]:
from stable_baselines3 import DQN

In [12]:


env = TSPEnv(distance_matrix)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=1000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

#model.learn(total_timesteps=150_000)

In [None]:
model.learn(total_timesteps=150_000)

In [14]:
obs, _ = env.reset(options = {"start_city":1})
done = False
print(obs)
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"OBS: {obs}")
    print(f"REWARD: {reward}")

[0. 1. 0. 0. 1. 0.]
ACTION: 0
OBS: [1. 1. 0. 1. 0. 0.]
REWARD: -1.0
ACTION: 1
OBS: [1. 1. 0. 0. 1. 0.]
REWARD: -1.0
ACTION: 2
OBS: [1. 1. 1. 0. 0. 1.]
REWARD: -2.0
ACTION: 0
OBS: [1. 1. 1. 0. 0. 1.]
REWARD: -100.0


In [13]:
# ========= Generar matriz aleatoria de distancias =========
def generate_random_distance_matrix(n_cities, seed=None):
    if seed is not None:
        np.random.seed(seed)
    coords = np.random.rand(n_cities, 2) * 100  # coordenadas en un mapa 100x100
    dist_matrix = np.linalg.norm(coords[:, None, :] - coords[None, :, :], axis=-1)
    return dist_matrix, coords

In [14]:
n_cities = 6  # puedes cambiarlo a 5, 7, 10...
distance_matrix, coords = generate_random_distance_matrix(n_cities, seed=42)

In [15]:
env = TSPEnv(distance_matrix)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=10000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

model.learn(total_timesteps=150_000)

<stable_baselines3.dqn.dqn.DQN at 0x319356390>

In [26]:
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"REWARD: {reward}")

ACTION: 1
REWARD: -50.17136010769449
ACTION: 4
REWARD: -17.05893848218244
ACTION: 0
REWARD: -33.19807081074011
ACTION: 5
REWARD: -35.44757440621362
ACTION: 0
REWARD: -35.44757440621362
ACTION: 2
REWARD: -82.4215490573505
ACTION: 0
REWARD: -82.4215490573505
ACTION: 3
REWARD: -32.75536921225664
ACTION: 0
REWARD: 67.24463078774336


In [13]:
env.observation_space

Box(0.0, 1.0, (12,), float32)