In [34]:
# Cargar los datos reales
import numpy as np
distances = np.loadtxt("data/distances.csv", delimiter=";")
cities = np.loadtxt("data/cities.csv", delimiter=";", dtype=str)

print(f"Distancias: {distances.shape}, Ciudades: {len(cities)}")

Distancias: (14, 14), Ciudades: 14


In [35]:
distance_matrix = np.array([
    [0, 1, 5],
    [1, 0, 2],
    [5, 2, 0]
], dtype=np.float32)

In [36]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class TSPEnv(gym.Env):
    def __init__(self, distance_matrix):
        self.distance_matrix = distance_matrix
        self.N = distance_matrix.shape[0] - 1
        self.action_space = spaces.Discrete(self.N + 1)
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(2 * (self.N + 1),), dtype=np.float32)

    def reset(self, seed=None, options=None):
        self.current_city = 0
        self.visited = np.zeros(self.N + 1, dtype=np.float32)
        self.visited[0] = 1
        self.total_distance = 0
        self.count = 0
        return self._get_obs(), {}

    def step(self, action):
        if action == self.current_city or (self.visited[action] and action != 0):
            return self._get_obs(), -100.0, False, False, {}

        distance = self.distance_matrix[self.current_city][action]
        reward = -distance
        self.total_distance += distance
        self.current_city = action
        self.visited[action] = 1
        self.count += 1

        done = False
        if all(self.visited) and self.current_city == 0:
            reward += 100.0
            done = True

        return self._get_obs(), reward, done, False, {}

    def _get_obs(self):
        current = np.zeros(self.N + 1)
        current[self.current_city] = 1
        return np.concatenate([self.visited, current], dtype=np.float32)

In [37]:
from stable_baselines3 import DQN

env = TSPEnv(distance_matrix)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=1000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

model.learn(total_timesteps=50_000)

<stable_baselines3.dqn.dqn.DQN at 0x322f97c50>

In [38]:
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"REWARD: {reward}")

ACTION: 1
REWARD: -1.0
ACTION: 2
REWARD: -2.0
ACTION: 0
REWARD: 95.0


In [39]:
# ========= Generar matriz aleatoria de distancias =========
def generate_random_distance_matrix(n_cities, seed=None):
    if seed is not None:
        np.random.seed(seed)
    coords = np.random.rand(n_cities, 2) * 100  # coordenadas en un mapa 100x100
    dist_matrix = np.linalg.norm(coords[:, None, :] - coords[None, :, :], axis=-1)
    return dist_matrix, coords

In [40]:
n_cities = 6  # puedes cambiarlo a 5, 7, 10...
distance_matrix, coords = generate_random_distance_matrix(n_cities, seed=42)

In [46]:
env = TSPEnv(distance_matrix)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=80000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

model.learn(total_timesteps=450_000)

<stable_baselines3.dqn.dqn.DQN at 0x3240677d0>

In [47]:
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"REWARD: {reward}")

ACTION: 4
REWARD: -33.19807081074011
ACTION: 1
REWARD: -17.05893848218244
ACTION: 2
REWARD: -72.64288903176
ACTION: 3
REWARD: -71.69025114155171
ACTION: 5
REWARD: -11.030351618150954
ACTION: 0
REWARD: 64.55242559378638


In [17]:
distances
env = TSPEnv(distances)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=1000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

model.learn(total_timesteps=150_000)

<stable_baselines3.dqn.dqn.DQN at 0x31aea3830>

In [18]:
model.learn(total_timesteps=150_000)

<stable_baselines3.dqn.dqn.DQN at 0x31aea3830>

In [22]:
model.learn(total_timesteps=950_000)

<stable_baselines3.dqn.dqn.DQN at 0x31aea3830>

In [23]:
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"REWARD: {reward}")

ACTION: 8
REWARD: -58.0
ACTION: 0
REWARD: -58.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
AC

KeyboardInterrupt: 