In [1]:
# Cargar los datos reales
import numpy as np
distances = np.loadtxt("data/distances.csv", delimiter=";")
cities = np.loadtxt("data/cities.csv", delimiter=";", dtype=str)

print(f"Distancias: {distances.shape}, Ciudades: {len(cities)}")

Distancias: (14, 14), Ciudades: 14


In [2]:
distance_matrix = np.array([
    [0, 1, 5],
    [1, 0, 2],
    [5, 2, 0]
], dtype=np.float32)

In [12]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class TSPEnv(gym.Env):
    def __init__(self, distance_matrix):
        self.distance_matrix = distance_matrix
        self.N = distance_matrix.shape[0] - 1
        self.action_space = spaces.Discrete(self.N + 1)
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(2 * (self.N + 1),), dtype=np.float32)
        self.cum_reward = 0

    def reset(self, seed=None, options=None):
        self.current_city = 0
        self.visited = np.zeros(self.N + 1, dtype=np.float32)
        self.visited[0] = 1
        self.total_distance = 0
        self.count = 0
        self.cum_reward = 0
        return self._get_obs(), {}

    def step(self, action):
        print(self.current_city, action)
        if action == self.current_city:
            return self._get_obs(), -10000.0, True, False, {}
        
        if (self.visited[action] and action != 0):
            return self._get_obs(), -100.0, False, False, {}

        distance = self.distance_matrix[self.current_city][action]
        reward = -distance
        self.total_distance += distance
        self.current_city = action
        self.visited[action] = 1
        self.count += 1
        self.cum_reward += reward

        done = False
        if all(self.visited) and self.current_city == 0:
            reward = 100
            done = True

        return self._get_obs(), reward, done, False, {}

    def _get_obs(self):
        current = np.zeros(self.N + 1)
        current[self.current_city] = 1
        return np.concatenate([self.visited, current], dtype=np.float32)

In [4]:
from stable_baselines3 import DQN

env = TSPEnv(distance_matrix)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=1000,
    train_freq=1,
    buffer_size=50_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

model.learn(total_timesteps=250_000)

<stable_baselines3.dqn.dqn.DQN at 0x3084657c0>

In [5]:
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"REWARD: {reward}")

ACTION: 1
REWARD: -1.0
ACTION: 2
REWARD: -2.0
ACTION: 0
REWARD: 100


In [6]:
# ========= Generar matriz aleatoria de distancias =========
def generate_random_distance_matrix(n_cities, seed=None):
    if seed is not None:
        np.random.seed(seed)
    coords = np.random.rand(n_cities, 2) * 100  # coordenadas en un mapa 100x100
    dist_matrix = np.linalg.norm(coords[:, None, :] - coords[None, :, :], axis=-1)
    return dist_matrix, coords

In [7]:
n_cities = 6  # puedes cambiarlo a 5, 7, 10...
distance_matrix, coords = generate_random_distance_matrix(n_cities, seed=42)

In [13]:
env = TSPEnv(distance_matrix)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=10000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=150
)

model.learn(total_timesteps=250_000)

0 2
2 5
5 0
0 5
0 2
0 4
4 3
3 2
3 0
0 3
0 2
0 0
0 1
1 2
2 4
4 3
3 5
5 3
5 5
0 3
3 0
0 2
2 0
0 1
1 4
4 0
0 4
0 1
0 1
0 0
0 3
3 1
1 0
0 1
0 3
0 1
0 1
0 3
0 3
0 0
0 3
3 2
2 5
5 0
0 4
4 3
4 3
4 1
1 5
1 0
0 2
2 3
3 0
0 3
0 2
0 1
1 5
5 2
5 4
4 4
0 3
3 4
4 1
1 5
5 1
5 3
5 0
0 3
0 3
0 1
0 4
0 2
2 3
2 2
0 3
3 5
5 3
5 2
2 2
0 2
2 0
0 0
0 4
4 2
2 4
2 3
3 4
3 0
0 3
0 0
0 4
4 1
1 0
0 5
5 2
2 2
0 0
0 2
2 4
4 1
1 0
0 1
0 3
3 5
5 5
0 3
3 2
2 1
1 4
4 4
0 3
3 4
4 2
2 0
0 2
0 5
5 4
5 3
5 4
5 0
0 4
0 2
0 5
0 1
1 4
1 5
1 5
1 3
1 4
1 2
1 1
0 2
2 4
4 1
1 1
0 2
2 4
4 0
0 0
0 0
0 0
0 4
4 5
5 0
0 3
3 4
3 4
3 0
0 2
2 1
1 4
1 5
1 4
1 3
1 0
0 3
3 5
5 3
5 5
0 2
2 4
4 4
0 1
1 1
0 3
3 0
0 2
2 2
0 1
1 2
2 3
3 4
4 2
4 0
0 4
0 2
0 1
0 3
0 4
0 4
0 0
0 5
5 3
3 0
0 0
0 3
3 0
0 4
4 0
0 0
0 0
0 3
3 4
4 0
0 5
5 3
5 3
5 3
5 0
0 3
0 0
0 3
3 1
1 4
4 1
4 1
4 5
5 4
5 3
5 4
5 5
0 3
3 2
2 4
4 2
4 5
5 0
0 0
0 1
1 5
5 1
5 5
0 5
5 3
3 2
2 1
1 3
1 1
0 3
3 3
0 0
0 0
0 2
2 3
3 3
0 0
0 0
0 1
1 0
0 3
3 3
0 2
2 0
0 2
0 5
5 0
0 4
4 0
0 0
0 1


<stable_baselines3.dqn.dqn.DQN at 0x3295d9af0>

In [15]:
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"REWARD: {reward}")
    print(f"DONE: {done}")

ACTION: 5
0 5
REWARD: -35.44757440621362
DONE: False
ACTION: 3
5 3
REWARD: -11.030351618150954
DONE: False
ACTION: 0
3 0
REWARD: -32.75536921225664
DONE: False
ACTION: 1
0 1
REWARD: -50.17136010769449
DONE: False
ACTION: 0
1 0
REWARD: -50.17136010769449
DONE: False
ACTION: 4
0 4
REWARD: -33.19807081074011
DONE: False
ACTION: 0
4 0
REWARD: -33.19807081074011
DONE: False
ACTION: 2
0 2
REWARD: -82.4215490573505
DONE: False
ACTION: 0
2 0
REWARD: 100
DONE: True


In [17]:
distances
env = TSPEnv(distances)

model = DQN(
    "MlpPolicy",
    env,
    verbose=0,
    exploration_fraction=1.0,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=1000,
    train_freq=1,
    buffer_size=10_000,
    batch_size=64,
    gamma=0.95,
    target_update_interval=250
)

model.learn(total_timesteps=150_000)

<stable_baselines3.dqn.dqn.DQN at 0x31aea3830>

In [18]:
model.learn(total_timesteps=150_000)

<stable_baselines3.dqn.dqn.DQN at 0x31aea3830>

In [22]:
model.learn(total_timesteps=950_000)

<stable_baselines3.dqn.dqn.DQN at 0x31aea3830>

In [21]:
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    print(f"ACTION: {action}")
    obs, reward, done, _, info = env.step(action)
    print(f"REWARD: {reward}")

ACTION: 8
REWARD: -58.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
ACTION: 8
REWARD: -100.0
A

KeyboardInterrupt: 