# Q - Learning for the TSP 

In [1]:
import numpy as np 
from src import model, utils

In [2]:
dir(utils)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'njit',
 'np',
 'plt',
 'route_distance',
 'trace_progress']

In [3]:
dir(model)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'njit',
 'np']

In [6]:
dist_matrix_15 = np.loadtxt("data/tsp_15_291.txt") #15 cities and min cost = 291
dist_matrix_26 = np.loadtxt("data/tsp_26_937.txt")
dist_matrix_17 = np.loadtxt("data/tsp_17_2085.txt")
dist_matrix_42 = np.loadtxt("data/tsp_42_699.txt")
dist_matrix_48 = np.loadtxt("data/tsp_48_33523.txt")

In [7]:
Q_table = np.zeros((15,15))
model.QLearning(Q_table,
              dist_matrix_15,
              1, 
              0.88, 
              0.1, 
              0.0001, 
              epochs=1000)

AttributeError: module 'src.model' has no attribute 'QLearning'

In [24]:
class QLearnTSP:
    def __init__(self, dist):
        self.len_path = dist.shape[0]
        self.dist = dist
        #reward matrix, and Q matrix under the form of dictionnaries
        #--------- should be optimized --------- #
        start = 0 
        end = 0 
        rewards = {}
        q = {}
        states = ["start"] + [i for i in range(1, self.len_path)] + ["end"]
        for s in states:
            for n in states:
                if s == n: 
                    q[(s,n)] = -np.inf
                    continue
                q[(s,n)]=0
                s_i, n_i = s, n
                if type(s) == str:
                    s_i = 0
                if type(n) == str:
                    n_i = 0
                if s_i == n_i: 
                    continue    

                rewards[(s_i, n_i)] = -dist[s_i, n_i]
                if n_i == 0:
                    rewards[(s_i, n_i)] = 100 - dist[s_i, n_i]  
        self.rewards = rewards
        self.q = q
    
    

    def follow_path(self, q_temp):
        path = ['start']
        while 'end' not in path:
            possible_next = set([i for i in range(1, self.len_path)]) - set(path)
            if possible_next == set():
                path.append('end')
                continue
            next_state = list(possible_next)[0]
            r = q_temp[(path[-1], next_state)]
            for ele in possible_next:
                if q_temp[(path[-1], ele)] > r:
                    next_state = ele
                    r = q_temp[(path[-1], ele)]
            path.append(next_state)
        return [0 if x in ['start', 'end'] else x for x in path]

    
    
    def get_cost(self, path):
        c = 0
        for i in range(1, len(path)):
            c += self.dist[path[i-1], path[i]]
        return c

    def qlearning(self, epsilon, gamma, lr, lbda, epochs = 100):
        start = 0
        eps = epsilon
        q_learn = self.q.copy()
        for t in range(epochs):
            if t%(epochs/10) == 0:
                print("Iteration :", t, "||", "epsilon = ", round(eps * (1-lbda), 2), "||", "Current cost :", self.get_cost(self.follow_path(q_learn)))

            current_state = start
            current_path = [start]
            eps = eps * (1-lbda)
            while len(current_path) < self.len_path+1:
                if len(current_path) == self.len_path:
                    current_path.append(0)
                    #do 

                    #
                    continue
                current_state = current_path[-1]
                possible_next = set([i for i in range(1, self.len_path)]) - set(current_path)
                u = np.random.random()
                if u < eps:
                    next_state = np.random.choice(list(possible_next))
                else:
                    key_next = [(current_state, n) for n in possible_next]
                    next_state = list(possible_next)[0]
                    r = self.rewards[(current_state, next_state)]
                    for e in key_next:
                        if self.rewards[e] > r:
                            next_state = e[1]
                            r = self.rewards[e]
                current_path.append(next_state)
                #updating Q
                c_s, n_s, c_s_i, n_s_i = current_state, next_state, current_state, next_state
                if current_state == 0:
                    c_s_i = 'start'
                if next_state == 0:
                    n_s_i = 'end'
                possible_next = set([i for i in range(1, self.len_path)]) - set(current_path) - set([next_state])
                if possible_next == set():
                    continue

                max_r_n = list(possible_next)[0]
                m_r = q_learn[(c_s_i, n_s_i)]
                for ele in possible_next.union(set(["end"])):
                    if q_learn[(c_s_i, ele)] > m_r:
                        max_r_n = ele
                        m_r = q_learn[(c_s_i, ele)] 

                q_learn[(c_s_i, n_s_i)] = q_learn[(c_s_i, n_s_i)] + lr * ((self.rewards[(c_s, n_s)] + gamma * m_r - q_learn[(c_s_i, n_s_i)]))
        return q_learn
            

In [20]:
dist_matrix_15 = np.loadtxt("data/tsp_15_291.txt") #15 cities and min cost = 291
dist_matrix_26 = np.loadtxt("data/tsp_26_937.txt")
dist_matrix_17 = np.loadtxt("data/tsp_17_2085.txt")
dist_matrix_42 = np.loadtxt("data/tsp_42_699.txt")
dist_matrix_48 = np.loadtxt("data/tsp_48_33523.txt")

In [50]:
solver = QLearnTSP(dist_matrix_15)

In [51]:
%time Q_15 = solver.qlearning(1, 0.88, 0.1, 0.0001, 4000)

Iteration : 0 || epsilon =  1.0 || Current cost : 817.0
Iteration : 400 || epsilon =  0.96 || Current cost : 291.0
Iteration : 800 || epsilon =  0.92 || Current cost : 291.0
Iteration : 1200 || epsilon =  0.89 || Current cost : 291.0
Iteration : 1600 || epsilon =  0.85 || Current cost : 291.0
Iteration : 2000 || epsilon =  0.82 || Current cost : 291.0
Iteration : 2400 || epsilon =  0.79 || Current cost : 291.0
Iteration : 2800 || epsilon =  0.76 || Current cost : 291.0
Iteration : 3200 || epsilon =  0.73 || Current cost : 291.0
Iteration : 3600 || epsilon =  0.7 || Current cost : 291.0
Wall time: 1.86 s


In [52]:
solver = QLearnTSP(dist_matrix_17)

In [53]:
%time Q_17 = solver.qlearning(1, 0.88, 0.1, 0.0001, 10000)

Iteration : 0 || epsilon =  1.0 || Current cost : 4517.0
Iteration : 1000 || epsilon =  0.9 || Current cost : 2199.0
Iteration : 2000 || epsilon =  0.82 || Current cost : 2199.0
Iteration : 3000 || epsilon =  0.74 || Current cost : 2199.0
Iteration : 4000 || epsilon =  0.67 || Current cost : 2187.0
Iteration : 5000 || epsilon =  0.61 || Current cost : 2187.0
Iteration : 6000 || epsilon =  0.55 || Current cost : 2187.0
Iteration : 7000 || epsilon =  0.5 || Current cost : 2187.0
Iteration : 8000 || epsilon =  0.45 || Current cost : 2187.0
Iteration : 9000 || epsilon =  0.41 || Current cost : 2187.0
Wall time: 5.1 s


In [54]:
solver = QLearnTSP(dist_matrix_26)

In [55]:
%time Q_26 = solver.qlearning(1, 0.88, 0.2, 0.001, 10000)

Iteration : 0 || epsilon =  1.0 || Current cost : 1159.0
Iteration : 1000 || epsilon =  0.37 || Current cost : 1050.0
Iteration : 2000 || epsilon =  0.14 || Current cost : 1050.0
Iteration : 3000 || epsilon =  0.05 || Current cost : 1050.0
Iteration : 4000 || epsilon =  0.02 || Current cost : 1050.0
Iteration : 5000 || epsilon =  0.01 || Current cost : 1050.0
Iteration : 6000 || epsilon =  0.0 || Current cost : 1050.0
Iteration : 7000 || epsilon =  0.0 || Current cost : 1050.0
Iteration : 8000 || epsilon =  0.0 || Current cost : 1050.0
Iteration : 9000 || epsilon =  0.0 || Current cost : 1050.0
Wall time: 5.76 s


In [56]:
solver = QLearnTSP(dist_matrix_42)

In [57]:
%time Q_42 = solver.qlearning(1, 0.8, 0.1, 0.0001, 10000)

Iteration : 0 || epsilon =  1.0 || Current cost : 962.0
Iteration : 1000 || epsilon =  0.9 || Current cost : 916.0
Iteration : 2000 || epsilon =  0.82 || Current cost : 955.0
Iteration : 3000 || epsilon =  0.74 || Current cost : 955.0
Iteration : 4000 || epsilon =  0.67 || Current cost : 955.0
Iteration : 5000 || epsilon =  0.61 || Current cost : 955.0
Iteration : 6000 || epsilon =  0.55 || Current cost : 955.0
Iteration : 7000 || epsilon =  0.5 || Current cost : 955.0
Iteration : 8000 || epsilon =  0.45 || Current cost : 955.0
Iteration : 9000 || epsilon =  0.41 || Current cost : 955.0
Wall time: 19.1 s


In [58]:
solver = QLearnTSP(dist_matrix_48)

In [59]:
%time Q_48 = solver.qlearning(1, 0.8, 0.1, 0.0001, 4000)

Iteration : 0 || epsilon =  1.0 || Current cost : 151722.0
Iteration : 400 || epsilon =  0.96 || Current cost : 59238.0
Iteration : 800 || epsilon =  0.92 || Current cost : 41452.0
Iteration : 1200 || epsilon =  0.89 || Current cost : 38970.0
Iteration : 1600 || epsilon =  0.85 || Current cost : 40610.0
Iteration : 2000 || epsilon =  0.82 || Current cost : 40551.0
Iteration : 2400 || epsilon =  0.79 || Current cost : 40551.0
Iteration : 2800 || epsilon =  0.76 || Current cost : 40610.0
Iteration : 3200 || epsilon =  0.73 || Current cost : 40551.0
Iteration : 3600 || epsilon =  0.7 || Current cost : 40551.0
Wall time: 10.1 s
