# Trace route

This script is responsible trace an optimized route!

In [1]:
# SET CONSTANTS
INTERVAL_BY_SECONDS_PRED = 60*10
DAYS_TO_PREDICT = 2

In [2]:
%pip install psycopg2-binary

Note: you may need to restart the kernel to use updated packages.


In [3]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [4]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import explode, from_unixtime, col, to_date, sum, avg, udf, lit, date_trunc, when, max
from pyspark.sql.types import DateType, TimestampType, StructType, StructField, IntegerType, FloatType, StringType

import requests
import json
from collections import defaultdict
import math
import random
import os
import re
import string
from glob import glob
from datetime import datetime, timedelta, date, tzinfo, timezone
import psycopg2

DB_URL = "jdbc:postgresql://postgres:5432/themeparkwizard"
PROPERTIES_CUSTOM = {"user": os.environ['POSTGRES_USER'],"password": os.environ['POSTGRES_PASSWORD'], "driver": "org.postgresql.Driver"}

spark = SparkSession.builder \
    .appName("MetricPredict") \
    .config("spark.jars", "jars/postgresql-42.7.7.jar") \
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic") \
    .getOrCreate()

In [5]:
conn = psycopg2.connect(
        host="postgres",
        port='5432',
        database="themeparkwizard",
        user=os.environ['POSTGRES_USER'],
        password=os.environ['POSTGRES_PASSWORD']
    )

In [6]:
with conn.cursor() as cur:
    cur.execute("""SELECT distinct id FROM themeparkwizard.dim_park_entity""")
    park_id_list = cur.fetchall()
    predicted_data = {}
    for park_id in park_id_list:
        cur.execute(f"""
    with number_row as (
        select
            entity_id,
            name,
            entity_name,
            latitude,
            longitude,
            wait_time,
            extracted_at_time,
            rating,
            row_number() over (partition by entity_id order by extracted_at_time) as rn
        from themeparkwizard.predictions_table pt
        left join themeparkwizard.dim_park_entity dpe using(entity_id)
        where was_predicted = 1 and dpe.id = '{park_id[0]}'
    ),
    avg_by_entity AS (
        SELECT
            entity_id,
            AVG(avg_standby_waittime) as alltime_avg_waittime
        FROM themeparkwizard.agg_avg_time
        GROUP BY 1
    ),
    first_group as (
        select extracted_at_time,
               wait_time,
               entity_name,
               latitude,
               longitude,
               name,
               rating,
               alltime_avg_waittime
        from number_row
                 left join avg_by_entity
                           using (entity_id)
        where rn <> 1
        order by extracted_at_time, entity_name
    )
    select
        extracted_at_time,
        a.entity_name as src_node,
        b.entity_name as dst_node,
        SQRT(POWER((b.latitude - a.latitude)*111, 2) + POWER((b.longitude - a.longitude)*111, 2)) AS euclidean_distance,
        b.wait_time,
        b.alltime_avg_waittime,
        b.rating
    from first_group a
    full join first_group b
    using(extracted_at_time, name)
    order by 1,2,3
        """)
    # extracted_at_time datetime 0
    # source_node (A) string 1
    # destination_node (B) string 2
    # euclidean distance (from A to B) float 3
    # wait time (queue B) float 4
    # all time wait time (queue B) float 5
    # rating (B) float 6
        predicted_data[park_id[0]] = cur.fetchall()

In [7]:
# len(predicted_data)
for k,v in predicted_data.items():
    print(k, len(v), sep='->')

75ea578a-adc8-4116-a54d-dccb60765ef9->45056
eb3f4560-2383-4a36-9152-6b3e5ed6bc57->0
bc4005c5-8c7e-41d7-b349-cdddf1796427->0
47f90d2c-e191-4239-a466-5892ef59a88b->12312
288747d1-8b4f-4a64-867e-ea7c9b27bad8->9882
1c84a229-8862-4648-9c71-378ddd2c7693->2432
267615cc-8943-4c2a-ae2c-5da728ca591f->0


In [8]:
predicted_data['1c84a229-8862-4648-9c71-378ddd2c7693'][:3]

[(datetime.datetime(2025, 8, 17, 11, 30),
  'Avatar Flight of Passage',
  'Avatar Flight of Passage',
  0.0,
  88.52124,
  70.63686475267764,
  4.8),
 (datetime.datetime(2025, 8, 17, 11, 30),
  'Avatar Flight of Passage',
  'DINOSAUR',
  0.4119463787656751,
  5.1826177,
  22.41886353685162,
  4.5),
 (datetime.datetime(2025, 8, 17, 11, 30),
  'Avatar Flight of Passage',
  'Expedition Everest - Legend of the Forbidden Mountain',
  0.6208606520284927,
  1.0,
  31.603592562012242,
  4.8)]

In [9]:
def calculate_weight(rating: float, queue_i: float, dist: float, queue_avg: float):
    value = 4**(10/rating)*(queue_i*dist)/queue_avg
    return value, round(queue_i + 5, -1)

def fill_time_matrix(matrix, len_a, bad_node):
    new_matrix = []
    for i in range(len_a):
        new_matrix.append([None]*len_a)
        for j in range(len_a):
            try:
                new_matrix[i][j] = matrix[i][j]
            except IndexError:
                new_matrix[i][j] = bad_node
    return new_matrix

def create_map_attr(query_result):
    map_attr = defaultdict(dict)

    for park_id in query_result:
        end_creation = False
        dest_node = ''
        attr_idx = 0
        for row in query_result[park_id]:
        # Fill map_attractions with a symbol
            for i in range(1,3):
                if not row[i] in [k for k, _ in map_attr[park_id].values()]:
                    key_map = ''.join(random.choices(string.ascii_uppercase, k=2))
                    map_attr[park_id][key_map] = (row[i],attr_idx)
                    attr_idx += 1
            if row[1] != row[2] and not end_creation:
                dest_node = row[2]
                end_creation = True
            if row[1] == dest_node:
                break
    return map_attr

def create_cost_by_time(query_result, len_attr):
    INF_MAX = math.inf
    bad_node = (INF_MAX, 60*24*10)
    cost_time = defaultdict(dict)
    last_node = ''

    for park_id in query_result:
        tmp_range = []
        for row in query_result[park_id]:
            # Next row on matrix!
            if last_node != row[1]:
                if last_node != '':
                    tmp_range.append(inner_tmp_range.copy())
                last_node = row[1]
                inner_tmp_range = []
            # Checkout date matrix
            if not cost_time[park_id].get(row[0]):
                if tmp_range:
                    cost_time[park_id][row[0]] = fill_time_matrix(tmp_range, len_attr[park_id], bad_node)
                tmp_range = []
            # Calculate cost
            if row[1] == row[2]:
                inner_tmp_range.append(bad_node)
            else:
                inner_tmp_range.append(calculate_weight(row[6],row[4],row[3],row[5]))
    return cost_time

In [10]:
map_attractions = create_map_attr(predicted_data)
len_attractions = {k:len(map_attractions[k]) for k in map_attractions}
map_by_time = create_cost_by_time(predicted_data, len_attractions)


In [11]:
predicted_data['1c84a229-8862-4648-9c71-378ddd2c7693'][:3]

[(datetime.datetime(2025, 8, 17, 11, 30),
  'Avatar Flight of Passage',
  'Avatar Flight of Passage',
  0.0,
  88.52124,
  70.63686475267764,
  4.8),
 (datetime.datetime(2025, 8, 17, 11, 30),
  'Avatar Flight of Passage',
  'DINOSAUR',
  0.4119463787656751,
  5.1826177,
  22.41886353685162,
  4.5),
 (datetime.datetime(2025, 8, 17, 11, 30),
  'Avatar Flight of Passage',
  'Expedition Everest - Legend of the Forbidden Mountain',
  0.6208606520284927,
  1.0,
  31.603592562012242,
  4.8)]

In [12]:
map_attractions

defaultdict(dict,
            {'75ea578a-adc8-4116-a54d-dccb60765ef9': {'FG': ('Astro Orbiter',
               0),
              'XK': ("Buzz Lightyear's Space Ranger Spin", 1),
              'NK': ('Dumbo the Flying Elephant', 2),
              'TG': ('"it\'s a small world"', 3),
              'IU': ('Mad Tea Party', 4),
              'UN': ("Mickey's PhilharMagic", 5),
              'ZO': ("Peter Pan's Flight", 6),
              'ID': ('Prince Charming Regal Carrousel', 7),
              'TW': ('Seven Dwarfs Mine Train', 8),
              'CF': ('Space Mountain', 9),
              'NG': ('The Barnstormer', 10),
              'KJ': ('The Many Adventures of Winnie the Pooh', 11),
              'MH': ('Tomorrowland Speedway', 12),
              'MN': ('Tomorrowland Transit Authority PeopleMover', 13),
              'BP': ('Under the Sea - Journey of The Little Mermaid', 14),
              'YW': ("Walt Disney's Carousel of Progress", 15)},
             '47f90d2c-e191-4239-a466-5892ef59a8

In [13]:
map_by_time['1c84a229-8862-4648-9c71-378ddd2c7693'][datetime(2025, 8, 17, 11, 40)]

[[(inf, 14400),
  (2.0734203746683217, 10.0),
  (0.35281686046200456, 10.0),
  (0.031154141956775357, 10.0)],
 [(9.271466049514272, 90.0),
  (inf, 14400),
  (0.21095242182640433, 10.0),
  (0.16977655532441008, 10.0)],
 [(13.97339254203222, 90.0),
  (1.8684263935765935, 10.0),
  (inf, 14400),
  (0.2836661560590989, 10.0)],
 [(1.4657655901855842, 90.0),
  (1.7863452310816772, 10.0),
  (0.33697968724890964, 10.0),
  (inf, 14400)]]

In [14]:
map_by_time['1c84a229-8862-4648-9c71-378ddd2c7693'][datetime(2025, 8, 17, 11, 30)]

[[(0.9786283937096293, 10.0),
  (3.0059052871456076, 50.0),
  (0.8060600673978527, 10.0),
  (5.277188476128824, 40.0)],
 [(inf, 14400), (inf, 14400), (inf, 14400), (inf, 14400)],
 [(inf, 14400), (inf, 14400), (inf, 14400), (inf, 14400)],
 [(inf, 14400), (inf, 14400), (inf, 14400), (inf, 14400)]]

In [15]:
import genetic_algorithm_tour as gat

POPULATION_SIZE = 10
NUM_GENERATIONS = 30
CROSSOVER_RATE = 0.8
MUTATION_RATE = 0.01

best_gene, best_cost = gat.genetic_algorithm(
    pop_size=POPULATION_SIZE,
    num_generations=10,
    allele_map=map_attractions['1c84a229-8862-4648-9c71-378ddd2c7693'],
    predicted_map=map_by_time['1c84a229-8862-4648-9c71-378ddd2c7693'],
    crossover_rate=CROSSOVER_RATE,
    mutation_rate=MUTATION_RATE
)

['YG', 'ZL', 'ZM', 'IF']
2025-08-17 11:30:00
[['IF', 'ZL', 'YG', 'ZM'], ['YG', 'IF', 'ZM', 'ZL'], ['ZL', 'YG', 'IF', 'ZM'], ['IF', 'ZL', 'YG', 'ZM'], ['IF', 'ZL', 'ZM', 'YG'], ['YG', 'ZM', 'ZL', 'IF'], ['ZM', 'IF', 'ZL', 'YG'], ['ZM', 'IF', 'YG', 'ZL'], ['YG', 'IF', 'ZM', 'ZL'], ['ZM', 'IF', 'ZL', 'YG']]
[inf, 6.904496298432806, inf, inf, inf, 2.8442630162988563, inf, inf, 6.904496298432806, inf] | 5 | [['YG', 'ZM', 'ZL', 'IF'], ['YG', 'IF', 'ZM', 'ZL'], ['YG', 'ZM', 'ZL', 'IF'], ['YG', 'IF', 'ZM', 'ZL'], ['YG', 'IF', 'ZM', 'ZL'], ['YG', 'ZM', 'ZL', 'IF'], ['YG', 'IF', 'ZM', 'ZL'], ['YG', 'ZM', 'ZL', 'IF'], ['YG', 'IF', 'ZM', 'ZL'], ['YG', 'ZM', 'ZL', 'IF']]
Generation 1:
Best Fitness to f(r, q, d) = 5
Best individual is ['YG', 'ZM', 'ZL', 'IF']
[2.8442630162988563, 6.904496298432806, 2.8442630162988563, 6.904496298432806, 6.904496298432806, 2.8442630162988563, 6.904496298432806, 2.8442630162988563, 6.904496298432806, 2.8442630162988563] | 0 | [['YG', 'ZM', 'ZL', 'IF'], ['YG', 'ZM', 'Z

In [16]:
for e in range(10):
    print(gat.crossover(['HI', 'IH', 'OM', 'LS'], ['LS', 'OM', 'HI', 'IH']))

(['IH', 'OM', 'HI', 'LS'], ['LS', 'HI', 'OM', 'IH'])
(['LS', 'OM', 'HI', 'IH'], ['HI', 'IH', 'OM', 'LS'])
(['LS', 'OM', 'HI', 'IH'], ['HI', 'IH', 'LS', 'OM'])
(['LS', 'IH', 'OM', 'HI'], ['HI', 'OM', 'IH', 'LS'])
(['LS', 'OM', 'HI', 'IH'], ['HI', 'IH', 'OM', 'LS'])
(['IH', 'OM', 'HI', 'LS'], ['LS', 'HI', 'OM', 'IH'])
(['LS', 'IH', 'OM', 'HI'], ['HI', 'OM', 'IH', 'LS'])
(['LS', 'OM', 'HI', 'IH'], ['HI', 'IH', 'OM', 'LS'])
(['IH', 'OM', 'LS', 'HI'], ['OM', 'IH', 'HI', 'LS'])
(['LS', 'OM', 'HI', 'IH'], ['HI', 'IH', 'OM', 'LS'])


In [17]:
for e in range(10):
    print(gat.mutation(['HI', 'IH', 'OM']))

['HI', 'OM', 'IH']
['IH', 'HI', 'OM']
['OM', 'IH', 'HI']
['OM', 'IH', 'HI']
['HI', 'OM', 'IH']
['HI', 'OM', 'IH']
['HI', 'OM', 'IH']
['IH', 'HI', 'OM']
['OM', 'IH', 'HI']
['HI', 'OM', 'IH']


In [18]:
# #preparing
#
# def remove_node_matrix(m, c_x, c_y):
#     for e in range(len(m)):
#         m[e].pop(y)
#     m.pop(x)
#
#
# initial_time = min([t for t in map_by_time['1c84a229-8862-4648-9c71-378ddd2c7693'].keys()])
# code_map_list_sorted = [e[0] for e in sorted(map_attractions['1c84a229-8862-4648-9c71-378ddd2c7693'].items(), key=lambda al:al[1][1])]
#
# # Find initial node according to minimum cost
# x, y, _ = gat.find_minimum(map_by_time['1c84a229-8862-4648-9c71-378ddd2c7693'][initial_time])
# force_initial_node = code_map_list_sorted[x]
#
# for time in map_by_time['1c84a229-8862-4648-9c71-378ddd2c7693']:
#     map_by_time['1c84a229-8862-4648-9c71-378ddd2c7693'][time] = remove_node_map(map_by_time['1c84a229-8862-4648-9c71-378ddd2c7693'][time], x, y)


In [19]:
# Finish session
spark.stop()

In [20]:
%pip list

Package                       Version
----------------------------- ------------
alembic                       1.12.0
altair                        5.1.2
anyio                         4.0.0
argon2-cffi                   23.1.0
argon2-cffi-bindings          21.2.0
arrow                         1.3.0
asttokens                     2.4.0
async-generator               1.10
async-lru                     2.0.4
attrs                         23.1.0
Babel                         2.13.0
backcall                      0.2.0
backports.functools-lru-cache 1.6.5
beautifulsoup4                4.12.2
bleach                        6.1.0
blinker                       1.6.3
bokeh                         3.3.0
boltons                       23.0.0
Bottleneck                    1.3.7
Brotli                        1.1.0
cached-property               1.5.2
certifi                       2023.7.22
certipy                       0.1.3
cffi                          1.16.0
charset-normalizer            3.3.0
click   

In [21]:
# df_dl = spark.read.orc('datalake_layer/epcot')

# # df_dl.printSchema()

In [22]:
# df_dl.createOrReplaceTempView('datalake_table')
# test_df = spark.sql("""
# SELECT 
#     extracted_date,
#     name AS attraction_name, 
#     AVG(queue.STANDBY.waitTime) AS avg_standby_waittime
# FROM datalake_table
# WHERE entity_type = 'ATTRACTION' AND queue.STANDBY.waitTime is not null
# GROUP BY 1, 2
# ORDER BY 1
# """)
# test_df.printSchema()
# test_df.write.jdbc(url=DB_URL, table="themeparkwizard.agg_avg_time_epcot", mode='overwrite', properties=PROPERTIES_CUSTOM)

In [23]:
# INTERVAL_OF_MINUTES = 60 * 15

In [24]:
# rest_df = spark.sql(f"""
# WITH wait_by_party AS (
#     SELECT
#         extracted_date,
#         date_format(cast(floor(try_divide(extracted_at, {INTERVAL_OF_MINUTES}))*{INTERVAL_OF_MINUTES} as timestamp), 'HH:mm') as time_of_the_day, --extracted_date,
#         name AS attraction_name,
#         CASE 
#         WHEN da_exp.partySize <= 2 THEN
#             'Small group (<= 2)'
#         WHEN da_exp.partySize > 2 AND da_exp.partySize <= 4 THEN
#             'Medium group (3 and 4)'
#         WHEN da_exp.partySize > 4 AND da_exp.partySize <= 6 THEN
#             'Medium group (5 ant 6)'
#         WHEN da_exp.partySize > 6 THEN
#             'Big group (> 6)'
#         END as party_size,
#         COALESCE(AVG(da_exp.waitTime), 0) as avg_wait_time,
#         STDDEV(da_exp.waitTime) AS stddev_wait_time
#     FROM datalake_table
#     LATERAL VIEW EXPLODE(diningAvailability) as da_exp
#     WHERE entity_type = 'SHOW' --AND name = 'Garden Grill Restaurant'
#     GROUP BY 1,2,3,4
# )
# SELECT 
#     *
# FROM wait_by_party
# ORDER BY 1, 2, 3, 4
# """).show(50, truncate=False)

In [25]:
# rest_df.printSchema()
# rest_df.write.jdbc(url=DB_URL, table="themeparkwizard.restaurant_wait_time_epcot", mode='overwrite', properties=PROPERTIES_CUSTOM)