# Predict metrics
This script is responsible for creating the prediction metrics.

In [72]:
# SET CONSTANTS
# Step between consecutive timestamps of the prediction grid. Despite the
# name, the value is consumed as a number of SECONDS (it is the step of a
# range() whose items feed timedelta(seconds=...)): 60*10 = one point every
# 10 minutes.
BY_MINUTES = 60*10
# Number of whole days covered by the prediction grid.
DAYS_TO_PREDICT = 2

In [2]:
from IPython.core.display import HTML
# Inject a CSS rule so that <pre> blocks in the notebook output are not
# line-wrapped (white-space: pre).
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [49]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import explode, from_unixtime, col, to_date, sum, avg, udf, lit, date_trunc, when
from pyspark.sql.types import DateType, TimestampType, StructType, StructField, IntegerType
from prophet import Prophet
from prophet.serialize import model_to_json, model_from_json

import requests
import json
from collections import defaultdict
import locale
import os
import re
from glob import glob
from datetime import datetime, timedelta, date, tzinfo, timezone

# JDBC endpoint of the PostgreSQL database that receives the results.
DB_URL = "jdbc:postgresql://postgres:5432/themeparkwizard"
# Connection properties passed to the JDBC writer; the credentials are read
# from the environment (a missing variable raises KeyError here).
PROPERTIES_CUSTOM = {
    "user": os.environ['POSTGRES_USER'],
    "password": os.environ['POSTGRES_PASSWORD'],
    "driver": "org.postgresql.Driver",
}

# Spark session for this notebook, with the PostgreSQL JDBC driver jar
# attached and dynamic partition overwrite enabled.
spark = (
    SparkSession.builder
    .appName("MetricPredict")
    .config("spark.jars", "jars/postgresql-42.7.7.jar")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .getOrCreate()
)

In [73]:
@udf(returnType=TimestampType())
def min_hour(data):
    """Return the ``startTime`` of the first 'Early Entry' entry in ``data``.

    Yields ``None`` when ``data`` is not a list or when no entry has
    ``type == 'Early Entry'``.
    """
    if not isinstance(data, list):
        return None
    early_entry_starts = (
        entry.startTime for entry in data if entry.type == 'Early Entry'
    )
    return next(early_entry_starts, None)

@udf(returnType=TimestampType())
def max_hour(data):
    """Return the ``endTime`` of the first 'Operating' entry in ``data``.

    Yields ``None`` when ``data`` is not a list or when no entry has
    ``type == 'Operating'``.
    """
    if not isinstance(data, list):
        return None
    operating_ends = (
        entry.endTime for entry in data if entry.type == 'Operating'
    )
    return next(operating_ends, None)

def save_prophet_model(model, path):
    """Serialize a fitted Prophet model to ``<path>.json``.

    Parameters
    ----------
    model: The Prophet model to serialize with ``model_to_json``.
    path: Destination path WITHOUT the ``.json`` extension (it is appended).
    """
    # Serialize before touching the file system: opening the file in 'w' mode
    # truncates it immediately, so a serialization error would otherwise
    # leave an empty (or clobbered) model file behind.
    payload = model_to_json(model)
    target = path + '.json'
    # Make sure the destination directory exists so the first save does not
    # fail with FileNotFoundError.
    parent_dir = os.path.dirname(target)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(target, 'w') as fout:
        fout.write(payload)  # Save model
        
def load_prophet_model(path):
    """Rebuild the Prophet model stored in ``<path>.json``.

    ``path`` is given without the ``.json`` extension; the file content is
    handed to ``model_from_json`` and the resulting model is returned.
    """
    source = path + '.json'
    with open(source, 'r') as handle:
        serialized = handle.read()
    return model_from_json(serialized)  # Load model

def warm_start_params(m):
    """
    Retrieve parameters from a trained model in the format used to initialize a new Stan model.
    Note that the new Stan model must have these same settings:
        n_changepoints, seasonality features, mcmc sampling
    for the retrieved parameters to be valid for the new model.

    Parameters
    ----------
    m: A trained model of the Prophet class. Only ``m.mcmc_samples`` and
       ``m.params`` (indexed by 'k', 'm', 'sigma_obs', 'delta', 'beta') are read.

    Returns
    -------
    A Dictionary containing retrieved parameters of m: scalars for
    'k', 'm' and 'sigma_obs', vectors for 'delta' and 'beta'.
    """
    # Imported locally because this file never imports numpy at module level;
    # without it the MCMC branches below raised NameError on ``np``.
    import numpy as np

    res = {}
    for pname in ['k', 'm', 'sigma_obs']:
        if m.mcmc_samples == 0:
            # Single optimisation result: take the only stored value.
            res[pname] = m.params[pname][0][0]
        else:
            # MCMC: average over all posterior samples.
            res[pname] = np.mean(m.params[pname])
    for pname in ['delta', 'beta']:
        if m.mcmc_samples == 0:
            res[pname] = m.params[pname][0]
        else:
            # Average sample-wise (axis 0) so the vector shape is preserved.
            res[pname] = np.mean(m.params[pname], axis=0)
    return res

In [106]:
# Clock with a fixed UTC-3 offset, used to anchor the prediction window.
tz = timezone(timedelta(hours=-3))
now = datetime.now(tz)
# The window starts two calendar days before the current date.
start_predict = (now - timedelta(days=2)).date()

# Naive midnight of the first day of the window. An earlier variant derived
# the window from one attraction's operating hours; the grid now spans whole
# days and is narrowed per attraction in the prediction loop.
new_day = datetime(start_predict.year, start_predict.month, start_predict.day)

# One row every BY_MINUTES seconds over DAYS_TO_PREDICT full days, each row
# carrying the constant floor (5) and cap (400) values.
data = [
    (new_day + timedelta(seconds=offset), 5, 400)
    for offset in range(0, 60*60*24*DAYS_TO_PREDICT, BY_MINUTES)
]
schema = (
    StructType()
    .add("ds", TimestampType(), False)
    .add("floor", IntegerType(), False)
    .add("cap", IntegerType(), False)
)
period_to_predict = spark.createDataFrame(data, schema).cache()

In [111]:
# Preview the first 10 rows of the prediction grid.
period_to_predict.show(10)

+-------------------+-----+---+
|                 ds|floor|cap|
+-------------------+-----+---+
|2025-08-14 00:00:00|    5|400|
|2025-08-14 00:10:00|    5|400|
|2025-08-14 00:20:00|    5|400|
|2025-08-14 00:30:00|    5|400|
|2025-08-14 00:40:00|    5|400|
|2025-08-14 00:50:00|    5|400|
|2025-08-14 01:00:00|    5|400|
|2025-08-14 01:10:00|    5|400|
|2025-08-14 01:20:00|    5|400|
|2025-08-14 01:30:00|    5|400|
+-------------------+-----+---+
only showing top 10 rows



In [113]:
# For each park directory in the data lake: read the attraction records, and
# for each attraction fit (or warm-start) a Prophet model on its STANDBY wait
# times, predict over the prediction grid, and write observed + predicted
# values to PostgreSQL.
for path in glob('datalake_layer/*'):
    path_to_read = path+'/entity_type=attraction/'
    print(path_to_read)
    df = spark.read.orc(path_to_read).cache()

    attractions = [r.name for r in df.select('name').distinct().collect()]
    for att in attractions:
        print(f'Running for {att}')
        # Match the name through the Column API instead of interpolating it
        # into a SQL string: a name containing an apostrophe would otherwise
        # produce an invalid filter expression.
        df_cached = df.where(col('name') == att)\
                            .where("queue.STANDBY.waitTime is not null")\
                            .withColumn('start_time', min_hour(col('operatingHours')))\
                            .withColumn('end_time', max_hour(col('operatingHours')))\
                            .filter(col('extracted_at_time').between(col('start_time'), col('end_time'))).cache()
        if df_cached.isEmpty():
            print(f'No data for {att}!')
            df_cached.unpersist()
            continue

        # Get inferior and superior limits for this attraction (taken from one
        # row; start_time/end_time were already computed above).
        limits = df_cached.select('start_time', 'end_time')\
                                .limit(1)\
                                .collect()[0]

        print(limits)
        df_attraction = df_cached.select('extracted_at_time', 'queue.STANDBY.waitTime', 'extracted_at')
        df_attraction.createOrReplaceTempView('current_table')
        # Training set = raw observations plus their averages per 600-second
        # bucket of extracted_at, with calendar features and the constant
        # floor/cap columns used by the logistic-growth model.
        df_attraction = spark.sql("""
        WITH enrich_data AS (
            SELECT
                cast(floor(try_divide(extracted_at, 600))*600 as timestamp) as extracted_at_time,
                AVG(waitTime) as waitTime
            FROM current_table
            GROUP BY 1
            UNION ALL
            SELECT
                extracted_at_time,
                waitTime
            FROM current_table
        )
        SELECT 
            ed.*,
            year(extracted_at_time) as year,
            month(extracted_at_time) as month,
            dayofweek(extracted_at_time) as dayofweek,
            hour(extracted_at_time) as hour,
            minute(extracted_at_time) as minute,
            5 as floor,
            400 as cap
        FROM enrich_data ed
        ORDER BY ed.extracted_at_time DESC
        """)

        # Prophet expects the columns 'ds' (timestamp) and 'y' (target).
        df_attraction = df_attraction.drop('extracted_at')\
                                    .withColumnRenamed('extracted_at_time', 'ds')\
                                    .withColumnRenamed('waitTime', 'y')
        # Re-register the view: the final UNION query reads ds/y from it.
        df_attraction.createOrReplaceTempView('current_table')

        # Build model
        stringify_att = re.sub(r'[ -]*', '', att)
        model_filename = path.rpartition('/')[-1] + '_' + stringify_att
        model_path = 'models/'+model_filename
        history = df_attraction.toPandas()
        # The same configuration is used for fresh and warm-started fits:
        # warm-start parameters are only valid for a model with matching
        # settings (see warm_start_params).
        model = Prophet(growth='logistic')
        model.add_country_holidays(country_name='US')
        # save_prophet_model() appends '.json', so the existence check must
        # look for that file; checking the bare path never found a model.
        if os.path.exists(model_path + '.json'):
            previous_model = load_prophet_model(model_path)
            model.fit(history, init=warm_start_params(previous_model))
        else:
            model.fit(history)
        # Persist the latest fit so the next run warm-starts from it.
        save_prophet_model(model, model_path)

        # Daily [opening, closing] windows, starting at start_predict.
        st = datetime(start_predict.year, start_predict.month, start_predict.day, limits.start_time.hour, limits.start_time.minute)
        et = datetime(start_predict.year, start_predict.month, start_predict.day, limits.end_time.hour, limits.end_time.minute)
        filter_list_str = []
        for _ in range(DAYS_TO_PREDICT):
            print(st, et)
            scol = st.strftime("%Y-%m-%d %H:%M:%S")
            ecol = et.strftime("%Y-%m-%d %H:%M:%S")
            filter_list_str.append(f"ds BETWEEN '{scol}' AND '{ecol}'")
            st += timedelta(days=1)
            et += timedelta(days=1)
        # Filter into a local frame: rebinding the shared period_to_predict
        # would make every later attraction inherit the previous filters.
        attraction_period = period_to_predict.where(' OR '.join(filter_list_str))

        predicted_df = spark.createDataFrame(model.predict(attraction_period.toPandas()))
        predicted_df.show(20)
        predicted_df.createOrReplaceTempView('predicted_table')
        # Result = observed points (was_predicted = 0), the most recent
        # observed point repeated with was_predicted = 1, and the predictions.
        res = spark.sql("""
        SELECT
            ds as extracted_at_time,
            y as waitTime,
            0 as was_predicted
        FROM current_table
        UNION ALL
        SELECT
            ds as extracted_at_time,
            y as waitTime,
            1 as was_predicted
        FROM (SELECT ds, y, ROW_NUMBER() OVER (ORDER BY ds DESC) as rn FROM current_table)
        WHERE rn = 1
        UNION ALL
        SELECT
            ds as extracted_at_time,
            yhat as waitTime,
            1 as was_predicted
        FROM predicted_table
        """).withColumn('attraction_name', lit(att))
        res.write.jdbc(url=DB_URL, table="themeparkwizard.predictions_table", mode='overwrite', properties=PROPERTIES_CUSTOM)

        # Release the cached per-attraction frame; this used to sit after the
        # break below (unreachable) and targeted a frame that was not cached.
        df_cached.unpersist()
        # NOTE(review): stops after the first attraction that has data. The
        # write above uses mode='overwrite', so removing this break would keep
        # only the last attraction unless the write mode is changed as well.
        break

    df.unpersist()
    # NOTE(review): stops after the first park directory.
    break

datalake_layer/animal_kingdom/entity_type=attraction/
Running for Discovery Island Trails
No data for Discovery Island Trails!
Running for Wildlife Express Train
No data for Wildlife Express Train!
Running for Kilimanjaro Safaris
No data for Kilimanjaro Safaris!
Running for Tree of Life
No data for Tree of Life!
Running for Dino-Sue
No data for Dino-Sue!
Running for Wilderness Explorers
No data for Wilderness Explorers!
Running for The Animation Experience at Conservation Station
No data for The Animation Experience at Conservation Station!
Running for Avatar Flight of Passage
Row(start_time=datetime.datetime(2025, 7, 31, 11, 30), end_time=datetime.datetime(2025, 7, 31, 22, 0))


07:55:36 - cmdstanpy - INFO - Chain [1] start processing
07:55:38 - cmdstanpy - INFO - Chain [1] done processing


2025-08-14 11:30:00 2025-08-14 22:00:00
2025-08-15 11:30:00 2025-08-15 22:00:00
+-------------------+------------------+---+-----+------------------+-----------------+------------------+------------------+-------------+-------------------+-------------------+------------+------------------+------------------+----------------+----------------------+----------------------+------------------------------------+------------------------------------------+------------------------------------------+---------+---------------+---------------+--------------------------+--------------------------------+--------------------------------+------------+------------------+------------------+--------------+--------------------+--------------------+----------------+----------------------+----------------------+------------+------------------+------------------+---------------------+---------------------------+---------------------------+-------------------+--------------------+--------------------+-------

In [None]:
# Load every ORC file under the Epcot data-lake directory.
# NOTE(review): the queries on this frame filter on an entity_type column;
# presumably it comes from 'entity_type=...' partition directories — confirm.
df_dl = spark.read.orc('datalake_layer/epcot')

# df_dl.printSchema()

In [5]:
# Expose the Epcot frame to Spark SQL as 'datalake_table'.
df_dl.createOrReplaceTempView('datalake_table')
# Average STANDBY wait time per extraction date and attraction name.
# NOTE(review): the filter compares entity_type with upper-case 'ATTRACTION',
# while the prediction cell reads a directory named 'entity_type=attraction'
# — confirm which casing the stored values use.
test_df = spark.sql("""
SELECT 
    extracted_date,
    name AS attraction_name, 
    AVG(queue.STANDBY.waitTime) AS avg_standby_waittime
FROM datalake_table
WHERE entity_type = 'ATTRACTION' AND queue.STANDBY.waitTime is not null
GROUP BY 1, 2
ORDER BY 1
""")
test_df.printSchema()
# Replace the contents of the aggregate table in PostgreSQL.
test_df.write.jdbc(url=DB_URL, table="themeparkwizard.agg_avg_time_epcot", mode='overwrite', properties=PROPERTIES_CUSTOM)

root
 |-- extracted_date: date (nullable = true)
 |-- attraction_name: string (nullable = true)
 |-- avg_standby_waittime: double (nullable = true)



In [44]:
# Bucket width used to round extracted_at in the restaurant query. Despite the
# name, the value is 60 * 15 = 900, i.e. 15 minutes if extracted_at is in
# seconds (NOTE(review): confirm the unit of extracted_at).
INTERVAL_OF_MINUTES = 60 * 15

In [48]:
# Average and standard deviation of dining wait time per extraction date,
# time-of-day bucket (INTERVAL_OF_MINUTES wide), venue and party-size band.
#
# The DataFrame is assigned first and displayed in a separate statement:
# DataFrame.show() returns None, so chaining it onto the assignment left
# rest_df set to None and broke the later printSchema()/write calls.
#
# NOTE(review): the recorded output of this cell is an empty table — confirm
# that rows carrying diningAvailability really have entity_type = 'SHOW'.
# NOTE(review): the label 'Medium group (5 ant 6)' looks like a typo for
# 'and'; it is left unchanged because it is written to the output table.
rest_df = spark.sql(f"""
WITH wait_by_party AS (
    SELECT
        extracted_date,
        date_format(cast(floor(try_divide(extracted_at, {INTERVAL_OF_MINUTES}))*{INTERVAL_OF_MINUTES} as timestamp), 'HH:mm') as time_of_the_day, --extracted_date,
        name AS attraction_name,
        CASE 
        WHEN da_exp.partySize <= 2 THEN
            'Small group (<= 2)'
        WHEN da_exp.partySize > 2 AND da_exp.partySize <= 4 THEN
            'Medium group (3 and 4)'
        WHEN da_exp.partySize > 4 AND da_exp.partySize <= 6 THEN
            'Medium group (5 ant 6)'
        WHEN da_exp.partySize > 6 THEN
            'Big group (> 6)'
        END as party_size,
        COALESCE(AVG(da_exp.waitTime), 0) as avg_wait_time,
        STDDEV(da_exp.waitTime) AS stddev_wait_time
    FROM datalake_table
    LATERAL VIEW EXPLODE(diningAvailability) as da_exp
    WHERE entity_type = 'SHOW' --AND name = 'Garden Grill Restaurant'
    GROUP BY 1,2,3,4
)
SELECT 
    *
FROM wait_by_party
ORDER BY 1, 2, 3, 4
""")
rest_df.show(50, truncate=False)

+--------------+---------------+---------------+----------+-------------+----------------+
|extracted_date|time_of_the_day|attraction_name|party_size|avg_wait_time|stddev_wait_time|
+--------------+---------------+---------------+----------+-------------+----------------+
+--------------+---------------+---------------+----------+-------------+----------------+



In [46]:
# Print the schema of the restaurant aggregate, then replace the contents of
# the corresponding PostgreSQL table with it.
rest_df.printSchema()
rest_df.write.jdbc(url=DB_URL, table="themeparkwizard.restaurant_wait_time_epcot", mode='overwrite', properties=PROPERTIES_CUSTOM)

root
 |-- extracted_date: date (nullable = true)
 |-- time_of_the_day: string (nullable = true)
 |-- attraction_name: string (nullable = true)
 |-- party_size: string (nullable = true)
 |-- avg_wait_time: double (nullable = false)
 |-- stddev_wait_time: double (nullable = true)

