# Transformation metrics
This script is responsible create all metrics necessary and upload them to PostgreSQL

In [1]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [11]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import explode, from_unixtime, col, to_date, sum, avg
from pyspark.sql.types import DateType, TimestampType, StructType
from prophet import Prophet

import requests
import json
from collections import defaultdict
import locale
import os

DB_URL = "jdbc:postgresql://postgres:5432/themeparkwizard"
PROPERTIES_CUSTOM = {"user": os.environ['POSTGRES_USER'],"password": os.environ['POSTGRES_PASSWORD'], "driver": "org.postgresql.Driver"}

spark = SparkSession.builder \
    .appName("MetricBuilder") \
    .config("spark.jars", "jars/postgresql-42.7.7.jar") \
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic") \
    .getOrCreate()

Importing plotly failed. Interactive plots will not work.


In [None]:
df_dl = spark.read.orc('datalake_layer/epcot')

# df_dl.printSchema()

In [5]:
df_dl.createOrReplaceTempView('datalake_table')
test_df = spark.sql("""
SELECT 
    extracted_date,
    name AS attraction_name, 
    AVG(queue.STANDBY.waitTime) AS avg_standby_waittime
FROM datalake_table
WHERE entity_type = 'ATTRACTION' AND queue.STANDBY.waitTime is not null
GROUP BY 1, 2
ORDER BY 1
""")
test_df.printSchema()
test_df.write.jdbc(url=DB_URL, table="themeparkwizard.agg_avg_time_epcot", mode='overwrite', properties=PROPERTIES_CUSTOM)

root
 |-- extracted_date: date (nullable = true)
 |-- attraction_name: string (nullable = true)
 |-- avg_standby_waittime: double (nullable = true)



In [9]:
df_dl_to_model = df_dl.where("name == 'Guardians of the Galaxy: Cosmic Rewind' AND queue.STANDBY.waitTime is not null")\
                        .select('extracted_at_time', 'queue.STANDBY.waitTime')\
                        .orderBy('extracted_at_time')\
                        .withColumnRenamed('extracted_at_time', 'ds')\
                        .withColumnRenamed('queue.STANDBY.waitTime', 'y')
df_dl_to_model.show()

+-------------------+--------+
|                 ds|waitTime|
+-------------------+--------+
|2025-07-25 12:35:04|      30|
|2025-07-25 12:40:04|      30|
|2025-07-25 12:45:06|      30|
|2025-07-25 12:50:05|      45|
|2025-07-25 12:55:08|      45|
|2025-07-25 13:00:05|      45|
|2025-07-25 13:05:05|      60|
|2025-07-25 13:10:05|      70|
|2025-07-25 13:15:05|      80|
|2025-07-25 13:20:04|      80|
|2025-07-25 13:25:05|      80|
|2025-07-25 13:30:04|      80|
|2025-07-25 13:35:05|      80|
|2025-07-25 13:40:05|      80|
|2025-07-25 13:45:04|      70|
|2025-07-25 13:50:04|      70|
|2025-07-25 13:55:04|      70|
|2025-07-25 14:00:06|      70|
|2025-07-25 14:05:04|      70|
|2025-07-25 14:10:05|      70|
+-------------------+--------+
only showing top 20 rows



In [12]:
model = Prophet()
model.fit(df_dl_to_model)

PySparkValueError: [CANNOT_CONVERT_COLUMN_INTO_BOOL] Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.

In [None]:
model.predict()

In [44]:
INTERVAL_OF_MINUTES = 60 * 15

In [48]:
rest_df = spark.sql(f"""
WITH wait_by_party AS (
    SELECT
        extracted_date,
        date_format(cast(floor(try_divide(extracted_at, {INTERVAL_OF_MINUTES}))*{INTERVAL_OF_MINUTES} as timestamp), 'HH:mm') as time_of_the_day, --extracted_date,
        name AS attraction_name,
        CASE 
        WHEN da_exp.partySize <= 2 THEN
            'Small group (<= 2)'
        WHEN da_exp.partySize > 2 AND da_exp.partySize <= 4 THEN
            'Medium group (3 and 4)'
        WHEN da_exp.partySize > 4 AND da_exp.partySize <= 6 THEN
            'Medium group (5 ant 6)'
        WHEN da_exp.partySize > 6 THEN
            'Big group (> 6)'
        END as party_size,
        COALESCE(AVG(da_exp.waitTime), 0) as avg_wait_time,
        STDDEV(da_exp.waitTime) AS stddev_wait_time
    FROM datalake_table
    LATERAL VIEW EXPLODE(diningAvailability) as da_exp
    WHERE entity_type = 'SHOW' --AND name = 'Garden Grill Restaurant'
    GROUP BY 1,2,3,4
)
SELECT 
    *
FROM wait_by_party
ORDER BY 1, 2, 3, 4
""").show(50, truncate=False)

+--------------+---------------+---------------+----------+-------------+----------------+
|extracted_date|time_of_the_day|attraction_name|party_size|avg_wait_time|stddev_wait_time|
+--------------+---------------+---------------+----------+-------------+----------------+
+--------------+---------------+---------------+----------+-------------+----------------+



In [46]:
rest_df.printSchema()
rest_df.write.jdbc(url=DB_URL, table="themeparkwizard.restaurant_wait_time_epcot", mode='overwrite', properties=PROPERTIES_CUSTOM)

root
 |-- extracted_date: date (nullable = true)
 |-- time_of_the_day: string (nullable = true)
 |-- attraction_name: string (nullable = true)
 |-- party_size: string (nullable = true)
 |-- avg_wait_time: double (nullable = false)
 |-- stddev_wait_time: double (nullable = true)

