# Transformation metrics
This notebook computes all the required metrics and uploads them to PostgreSQL.

In [1]:
from IPython.core.display import HTML
# Inject a CSS rule so <pre> output keeps its whitespace and is not wrapped.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
from pyspark.sql import SparkSession, Window
from pyspark.conf import SparkConf
from pyspark.sql.functions import explode, from_unixtime, col, to_date, sum, avg, udf
from pyspark.sql.types import DateType, TimestampType, StructType, DoubleType, StructField, StringType, DayTimeIntervalType, IntegerType, LongType
from prophet import Prophet

from glob import glob
import requests
import json
from collections import defaultdict
import locale
import os

# PostgreSQL connection settings; the credentials are read from the environment.
DB_URL = "jdbc:postgresql://postgres:5432/themeparkwizard"
PROPERTIES_CUSTOM = {
    "user": os.environ['POSTGRES_USER'],
    "password": os.environ['POSTGRES_PASSWORD'],
    "driver": "org.postgresql.Driver",
}

# Spark session for this notebook; the PostgreSQL JDBC jar is passed via spark.jars.
spark = (
    SparkSession.builder
    .appName("MetricBuilder")
    .config("spark.jars", "jars/postgresql-42.7.7.jar")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .getOrCreate()
)

Importing plotly failed. Interactive plots will not work.


In [3]:
@udf(returnType=TimestampType())
def min_hour(data):
    """Return the ``startTime`` of the first 'Early Entry' element of ``data``.

    Yields None when ``data`` is not a list or when no element has
    ``type == 'Early Entry'``.
    NOTE(review): presumably ``data`` is the ``operatingHours`` array column
    (see the commented-out usage in the transformation loop) -- confirm.
    """
    if not isinstance(data, list):
        return None
    early_entry_starts = (slot.startTime for slot in data if slot.type == 'Early Entry')
    return next(early_entry_starts, None)

@udf(returnType=TimestampType())
def max_hour(data):
    """Return the ``endTime`` of the first 'Operating' element of ``data``.

    Yields None when ``data`` is not a list or when no element has
    ``type == 'Operating'``.
    NOTE(review): presumably ``data`` is the ``operatingHours`` array column
    (see the commented-out usage in the transformation loop) -- confirm.
    """
    if not isinstance(data, list):
        return None
    operating_ends = (slot.endTime for slot in data if slot.type == 'Operating')
    return next(operating_ends, None)

def save_into_postgres(df, table, mode):
    """Write ``df`` to a PostgreSQL table over JDBC.

    Args:
        df: Spark DataFrame to persist.
        table: Target table name, e.g. ``"themeparkwizard.agg_avg_time"``.
        mode: Spark save mode forwarded to the writer (this notebook uses
            ``'append'`` and ``'overwrite'``).

    The connection uses the notebook-level ``DB_URL`` and ``PROPERTIES_CUSTOM``.
    """
    writer = df.write
    writer.jdbc(DB_URL, table, mode=mode, properties=PROPERTIES_CUSTOM)

def agg_avg_time_compute():
    """Compute the average standby wait time per attraction and day, and append it to PostgreSQL.

    Reads the ``datalake_table`` temp view (the caller must register it first),
    keeps ATTRACTION rows that have a standby wait time, averages
    ``queue.STANDBY.waitTime`` per ``(extracted_date, id)`` and appends the
    result to ``themeparkwizard.agg_avg_time``.

    The result is deliberately not sorted: the rows are appended to a database
    table, so an ORDER BY would only add a global sort with no lasting effect.
    """
    print('Computing agg_avg_time...')
    result_avg = spark.sql("""
    SELECT
        extracted_date,
        id AS entity_id,
        AVG(queue.STANDBY.waitTime) AS avg_standby_waittime
    FROM datalake_table
    WHERE entity_type = 'ATTRACTION' AND queue.STANDBY.waitTime IS NOT NULL
    GROUP BY extracted_date, id
    """)
    result_avg.printSchema()
    save_into_postgres(result_avg, "themeparkwizard.agg_avg_time", 'append')

def operating_ratio_compute():
    """Compute, per attraction, the time spent operating vs. not operating, and append it to PostgreSQL.

    Reads the ``datalake_table`` temp view (the caller must register it first).
    For every ATTRACTION snapshot, the interval until the entity's next
    snapshot is attributed to the snapshot's status: ``whole_status`` is 1 for
    'OPERATING' and 0 for anything else. The intervals, computed as differences
    of ``unix_timestamp`` values, are summed per ``(entity_id, whole_status)``
    and appended to ``themeparkwizard.operating_ratio``.

    The last snapshot of each entity has no successor, so its interval is NULL.
    Those rows are filtered out before aggregating; otherwise a group made only
    of such rows would produce a NULL ``time_by_status``. The target table is
    initialised in this notebook with ``time_by_status`` declared non-nullable,
    so a NULL value there is likely to be rejected on append.
    """
    print('Computing operating...')
    result_ratio = spark.sql("""
    WITH table_status AS (
        SELECT
            id AS entity_id,
            status,
            extracted_at_time,
            lead(extracted_at_time, 1) OVER (PARTITION BY id ORDER BY extracted_at_time) AS next_time
        FROM datalake_table
        WHERE entity_type = 'ATTRACTION'
    ), interval_by_status AS (
        SELECT
            entity_id,
            (unix_timestamp(next_time) - unix_timestamp(extracted_at_time)) AS time_passed,
            CASE
            WHEN status = 'OPERATING' THEN 1
            ELSE 0
            END AS whole_status
        FROM table_status
    )
    SELECT
        entity_id,
        sum(time_passed) AS time_by_status,
        whole_status
    FROM interval_by_status
    WHERE time_passed IS NOT NULL
    GROUP BY entity_id, whole_status
    """)
    result_ratio.printSchema()
    save_into_postgres(result_ratio, "themeparkwizard.operating_ratio", 'append')

In [4]:
# Reset both metric tables to empty ones with 'overwrite', so the per-park
# 'append' writes performed later start from a clean table.
# NOTE(review): every column is declared non-nullable here, while the computed
# frames print their columns as nullable = true -- confirm this is intended.

# Daily average standby wait time per attraction.
schema = (
    StructType()
    .add("extracted_date", DateType(), False)
    .add("entity_id", StringType(), False)
    .add("avg_standby_waittime", DoubleType(), False)
)
save_into_postgres(spark.createDataFrame([], schema), "themeparkwizard.agg_avg_time", 'overwrite')

# Accumulated time per attraction and operating flag.
schema = (
    StructType()
    .add("entity_id", StringType(), False)
    .add("time_by_status", LongType(), False)
    .add("whole_status", IntegerType(), False)
)
save_into_postgres(spark.createDataFrame([], schema), "themeparkwizard.operating_ratio", 'overwrite')

In [5]:
# Load dim_park_entity: read the park/entity metadata JSON and replace the
# dimension table, going through the same JDBC helper as the metric tables.
df_parks = spark.read.json('general_schemas_tables/park_by_entity_meta_new.json')
save_into_postgres(df_parks, "themeparkwizard.dim_park_entity", 'overwrite')

In [None]:

# Compute and upload the metrics for every park directory in the data lake.
# Paths are sorted so parks are processed in a stable order across runs.
for path in sorted(glob('datalake_layer/*')):
    print(f'Transforming {path} ...')
    # Cached because both metric queries below read the same park data.
    df_dl = spark.read.orc(path).cache()
    try:
        # Disabled: restrict snapshots to the park's opening hours.
        # df_dl_working_hour = df_dl.withColumn('start_time', min_hour(col('operatingHours')))\
        #                 .withColumn('end_time', max_hour(col('operatingHours')))\
        #                 .filter(col('extracted_at_time').between(col('start_time'), col('end_time')))
        # df_dl_working_hour.createOrReplaceTempView('datalake_table_working_hour')
        df_dl.createOrReplaceTempView('datalake_table')
        agg_avg_time_compute()
        operating_ratio_compute()
    finally:
        # Release this park's cached data before loading the next one, so the
        # cache does not grow with every park. The temp view stays registered.
        df_dl.unpersist()

Transforming datalake_layer/animal_kingdom ...
Computing agg_avg_time...
root
 |-- extracted_date: date (nullable = true)
 |-- entity_id: string (nullable = true)
 |-- avg_standby_waittime: double (nullable = true)

Computing operating...
root
 |-- entity_id: string (nullable = true)
 |-- time_by_status: long (nullable = true)
 |-- whole_status: integer (nullable = false)

Transforming datalake_layer/epcot ...
Computing agg_avg_time...
root
 |-- extracted_date: date (nullable = true)
 |-- entity_id: string (nullable = true)
 |-- avg_standby_waittime: double (nullable = true)

Computing operating...
root
 |-- entity_id: string (nullable = true)
 |-- time_by_status: long (nullable = true)
 |-- whole_status: integer (nullable = false)

Transforming datalake_layer/hollywood_studios ...


In [None]:
# Ad-hoc preview: runs the same SQL as operating_ratio_compute() and shows the
# first 10 rows instead of writing them to PostgreSQL.
# It reads the 'datalake_table' temp view, i.e. whichever park the
# transformation loop registered last.
# NOTE(review): this duplicates the query text of operating_ratio_compute();
# keep the two in sync or remove this cell once debugging is done.
spark.sql("""
    WITH table_status AS (
        SELECT
            extracted_at_time,
            lead(extracted_at_time, 1) OVER (PARTITION BY id ORDER BY extracted_at_time) as next_time,
            id as entity_id,
            status,
            lead(status, 1) OVER (PARTITION BY id ORDER BY extracted_at_time) as next_status
        FROM datalake_table
        WHERE entity_type = 'ATTRACTION'
        ORDER BY 1,3
        ), interval_by_status AS (
        SELECT 
            entity_id,
            (unix_timestamp(next_time) - unix_timestamp(extracted_at_time)) as time_passed,
            CASE
            WHEN status = 'OPERATING' THEN 1
            ELSE 0
            END as whole_status
        FROM table_status
        )
        SELECT 
            entity_id,
            sum(time_passed) as time_by_status,
            whole_status
        FROM interval_by_status
        GROUP BY 1,3
    """).show(10)

In [None]:
# Finish session: stop the SparkSession created at the top of the notebook.
spark.stop()