# DATALAKE staging
This notebook reads and cleans the data stored in the raw layer (`./raw_layer`) and writes the result to the datalake layer (`./datalake_layer`), partitioned by entity type and extraction date.

In [1]:
# Notebook-level settings
# JSON file holding the datalake run state (read and updated by the processing cell)
DATALAKE_CONFIG = "general_schemas_tables/config_datalake.json"
# When True, the ETL step prints row counts and a sample record
DEBUG = True

In [2]:
# Use the public IPython.display API (IPython.core.display is an internal
# module) and import `display` explicitly instead of relying on the builtin
# that the notebook kernel injects.
from IPython.display import HTML, display

# Stop <pre> blocks from wrapping so wide text output keeps its layout
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
import requests
import json
from collections import defaultdict
import locale
import re
from glob import glob
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, from_unixtime, col, to_date, sum, avg, max
from pyspark.sql.types import DateType, TimestampType, StructType, StructField, FloatType, IntegerType

# Build (or reuse) the Spark session used by the rest of the notebook,
# with partition overwrite mode set to "dynamic"
spark = (
    SparkSession.builder
    .appName("GenerateDatalake")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .getOrCreate()
)

# Load the JSON-serialized raw schema and rebuild the StructType from it
with open('general_schemas_tables/raw_schema.json', 'r') as f:
    raw_schema_obj = StructType.fromJson(json.load(f))

In [4]:
def etl_park_data(df):
    """Deduplicate a raw park DataFrame and flatten its live data.

    Each element of the `liveData` array becomes its own row, and the fields
    of that element are promoted to top-level columns.

    Args:
        df: Raw park DataFrame. Must expose the columns `name`,
            `extracted_at` and `liveData` (an array of structs that contains
            an `entityType` field).

    Returns:
        DataFrame with `park_name`, `extracted_at`, `extracted_at_time`,
        `extracted_date` and every field of the exploded live data
        (`entityType` renamed to `entity_type`). Rows whose `entity_type` is
        'PARK' or whose `extracted_date` is NULL are filtered out.

    Notes:
        When the module-level `DEBUG` flag is truthy, row counts and one
        sample record are printed; each count triggers a Spark job.
    """
    # Build the timestamp expression once and reuse it. Deriving the date from
    # the expression (not from the 'extracted_at_time' alias) avoids relying
    # on lateral column alias resolution inside a single select.
    extracted_at_time = col('extracted_at').cast(TimestampType())
    selected_features = [
        col('name').alias('park_name'),
        'extracted_at',
        extracted_at_time.alias('extracted_at_time'),
        to_date(extracted_at_time).alias('extracted_date'),
        explode('liveData').alias('live_data'),
    ]

    if DEBUG:
        pre_dup = df.count()
    df = df.drop_duplicates()
    if DEBUG:
        pos_dup = df.count()
        print("Removed duplicates!", f"Count rows before: {pre_dup}", f"Count rows after: {pos_dup}", sep = '\n')

    df_features = df.select(*selected_features)
    # Replace the 'live_data' struct column by its fields ('live_data.*'),
    # keeping every other column in place.
    expanded_columns = [c + '.*' if c == 'live_data' else c for c in df_features.columns]
    df_features = (
        df_features.select(*expanded_columns)
        .withColumnRenamed('entityType', 'entity_type')
        .where("entity_type != 'PARK' AND extracted_date IS NOT NULL")
    )

    if DEBUG:
        print("Expanded live data!", f"Count rows now: {df_features.count()}", "Sample of data:", sep = '\n')
        df_features.show(1, truncate=False, vertical=True)
    return df_features


def write_datalake(df, park_name):
    """Write a transformed park DataFrame to the datalake layer as ORC.

    The data is written under `./datalake_layer/<park_name>`, partitioned by
    `entity_type` and `extracted_date`.

    Args:
        df: DataFrame to persist; must contain the `entity_type` and
            `extracted_date` columns used for partitioning.
        park_name: Folder name used for this park inside the datalake layer.
    """
    path = f'./datalake_layer/{park_name}'
    print(f'Writing DF to {path}')
    (
        df.write
        .mode('overwrite')
        # Request dynamic partition overwrite on the writer itself, so only
        # the partitions present in `df` are replaced even if the session was
        # not configured with spark.sql.sources.partitionOverwriteMode=dynamic.
        .option('partitionOverwriteMode', 'dynamic')
        .partitionBy('entity_type', 'extracted_date')
        .orc(path)
    )

In [5]:
# Start from an empty config and replace it with the saved one when the
# config file is present
_config_dl = dict()
if os.path.exists(DATALAKE_CONFIG):
    with open(DATALAKE_CONFIG, 'r') as f:
        _config_dl = json.load(f)
        

# List the raw park folders once, in a stable order, so the "last park" check
# below is evaluated against the same list that is being iterated.
park_paths = sorted(glob('./raw_layer/*'))

for i, park in enumerate(park_paths):
    # Derive the datalake folder name from the raw folder name by stripping
    # the 'park' / 'theme' tokens; the harmonized form is only used for logging.
    park_name = os.path.basename(park)
    park_name = re.sub(r'[_]*(park|theme)[_]*', '', park_name)
    park_name_harmonized = ' '.join([word.capitalize() for word in park_name.split('_')])
    print(park)
    print(f"Processing park '{park_name_harmonized}', and generating their respective datalake info!")

    df = spark.read.schema(raw_schema_obj).json(park)
    last_run = _config_dl.get('last_run_date')
    if last_run:
        # NOTE(review): `date` is presumably a partition column of the raw layer — confirm
        rule = f"date >= '{last_run}'"
        print(f'Processing raw according to rule: {rule}')
        df = df.where(rule)
    else:
        print('No rules attached, processing whole raw layer, this might take a while!')
    df.cache()
    # TODO filter only new partitions

    try:
        df_transformed = etl_park_data(df)
        df_transformed = df_transformed.drop('date')
        write_datalake(df_transformed, park_name)

        # Persist the new watermark once the last park has been processed.
        # The original check (`len(...) == i - 1`) could never be true, so the
        # config file was never updated.
        # NOTE(review): the watermark is taken from the last park only — confirm
        # that this is the intended source for `last_run_date`.
        if i == len(park_paths) - 1 and not df_transformed.isEmpty():
            # `max` here is pyspark.sql.functions.max (imported above), not the builtin
            last_run_date = df_transformed.select(max('extracted_date').alias('last_date')).collect()[0][0].strftime('%Y-%m-%d')
            print(last_run_date)
            _config_dl['last_run_date'] = last_run_date
            with open(DATALAKE_CONFIG, 'w') as f:
                json.dump(_config_dl, f)
    finally:
        # Release the cached raw data even when processing this park fails
        df.unpersist()

./raw_layer/park_animal_kingdom_theme_park
Processing park 'Animal Kingdom', and generating their respective datalake info!
Processing raw according to rule: date >= '2025-08-14'
Removed duplicates!
Count rows before: 3555
Count rows after: 3555
Expanded live data!
Count rows now: 140843
Sample of data:
-RECORD 0--------------------------------------------------
 park_name          | Disney's Animal Kingdom Theme Park   
 extracted_at       | 1755950779                           
 extracted_at_time  | 2025-08-23 12:06:19                  
 extracted_date     | 2025-08-23                           
 diningAvailability | NULL                                 
 entity_type        | ATTRACTION                           
 forecast           | NULL                                 
 id                 | bc1ffa86-9b1a-4ce9-84a5-b479dfa3cb53 
 lastUpdated        | 2022-09-30T04:06:02Z                 
 name               | Tree of Life                         
 operatingHours     | NULL         

In [6]:
# Finish session: stop the Spark session created earlier in this notebook
spark.stop()