# DATALAKE staging
This notebook reads and cleans the data stored in the raw layer folder (`./raw_layer`) and writes the result to the datalake folder (`./datalake_layer`), one sub-folder per park.

In [1]:
# SET CONSTANTS
# DEBUG: when True, etl_park_data() prints row counts (before/after de-duplication
# and after expanding the live data) plus one sample row. Those prints run extra
# count()/show() actions, so keep it False for regular runs.
DEBUG = False

In [2]:
# Inject a CSS rule so <pre> output blocks never wrap their lines
# (presumably to keep wide DataFrame.show() tables aligned -- confirm).
# Both names come from the public IPython.display API instead of the internal
# IPython.core.display module / the implicitly injected `display` builtin.
from IPython.display import HTML, display

display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
import requests
import json
from collections import defaultdict
import locale
import re
from glob import glob

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, from_unixtime, col, to_date, sum, avg
from pyspark.sql.types import DateType, TimestampType, StructType

# Build (or reuse) the SparkSession used by every cell below.
# partitionOverwriteMode is set to "dynamic" so that overwrite-mode writes replace
# only the partitions they contain data for (per Spark's documented semantics of
# this setting).
spark = (
    SparkSession.builder
    .appName("GenerateDatalake")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .getOrCreate()
)

# Read the raw-layer schema (a Spark StructType serialized as JSON) and deserialize it.
# The encoding is passed explicitly so decoding the file does not depend on the
# platform's default locale encoding.
with open('raw_schema.json', 'r', encoding='utf-8') as f:
    raw_schema_obj = StructType.fromJson(json.load(f))

In [4]:
def etl_park_data(df):
    """Clean one park's raw DataFrame and flatten its live data.

    Steps:
      1. Drop fully duplicated rows.
      2. Keep the park name (as ``park_name``), ``extracted_at``, its timestamp
         cast (``extracted_at_time``) and its date (``extracted_date``), and
         explode ``liveData`` into one ``live_data`` row per array element.
      3. Expand ``live_data`` into top-level columns via ``live_data.*``
         (assumes the array elements are structs -- TODO confirm against
         raw_schema.json) and rename ``entityType`` to ``entity_type``.
      4. Discard rows whose ``entity_type`` is 'PARK' or whose
         ``extracted_date`` is NULL.

    When the module-level DEBUG flag is set, row counts and one sample row are
    printed along the way.

    Args:
        df: raw Spark DataFrame for a single park.

    Returns:
        The transformed Spark DataFrame. It is marked with ``.cache()``, so the
        caller should ``unpersist()`` it once it is no longer needed.
    """
    deduplicated = df.drop_duplicates()
    if DEBUG:
        rows_before = df.count()
        rows_after = deduplicated.count()
        print("Removed duplicates!", f"Count rows before: {rows_before}", f"Count rows after: {rows_after}", sep = '\n')

    # NOTE(review): 'extracted_at_time' is an alias created inside this same
    # select; resolving it here appears to rely on Spark's lateral column alias
    # support -- confirm the minimum Spark version this notebook targets.
    selected = deduplicated.select(
        col('name').alias('park_name'),
        'extracted_at',
        col('extracted_at').cast(TimestampType()).alias('extracted_at_time'),
        to_date('extracted_at_time').alias('extracted_date'),
        explode('liveData').alias('live_data'),
    )

    # Every column is kept as-is except 'live_data', which is star-expanded.
    flattened_columns = [
        f'{column}.*' if column == 'live_data' else column
        for column in selected.columns
    ]
    df_features = (
        selected
        .select(*flattened_columns)
        .withColumnRenamed('entityType', 'entity_type')
        .where("entity_type != 'PARK' AND extracted_date IS NOT NULL")
        .cache()
    )

    if DEBUG:
        print("Expanded live data!", f"Count rows now: {df_features.count()}", "Sample of data:", sep = '\n')
        df_features.show(1, truncate=False, vertical=True)
    return df_features


def write_datalake(df, park_name, base_path='./datalake_layer'):
    """Write a transformed park DataFrame to the datalake layer as ORC.

    The data is written in 'overwrite' mode to ``<base_path>/<park_name>``,
    partitioned by ``entity_type`` and ``extracted_date``. Which existing
    partitions get replaced depends on the session's
    ``spark.sql.sources.partitionOverwriteMode`` setting.

    Args:
        df: DataFrame to persist; must contain the ``entity_type`` and
            ``extracted_date`` columns used for partitioning.
        park_name: folder name of the park inside the datalake layer.
        base_path: root folder of the datalake layer. Defaults to
            './datalake_layer', the location that used to be hard-coded.

    Raises:
        ValueError: if ``park_name`` is empty, since the partitions would
            otherwise be written directly into the datalake root.
    """
    if not park_name:
        raise ValueError('park_name must be a non-empty string')

    # Strip a trailing '/' from the root so the joined path never contains '//'.
    path = f"{base_path.rstrip('/')}/{park_name}"
    print(f'Writing DF to {path}')
    (
        df.write
        .mode('overwrite')
        .partitionBy('entity_type', 'extracted_date')
        .orc(path)
    )

In [5]:
# Process every park folder found in the raw layer, one park at a time.
# sorted() makes the processing order deterministic (glob order depends on the filesystem).
for park in sorted(glob('./raw_layer/*')):
    # Last path component, e.g. 'park_magic_kingdom_park'
    park_name = park.rpartition('/')[-1]
    # Remove every 'park' / 'theme' occurrence together with its adjacent underscores
    park_name = re.sub(r'[_]*(park|theme)[_]*', '', park_name)
    # Human-readable name, used only for the log message below
    park_name_harmonized = ' '.join([word.capitalize() for word in park_name.split('_')])
    print(park)
    print(f"Processing park '{park_name_harmonized}', and generating their respective datalake, this might take a while!")

    df = spark.read.schema(raw_schema_obj).json(park+'/**')
    # TODO filter only new partitions

    df_transformed = etl_park_data(df)
    try:
        write_datalake(df_transformed, park_name)
    finally:
        # etl_park_data() returns a cached DataFrame. Release that cache here, even
        # when the write fails, so cached data does not pile up across parks.
        # The raw `df` itself is never cached, so unpersisting it would free nothing.
        df_transformed.unpersist()

./raw_layer/park_animal_kingdom_theme_park
Processing park 'Animal Kingdom', and generating their respective datalake, this might take a while!
Writing DF to ./datalake_layer/animal_kingdom
./raw_layer/park_epcot
Processing park 'Epcot', and generating their respective datalake, this might take a while!
Writing DF to ./datalake_layer/epcot
./raw_layer/park_hollywood_studios
Processing park 'Hollywood Studios', and generating their respective datalake, this might take a while!
Writing DF to ./datalake_layer/hollywood_studios
./raw_layer/park_magic_kingdom_park
Processing park 'Magic Kingdom', and generating their respective datalake, this might take a while!
Writing DF to ./datalake_layer/magic_kingdom
./raw_layer/park_universal_epic_universe
Processing park 'Universal Epic Universe', and generating their respective datalake, this might take a while!
Writing DF to ./datalake_layer/universal_epic_universe
./raw_layer/park_universal_islands_of_adventure
Processing park 'Universal Islands 

In [7]:
# Finish session: stop the SparkSession created at the top of the notebook.
# Run this only after every park has been written; cells that use `spark`
# cannot be re-run afterwards without re-creating the session.
spark.stop()