# Organize - HOOK

## Prerequisites

In [None]:
!pip install -q sqlglot==27.20.0

In [None]:
import re

from datetime import datetime, timezone
from pyspark.sql import functions as F
from sqlglot import exp, parse_one
from tqdm.auto import tqdm

In [None]:
# Identifier of the manifest table. It is handed to spark.read.table(), so it
# names a table rather than a file on disk.
MANIFEST_PATH = "metadata.manifest"

# Schema and table-name prefix of the source tables the frames select from
# (combined as "<schema>.<prefix>__<source>__<name>").
FROM_SCHEMA = "raw"
FROM_PREFIX = "raw"

# Schema and name prefix of the materialized lake views that are generated
# (same "<schema>.<prefix>__<source>__<name>" layout).
TO_SCHEMA = "hook"
TO_PREFIX = "frame"

## Helper Functions

In [None]:
def load_manifest(table_path: str):
    """Collect the manifest rows that are flagged for generation.

    Reads the table identified by ``table_path`` through the ambient ``spark``
    session, keeps only the rows whose ``generate`` column equals ``True`` and
    returns the result of ``collect()``.
    """
    manifest_table = spark.read.table(table_path)

    # `== True` is intentional: it builds a Spark column expression, so it
    # must not be replaced with a Python `is True` check.
    flagged_rows = manifest_table.filter(F.col("generate") == True)  # noqa: E712

    return flagged_rows.collect()

In [None]:
def extract_active_select(mlv_name: str) -> exp.Expression | None:
    """Return the parsed SELECT of an existing materialized lake view.

    Runs ``SHOW CREATE MATERIALIZED LAKE VIEW`` for ``mlv_name`` and extracts
    the text between the parentheses that follow ``AS`` at the end of the
    returned statement, then parses that text with the Spark dialect.

    Args:
        mlv_name: Identifier of the materialized lake view to inspect.

    Returns:
        The parsed expression, or ``None`` when the ``SHOW CREATE`` lookup
        fails or the statement does not end in an ``AS ( ... )`` clause.

    Raises:
        Whatever ``parse_one`` raises if the extracted text cannot be parsed;
        that error is deliberately not suppressed.
    """
    try:
        statement = spark.sql(
            f"SHOW CREATE MATERIALIZED LAKE VIEW {mlv_name};"
        ).collect()[0][0]
    except Exception:
        # Best-effort lookup: a failure here is treated as "no active
        # definition" (presumably the view does not exist yet). Catching
        # Exception rather than using a bare `except` keeps KeyboardInterrupt
        # and SystemExit propagating, so the notebook can still be stopped.
        return None

    # Greedy match with DOTALL: capture everything between the first "AS ("
    # and the last ")" of the (possibly multi-line) statement.
    match = re.search(r"AS\s*\((.*)\)\s*$", statement, flags=re.DOTALL)

    if not match:
        return None

    select = match.group(1).strip()

    return parse_one(select, dialect="spark")

In [None]:
def manage_mlv(
    select_statement: exp.Expression,
    MLV_Identifier: str
) -> None:
    """Refresh, recreate or create a materialized lake view.

    The desired SELECT and the SELECT currently stored for ``MLV_Identifier``
    are both rendered to Spark SQL with identical settings and compared:

    * equal            -> the view is refreshed;
    * different        -> the existing view is renamed with a UTC timestamp
                          suffix, then a new view is created;
    * no active select -> the view is created.
    """

    def _render(expression: exp.Expression) -> str:
        # Both sides go through the same generator settings so that the
        # string comparison is not affected by formatting or quoting.
        return expression.sql(
            dialect="spark",
            identify=True,
            pretty=True,
        )

    current_select = extract_active_select(MLV_Identifier)

    desired_sql = _render(select_statement)
    current_sql = _render(current_select) if current_select else None

    # Identical definition: a refresh is all that is needed.
    if desired_sql == current_sql:
        tqdm.write(f"{MLV_Identifier}: No changes detected, refreshing...")
        spark.sql(f"REFRESH MATERIALIZED LAKE VIEW {MLV_Identifier};")
        return None

    if current_select:
        # Keep the old view around under a timestamped name before the
        # new definition takes over the original identifier.
        tqdm.write(f"{MLV_Identifier}: Changes detected, recreating MLV...")
        backup_stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        spark.sql(
            f"ALTER MATERIALIZED LAKE VIEW {MLV_Identifier} RENAME TO {MLV_Identifier}__{backup_stamp};"
        )
    else:
        tqdm.write(f"{MLV_Identifier}: Creating MLV...")

    spark.sql(f"CREATE MATERIALIZED LAKE VIEW {MLV_Identifier} AS ({desired_sql});")

    return None

## Blueprint

In [None]:
def generate_hook_expression(hook_dict: dict) -> str:
    """Render the SQL text of a single hook column.

    The generated expression is a ``CASE`` with one branch and no ``ELSE``:
    when the business key is not null, the keyset followed by ``|`` is
    concatenated with the business key cast to ``STRING`` and trimmed. The
    whole expression is aliased to the hook name.

    Args:
        hook_dict: Mapping that provides the keys ``"name"``, ``"keyset"``
            and ``"business_key_field"``. The business key field is parsed
            with ``parse_one`` using sqlglot's default dialect.

    Returns:
        The expression rendered by ``Expression.sql()`` with default
        settings. This is a ``str``, not an ``exp.Expression``; the return
        annotation reflects what the function actually returns.
    """
    name = hook_dict["name"]
    keyset = hook_dict["keyset"]
    business_key_field = hook_dict["business_key_field"]

    key_lit = exp.Literal.string(f"{keyset}|")
    column = parse_one(business_key_field)

    cast = exp.Cast(this=column, to=exp.DataType.build("STRING"))
    trim = exp.Trim(this=cast)
    val = exp.Concat(
        expressions=[key_lit, trim]
    )

    # Only emit a hook value when the business key itself is present.
    condition = exp.Is(this=column, expression=exp.Null()).not_()
    expr = exp.Case().when(condition, val).as_(name)

    return expr.sql()

In [None]:
def generate_frame_sql(
    frame_hooks: list,
    from_table: str
) -> exp.Expression:
    """Build the SELECT for one frame.

    The projection lists one generated hook expression per entry of
    ``frame_hooks`` (in order), followed by ``*``; rows are read from
    ``from_table``.
    """
    projections = []
    for hook in frame_hooks:
        projections.append(generate_hook_expression(hook))

    # The star comes last so the hook columns lead the projection.
    projections.append(exp.Star())

    return exp.select(*projections).from_(from_table)

In [None]:
def generate_hook_frames(
    manifest_path: str,
    from_schema: str = "raw",
    from_prefix: str = "raw",
    to_schema: str = "hook",
    to_prefix: str = "frame"
) -> None:
    """Create or update one materialized lake view per manifest entry.

    Ensures ``to_schema`` exists, loads the manifest rows from
    ``manifest_path`` and, for each row, builds the frame SELECT over
    ``<from_schema>.<from_prefix>__<source>__<name>`` and hands it to
    ``manage_mlv`` for the view
    ``<to_schema>.<to_prefix>__<source>__<name>``.
    """
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {to_schema};")

    manifest_rows = load_manifest(manifest_path)
    progress = tqdm(manifest_rows, desc="Generating Frame MLVs")

    for row in progress:
        name, source, hooks = row["name"], row["source"], row["hooks"]

        # Source table and target view share the "<source>__<name>" suffix.
        suffix = f"{source}__{name}"
        source_table = f"{from_schema}.{from_prefix}__{suffix}"
        target_view = f"{to_schema}.{to_prefix}__{suffix}"

        frame_select = generate_frame_sql(hooks, source_table)

        manage_mlv(
            select_statement=frame_select,
            MLV_Identifier=target_view,
        )

    return None

# Run the generation with the configuration defined at the top of the notebook.
generate_hook_frames(
    manifest_path=MANIFEST_PATH,
    from_schema=FROM_SCHEMA,
    from_prefix=FROM_PREFIX,
    to_schema=TO_SCHEMA,
    to_prefix=TO_PREFIX,
)