In [0]:
%run "../../../config/setup"

In [0]:
%python

%pip install user_agents

In [0]:
%python

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import user_agents

# Define the function behind the UDF: parse a user-agent string and extract the browser family
def parse_user_agent(user_agent):
    """Return the browser family parsed from a raw user-agent string.

    Args:
        user_agent: Raw User-Agent value; may be None or an empty string.

    Returns:
        The ``browser.family`` attribute of the object returned by
        ``user_agents.parse`` (a browser name, not a device type), or
        None when ``user_agent`` is None/empty.
    """
    # Missing or empty values are not parsed; they surface as NULL in Spark.
    if not user_agent:
        return None
    return user_agents.parse(user_agent).browser.family

# Wrap the Python parser as a Spark UDF whose result type is a string
parse_user_agent_browser_udf = udf(parse_user_agent, returnType=StringType())

# Expose the same UDF to SQL under the name parse_user_agent_browser_sql
spark.udf.register("parse_user_agent_browser_sql", parse_user_agent_browser_udf)

In [0]:
-- Gold table: one row per customer summarising clickstream behaviour
-- (session count, average page views per session, most common entry/exit
-- page, and most common browser family across the customer's events).
CREATE OR REPLACE TABLE customers_clickstream_behavior_gold AS
WITH session_metrics AS (
  -- Exactly one row per (customer, session). Grouping by page_url as well
  -- would turn page_views into a per-page count and would repeat each
  -- session once per distinct page, skewing the AVG/MODE computed below.
  SELECT
    customer_id,
    session_id,
    COUNT(*) AS page_views,
    -- NOTE(review): assumes `date` orders events within a session; if it is
    -- only day-granular, ties make entry/exit arbitrary -- TODO confirm.
    MIN_BY(page_url, date) AS entry_page,  -- page_url of the earliest event
    MAX_BY(page_url, date) AS exit_page    -- page_url of the latest event
  FROM clickstream_customers_silver
  GROUP BY customer_id, session_id
),
customer_sessions AS (
  -- Roll sessions up to the customer; every session carries equal weight.
  SELECT
    customer_id,
    COUNT(DISTINCT session_id) AS total_sessions,
    AVG(page_views) AS pages_per_session,
    MODE(entry_page) AS common_entry_page,
    MODE(exit_page) AS common_exit_page
  FROM session_metrics
  GROUP BY customer_id
),
customer_browser AS (
  -- Most frequent browser family per customer, computed over raw events.
  -- Aggregating here (instead of joining raw events to the session rows)
  -- keeps the final join one-to-one and avoids a per-customer row explosion.
  SELECT
    customer_id,
    MODE(preferred_browser) AS preferred_browser
  FROM (
    SELECT
      customer_id,
      parse_user_agent_browser_sql(user_agent) AS preferred_browser
    FROM clickstream_customers_silver
  ) parsed_events
  GROUP BY customer_id
)
SELECT
  cs.customer_id,
  cs.total_sessions,
  cs.pages_per_session,
  cs.common_entry_page,
  cs.common_exit_page,
  cb.preferred_browser
FROM customer_sessions cs
-- LEFT JOIN keeps customers whose customer_id is NULL (no equality match).
LEFT JOIN customer_browser cb ON cs.customer_id = cb.customer_id;