In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    TimestampType,
    LongType,
    DateType,
)
from datetime import datetime, timedelta
import os

In [2]:
spark = (
    SparkSession.builder.master("local[*]")
    .appName(name="unittest")
    .config("spark.ui.enabled", "false")
    .config(
        "spark.driver.extraJavaOptions",
        "--add-exports java.base/sun.nio.ch=ALL-UNNAMED",
    )
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6",
    )
    .getOrCreate()
)

24/12/19 10:44:26 WARN Utils: Your hostname, hongong-predator resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/12/19 10:44:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/hongong/miniconda3/envs/goodnotes-insights-data-eng/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/hongong/.ivy2/cache
The jars for the packages stored in: /home/hongong/.ivy2/jars
org.apache.hadoop#hadoop-azure added as a dependency
com.microsoft.azure#azure-storage added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-08f04b03-3dbc-4a92-843d-966ade5785df;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-azure;3.3.1 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.11 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-guava;1.1.1 in central
	found org.eclipse.jetty#jetty-util-ajax;9.4.40.v20210413 in central
	found org.eclipse.jetty#jetty-util;9.4.40.v20210413 in central
	found org.codehaus.jackson#jackson-mapper-asl;1.9.13 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.wildfly.openssl#wildfly

# Data preparation

In [35]:
class ETLPipeline:
    def __init__(self, spark):
        self.bucket_name = f"noteapp"
        self.spark = spark

        # Define storage paths
        self.bronze_path = f"{self.bucket_name}/bronze"
        self.silver_path = f"{self.bucket_name}/silver"
        self.gold_path = f"{self.bucket_name}/gold"

    def define_schemas(self):
        """Define schemas for the datasets"""
        self.interactions_schema = StructType(
            [
                StructField("user_id", StringType(), False),
                StructField("timestamp", TimestampType(), False),
                StructField("action_type", StringType(), False),
                StructField("page_id", StringType(), False),
                StructField("duration_ms", LongType(), False),
                StructField("app_version", StringType(), False),
            ]
        )

        self.metadata_schema = StructType(
            [
                StructField("user_id", StringType(), False),
                StructField("join_date", DateType(), False),
                StructField("country", StringType(), False),
                StructField("device_type", StringType(), False),
                StructField("subscription_type", StringType(), False),
            ]
        )

    def ingest_to_bronze(self, csv_path: str, dataset_type: str):
        """Ingest CSV files to bronze layer in parquet format"""
        schema = (
            self.interactions_schema
            if dataset_type == "interactions"
            else self.metadata_schema
        )

        df = self.spark.read.schema(schema).csv(csv_path)

        if dataset_type == "interactions":
            # Partition by date for interactions
            df = df.withColumn("partition_date", F.to_date("timestamp"))
            output_path = f"{self.bronze_path}/interactions"
            partition_by = ["partition_date"]
        else:
            # Partition by country for metadata
            output_path = f"{self.bronze_path}/metadata"
            partition_by = ["country"]

        df.write.mode("append").partitionBy(partition_by).parquet(output_path)

    def process_silver_layer(self, process_date: datetime = None):
        """
        Process bronze data into silver layer with cleaned and validated data.
        Supports incremental processing by date.

        Args:
            process_date: Optional date to process. If None, processes current date
        """
        if process_date is None:
            process_date = datetime.now().date()

        # Read only the partition we need from bronze interactions
        interactions_df = self.spark.read.option(
            "basePath", f"{self.bronze_path}/interactions"
        ).parquet(f"{self.bronze_path}/interactions/partition_date={process_date}")

        # For metadata, check if we need to process updates
        metadata_path = f"{self.silver_path}/dim_users"
        metadata_df = self.spark.read.parquet(f"{self.bronze_path}/metadata")

        # Get existing metadata last modified date if exists
        try:
            existing_metadata = self.spark.read.parquet(metadata_path)
            last_modified = existing_metadata.agg(F.max("_modified_date")).collect()[0][
                0
            ]
        except:
            last_modified = None
            existing_metadata = None

        # Process interactions incrementally
        clean_interactions = interactions_df.filter(
            F.col("duration_ms").between(0, 7200000)
        ).dropDuplicates(["user_id", "timestamp", "action_type", "page_id"])

        fact_interactions = clean_interactions.select(
            "user_id",
            "timestamp",
            "action_type",
            "page_id",
            "duration_ms",
            "partition_date",
        ).withColumn("_modified_date", F.current_date())

        # Write fact table incrementally
        fact_path = f"{self.silver_path}/fact_interactions"

        (
            fact_interactions.write.mode("append")  # Use append mode for incremental
            .partitionBy("partition_date")
            .option(
                "replaceWhere", f"partition_date = '{process_date}'"
            )  # Overwrite only this partition
            .parquet(fact_path)
        )

        # Process metadata changes
        if existing_metadata is not None:
            # Identify new or updated metadata records
            metadata_df = metadata_df.withColumn(
                "_modified_date", F.current_date()
            ).join(
                existing_metadata, "user_id", "left_anti"
            )  # Get only new records
        else:
            metadata_df = metadata_df.withColumn("_modified_date", F.current_date())

        if metadata_df.count() > 0:  # Only process if we have changes
            dim_users = metadata_df.dropDuplicates(["user_id"]).select(
                "user_id",
                "join_date",
                "country",
                "device_type",
                "subscription_type",
                "_modified_date",
            )

            # Write dimension table
            # For small dimension tables, we can use overwrite mode
            # For larger ones, consider using merge/upsert operations
            if existing_metadata is None:
                write_mode = "overwrite"
            else:
                write_mode = "append"

            (
                dim_users.write.mode(write_mode)
                .partitionBy("country")
                .parquet(metadata_path)
            )

        # Return metrics about processed data
        return {
            "date_processed": process_date,
            "interactions_processed": clean_interactions.count(),
            "metadata_updates": metadata_df.count() if metadata_df.count() > 0 else 0,
        }

    def process_date_range(self, start_date: datetime, end_date: datetime):
        """Process a range of dates incrementally"""
        current_date = start_date
        processing_metrics = []

        while current_date <= end_date:
            try:
                metrics = self.process_silver_layer(current_date)
                processing_metrics.append(metrics)
                current_date += timedelta(days=1)
            except Exception as e:
                print(f"Error processing date {current_date}: {str(e)}")
                raise

        return processing_metrics

    def cleanup_old_partitions(self, retention_days: int = 90):
        """Clean up old partitions based on retention policy"""
        cutoff_date = datetime.now().date() - timedelta(days=retention_days)

        # List partitions
        bronze_partitions = self.spark._jvm.org.apache.hadoop.fs.Path(
            f"{self.bronze_path}/interactions"
        )
        silver_partitions = self.spark._jvm.org.apache.hadoop.fs.Path(
            f"{self.silver_path}/fact_interactions"
        )

        # Delete old partitions
        fs = bronze_partitions.getFileSystem(self.spark._jsc.hadoopConfiguration())

        for path in [bronze_partitions, silver_partitions]:
            if fs.exists(path):
                for partition in fs.listStatus(path):
                    partition_date = datetime.strptime(
                        partition.getPath().getName().split("=")[1], "%Y-%m-%d"
                    ).date()

                    if partition_date < cutoff_date:
                        fs.delete(partition.getPath(), True)

    def _calculate_session_metrics(
        self,
        fact_interactions: DataFrame,
        process_date: datetime,
        lookback_days: int = 1,
    ) -> DataFrame:
        """
        Calculate session-based metrics with window functions, handling session boundaries.

        Args:
            fact_interactions: DataFrame of interactions
            process_date: Date to process
            lookback_days: Number of days to look back for ongoing sessions
        """
        # Calculate date range for session boundary handling
        start_date = process_date - timedelta(days=lookback_days)
        end_date = process_date + timedelta(days=1)  # Include full day

        # Create window specs without range specification for lag/lead
        user_window = Window.partitionBy("user_id").orderBy("timestamp")

        # Create window spec for cumulative operations
        cumulative_window = Window.partitionBy("user_id").orderBy("timestamp")

        sessions_df = (
            fact_interactions.filter(
                F.col("partition_date").between(start_date, process_date)
            )
            .withColumn("prev_timestamp", F.lag("timestamp").over(user_window))
            .withColumn(
                "time_diff_minutes",
                F.when(
                    F.col("prev_timestamp").isNotNull(),
                    (F.unix_timestamp("timestamp") - F.unix_timestamp("prev_timestamp"))
                    / 60,
                ).otherwise(0),
            )
            .withColumn(
                "is_new_session",
                F.when(F.col("time_diff_minutes") >= 30, 1).otherwise(0),
            )
            .withColumn(
                "session_id",
                F.concat(
                    F.col("user_id"),
                    F.lit("_"),
                    F.date_format("partition_date", "yyyyMMdd"),
                    F.lit("_"),
                    F.sum("is_new_session").over(cumulative_window),
                ),
            )
        )

        # Calculate metrics only for sessions that end on process_date
        return (
            sessions_df.withColumn(
                "next_timestamp", F.lead("timestamp").over(user_window)
            )
            .withColumn(
                "session_end",
                F.when(
                    F.col("next_timestamp").isNull()
                    | (
                        (
                            F.unix_timestamp("next_timestamp")
                            - F.unix_timestamp("timestamp")
                        )
                        / 60
                        >= 30
                    ),
                    True,
                ).otherwise(False),
            )
            .filter(
                (F.col("partition_date") == process_date)
                | (F.col("session_end") == True)
            )
            .groupBy("session_id")
            .agg(
                F.count("*").alias("actions_per_session"),
                F.sum("duration_ms").alias("session_duration_ms"),
                F.first("partition_date").alias("session_date"),
                F.last("timestamp").alias("session_end_time"),
            )
        )

    def create_gold_layer(self, process_date: datetime = None):
        """
        Create gold layer with pre-aggregated data and business metrics incrementally.

        Args:
            process_date: Date to process, defaults to current date
        """
        if process_date is None:
            process_date = datetime.now().date()

        # Load relevant data from silver layer
        fact_interactions = self.spark.read.option(
            "basePath", f"{self.silver_path}/fact_interactions"
        ).parquet(f"{self.silver_path}/fact_interactions/partition_date={process_date}")

        dim_users = self.spark.read.parquet(f"{self.silver_path}/dim_users")
        broadcast_users = F.broadcast(dim_users)

        # Calculate daily metrics
        daily_metrics = fact_interactions.groupBy("partition_date").agg(
            F.countDistinct("user_id").alias("daily_active_users"),
            F.count("*").alias("total_actions"),
            F.avg("duration_ms").alias("avg_duration_ms"),
        )

        # Update monthly metrics
        month_start = process_date.replace(day=1)
        month_end = (process_date + timedelta(days=32)).replace(day=1) - timedelta(
            days=1
        )

        # Read existing monthly metrics for current month if exists
        monthly_path = f"{self.gold_path}/monthly_metrics"
        try:
            existing_monthly = self.spark.read.option("basePath", monthly_path).parquet(
                f"{monthly_path}/month_date={month_start}"
            )
        except:
            existing_monthly = None

        # Calculate monthly metrics for current month
        month_interactions = (
            self.spark.read.option("basePath", f"{self.silver_path}/fact_interactions")
            .parquet(f"{self.silver_path}/fact_interactions")
            .filter(F.col("partition_date").between(month_start, month_end))
        )

        monthly_metrics = (
            month_interactions.withColumn(
                "month_date", F.date_trunc("month", F.col("partition_date"))
            )
            .groupBy("month_date")
            .agg(
                F.countDistinct("user_id").alias("monthly_active_users"),
                F.count("*").alias("total_monthly_actions"),
            )
        )

        # Calculate session metrics with lookback
        session_metrics = self._calculate_session_metrics(
            fact_interactions, process_date, lookback_days=1
        )

        # Write metrics to gold layer
        # Daily metrics - append mode with partition replacement
        (
            daily_metrics.write.mode("append")
            .partitionBy("partition_date")
            .option("replaceWhere", f"partition_date = '{process_date}'")
            .parquet(f"{self.gold_path}/daily_metrics")
        )

        # Monthly metrics - replace partition for current month
        (
            monthly_metrics.write.mode("append")
            .partitionBy("month_date")
            .option("replaceWhere", f"month_date = '{month_start}'")
            .parquet(monthly_path)
        )

        # Session metrics - append mode with date partitioning
        (
            session_metrics.write.mode("append")
            .partitionBy("session_date")
            .option("replaceWhere", f"session_date = '{process_date}'")
            .parquet(f"{self.gold_path}/session_metrics")
        )

        return {
            "date_processed": process_date,
            "daily_metrics_updated": daily_metrics.count(),
            "monthly_metrics_updated": monthly_metrics.count(),
            "sessions_processed": session_metrics.count(),
        }

    def backfill_gold_metrics(
        self, start_date: datetime, end_date: datetime, parallel: bool = False
    ):
        """
        Backfill gold metrics for a date range.

        Args:
            start_date: Start date for backfill
            end_date: End date for backfill
            parallel: Whether to process dates in parallel
        """
        if parallel:
            # Create list of dates to process
            dates = [
                (start_date + timedelta(days=x)).date()
                for x in range((end_date - start_date).days + 1)
            ]

            # Process dates in parallel using Spark
            date_df = self.spark.createDataFrame(
                [(date,) for date in dates], ["process_date"]
            )

            date_df.repartition(min(len(dates), 50)).foreach(
                lambda row: self.create_gold_layer(row.process_date)
            )
        else:
            current_date = start_date
            while current_date <= end_date:
                try:
                    self.create_gold_layer(current_date)
                    current_date += timedelta(days=1)
                except Exception as e:
                    print(f"Error processing {current_date}: {str(e)}")
                    raise

# Data processing

In [36]:
# Initialize ETL pipeline
etl = ETLPipeline(spark)
etl.define_schemas()

## Bronze layer: Ingest and convert to parquet

In [9]:
# Bronze layer: Ingest and convert to parquet
etl.ingest_to_bronze("data/user_interactions_sample.csv", "interactions")
etl.ingest_to_bronze("data/user_metadata_sample.csv", "metadata")

24/12/19 10:28:04 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [12]:
!ls -lah noteapp/bronze/interactions | head -10

total 1,5M
drwxr-xr-x 367 hongong hongong  28K Thg 12 19 10:28 .
drwxr-xr-x   4 hongong hongong 4,0K Thg 12 19 10:28 ..
drwxr-xr-x   2 hongong hongong 4,0K Thg 12 19 10:28 partition_date=2023-01-01
drwxr-xr-x   2 hongong hongong 4,0K Thg 12 19 10:28 partition_date=2023-01-02
drwxr-xr-x   2 hongong hongong 4,0K Thg 12 19 10:28 partition_date=2023-01-03
drwxr-xr-x   2 hongong hongong 4,0K Thg 12 19 10:28 partition_date=2023-01-04
drwxr-xr-x   2 hongong hongong 4,0K Thg 12 19 10:28 partition_date=2023-01-05
drwxr-xr-x   2 hongong hongong 4,0K Thg 12 19 10:28 partition_date=2023-01-06
drwxr-xr-x   2 hongong hongong 4,0K Thg 12 19 10:28 partition_date=2023-01-07
ls: write error: Broken pipe


In [14]:
!ls -lah noteapp/bronze/interactions/partition_date=2023-01-01

total 184K
drwxr-xr-x   2 hongong hongong 4,0K Thg 12 19 10:28 .
drwxr-xr-x 367 hongong hongong  28K Thg 12 19 10:28 ..
-rw-r--r--   1 hongong hongong  14K Thg 12 19 10:28 part-00000-be2c411e-d248-43f3-a6af-4756d6a0e5d8.c000.snappy.parquet
-rw-r--r--   1 hongong hongong  116 Thg 12 19 10:28 .part-00000-be2c411e-d248-43f3-a6af-4756d6a0e5d8.c000.snappy.parquet.crc
-rw-r--r--   1 hongong hongong  13K Thg 12 19 10:28 part-00001-be2c411e-d248-43f3-a6af-4756d6a0e5d8.c000.snappy.parquet
-rw-r--r--   1 hongong hongong  112 Thg 12 19 10:28 .part-00001-be2c411e-d248-43f3-a6af-4756d6a0e5d8.c000.snappy.parquet.crc
-rw-r--r--   1 hongong hongong  14K Thg 12 19 10:28 part-00002-be2c411e-d248-43f3-a6af-4756d6a0e5d8.c000.snappy.parquet
-rw-r--r--   1 hongong hongong  120 Thg 12 19 10:28 .part-00002-be2c411e-d248-43f3-a6af-4756d6a0e5d8.c000.snappy.parquet.crc
-rw-r--r--   1 hongong hongong  14K Thg 12 19 10:28 part-00003-be2c411e-d248-43f3-a6af-4756d6a0e5d8.c000.snappy.parquet
-rw-r--r--   1 hongong ho

In [13]:
!ls -lah noteapp/bronze/metadata | head -10

total 56K
drwxr-xr-x 13 hongong hongong 4,0K Thg 12 19 10:28 .
drwxr-xr-x  4 hongong hongong 4,0K Thg 12 19 10:28 ..
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:28 country=AU
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:28 country=BR
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:28 country=CA
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:28 country=country
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:28 country=DE
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:28 country=FR
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:28 country=IN


In [15]:
!ls -lah noteapp/bronze/metadata/country=AU

total 104K
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:28 .
drwxr-xr-x 13 hongong hongong 4,0K Thg 12 19 10:28 ..
-rw-r--r--  1 hongong hongong  89K Thg 12 19 10:28 part-00000-536bb221-ff69-4025-9f45-72360c5cfde0.c000.snappy.parquet
-rw-r--r--  1 hongong hongong  716 Thg 12 19 10:28 .part-00000-536bb221-ff69-4025-9f45-72360c5cfde0.c000.snappy.parquet.crc


## Silver layer: Clean data and create fact/dimension tables

In [10]:
# Silver layer: Clean data and create fact/dimension tables
processing_date = datetime(2023, 1, 1).date()
etl.process_silver_layer(processing_date)

                                                                                

{'date_processed': datetime.date(2023, 1, 1),
 'interactions_processed': 2731,
 'metadata_updates': 100001}

In [13]:
!ls -lah noteapp/silver/dim_users

total 56K
drwxr-xr-x 13 hongong hongong 4,0K Thg 12 19 10:48  .
drwxr-xr-x  4 hongong hongong 4,0K Thg 12 19 10:48  ..
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=AU'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=BR'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=CA'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=country'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=DE'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=FR'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=IN'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=JP'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=MX'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=UK'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'country=US'
-rw-r--r--  1 hongong hongong    0 Thg 12 19 10:48  _SUCCESS
-rw-r--r--  1 hongong hongong    8 Thg 12 19 10:48  ._SUCCESS.crc


In [14]:
!ls -lah noteapp/silver/fact_interactions

total 16K
drwxr-xr-x 3 hongong hongong 4,0K Thg 12 19 10:48  .
drwxr-xr-x 4 hongong hongong 4,0K Thg 12 19 10:48  ..
drwxr-xr-x 2 hongong hongong 4,0K Thg 12 19 10:48 'partition_date=2023-01-01'
-rw-r--r-- 1 hongong hongong    0 Thg 12 19 10:48  _SUCCESS
-rw-r--r-- 1 hongong hongong    8 Thg 12 19 10:48  ._SUCCESS.crc


In [15]:
!ls -lah noteapp/silver/fact_interactions/partition_date=2023-01-01

total 88K
drwxr-xr-x 2 hongong hongong 4,0K Thg 12 19 10:48 .
drwxr-xr-x 3 hongong hongong 4,0K Thg 12 19 10:48 ..
-rw-r--r-- 1 hongong hongong  74K Thg 12 19 10:48 part-00000-9090108e-7af3-4a92-b1f5-7d1b48460395.c000.snappy.parquet
-rw-r--r-- 1 hongong hongong  596 Thg 12 19 10:48 .part-00000-9090108e-7af3-4a92-b1f5-7d1b48460395.c000.snappy.parquet.crc


## Silver layer: process multiple dates incrementally

In [31]:
start_date = datetime(2023, 1, 2).date()
end_date = datetime(2023, 2, 1).date()
etl.process_date_range(start_date, end_date)

[{'date_processed': datetime.date(2023, 1, 2),
  'interactions_processed': 2742,
  'metadata_updates': 0},
 {'date_processed': datetime.date(2023, 1, 3),
  'interactions_processed': 2696,
  'metadata_updates': 0},
 {'date_processed': datetime.date(2023, 1, 4),
  'interactions_processed': 2795,
  'metadata_updates': 0},
 {'date_processed': datetime.date(2023, 1, 5),
  'interactions_processed': 2695,
  'metadata_updates': 0},
 {'date_processed': datetime.date(2023, 1, 6),
  'interactions_processed': 2809,
  'metadata_updates': 0},
 {'date_processed': datetime.date(2023, 1, 7),
  'interactions_processed': 2648,
  'metadata_updates': 0},
 {'date_processed': datetime.date(2023, 1, 8),
  'interactions_processed': 2771,
  'metadata_updates': 0},
 {'date_processed': datetime.date(2023, 1, 9),
  'interactions_processed': 2800,
  'metadata_updates': 0},
 {'date_processed': datetime.date(2023, 1, 10),
  'interactions_processed': 2780,
  'metadata_updates': 0},
 {'date_processed': datetime.date(20

In [32]:
!ls -lah noteapp/silver/fact_interactions

total 140K
drwxr-xr-x 34 hongong hongong 4,0K Thg 12 19 11:12  .
drwxr-xr-x  4 hongong hongong 4,0K Thg 12 19 10:48  ..
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 10:48 'partition_date=2023-01-01'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-02'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-03'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-04'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-05'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-06'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-07'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-08'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-09'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-10'
drwxr-xr-x  2 hongong hongong 4,0K Thg 12 19 11:12 'partition_date=2023-01-11'
drwxr-xr-x 

In [33]:
df_fact_int = spark.read.parquet(
    "noteapp/silver/fact_interactions/partition_date=2023-01-03"
)
df_fact_int.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- action_type: string (nullable = true)
 |-- page_id: string (nullable = true)
 |-- duration_ms: long (nullable = true)
 |-- _modified_date: date (nullable = true)



In [20]:
df_fact_int.count()

2696

In [21]:
df_fact_int.show()

+-------+-------------------+-----------+-------+-----------+--------------+
|user_id|          timestamp|action_type|page_id|duration_ms|_modified_date|
+-------+-------------------+-----------+-------+-----------+--------------+
|u880939|2023-01-03 12:28:38|      share|p891876|     284189|    2024-12-19|
|u483517|2023-01-03 14:01:43|      share|p207145|     101050|    2024-12-19|
|u647684|2023-01-03 04:29:05|       edit|p327225|      19413|    2024-12-19|
|u531187|2023-01-03 12:55:55|       edit|p370054|     152019|    2024-12-19|
|u080903|2023-01-03 16:55:57|       edit|p709243|      46027|    2024-12-19|
|u173160|2023-01-03 07:20:39|     delete|p512034|     135144|    2024-12-19|
|u126441|2023-01-03 19:43:41|     create|p500152|     256895|    2024-12-19|
|u567436|2023-01-03 01:06:09|       edit|p484945|     252431|    2024-12-19|
|u496595|2023-01-03 22:33:29|      share|p229867|      10619|    2024-12-19|
|u179135|2023-01-03 21:47:15|     create|p224419|     197715|    2024-12-19|

# Gold layer: Create pre-aggregated business metrics

In [37]:
# Process single date
processing_date = datetime(2023, 1, 15).date()
metrics = etl.create_gold_layer(processing_date)
metrics

{'date_processed': datetime.date(2023, 1, 15),
 'daily_metrics_updated': 1,
 'monthly_metrics_updated': 1,
 'sessions_processed': 2746}

In [38]:
!ls -lah noteapp/gold

total 20K
drwxr-xr-x 5 hongong hongong 4,0K Thg 12 19 11:27 .
drwxr-xr-x 5 hongong hongong 4,0K Thg 12 19 11:27 ..
drwxr-xr-x 3 hongong hongong 4,0K Thg 12 19 11:27 daily_metrics
drwxr-xr-x 3 hongong hongong 4,0K Thg 12 19 11:27 monthly_metrics
drwxr-xr-x 3 hongong hongong 4,0K Thg 12 19 11:27 session_metrics


In [39]:
!ls -lah noteapp/gold/session_metrics

total 16K
drwxr-xr-x 3 hongong hongong 4,0K Thg 12 19 11:27  .
drwxr-xr-x 5 hongong hongong 4,0K Thg 12 19 11:27  ..
drwxr-xr-x 2 hongong hongong 4,0K Thg 12 19 11:27 'session_date=2023-01-15'
-rw-r--r-- 1 hongong hongong    0 Thg 12 19 11:27  _SUCCESS
-rw-r--r-- 1 hongong hongong    8 Thg 12 19 11:27  ._SUCCESS.crc


In [40]:
df_sess_metrics = spark.read.parquet("noteapp/gold/session_metrics/")
df_sess_metrics.printSchema()

root
 |-- session_id: string (nullable = true)
 |-- actions_per_session: long (nullable = true)
 |-- session_duration_ms: long (nullable = true)
 |-- session_end_time: timestamp (nullable = true)
 |-- session_date: date (nullable = true)



In [41]:
df_sess_metrics.show()

+------------------+-------------------+-------------------+-------------------+------------+
|        session_id|actions_per_session|session_duration_ms|   session_end_time|session_date|
+------------------+-------------------+-------------------+-------------------+------------+
|u150209_20230115_0|                  1|             235478|2023-01-15 11:39:30|  2023-01-15|
|u212067_20230115_0|                  1|              82707|2023-01-15 06:16:18|  2023-01-15|
|u294155_20230115_0|                  1|              12907|2023-01-15 21:46:11|  2023-01-15|
|u296373_20230115_0|                  1|             222310|2023-01-15 08:49:20|  2023-01-15|
|u308654_20230115_0|                  1|              69602|2023-01-15 12:10:26|  2023-01-15|
|u384261_20230115_0|                  1|             270021|2023-01-15 13:40:07|  2023-01-15|
|u399815_20230115_0|                  1|             290451|2023-01-15 10:47:36|  2023-01-15|
|u482058_20230115_0|                  1|              28435|