In [None]:
import logging
from label_data_as_viral import compute_engagement
from video_processor import VideoProcessor
from pyspark.sql import SparkSession

# Obtain the Spark session via the builder (getOrCreate returns the existing
# session when one is already active, otherwise it creates one).
spark = SparkSession.builder.getOrCreate()

# Root logging setup: INFO and above, each record rendered as
# "<timestamp> - <logger name> - <level> - <message>".
# NOTE: logging.basicConfig does nothing when the root logger already has
# handlers attached, so re-running this cell does not stack handlers.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Module-named logger used by the cells below.
logger = logging.getLogger(__name__)

In [None]:
# Read the values published by the upstream bronze ingest task: the name of
# the bronze table and the id of the dataset to process in this run.
# NOTE(review): no debugValue is supplied, so these lookups presumably only
# succeed inside a job run -- confirm against the Databricks task-values docs.
upstream_task = "01_bronze_apify_ingest"
bronze_path = dbutils.jobs.taskValues.get(taskKey=upstream_task, key="bronze_path")
dataset_id = dbutils.jobs.taskValues.get(taskKey=upstream_task, key="dataset_id")

logger.info(f"Processing dataset_id={dataset_id}")

# Open the bronze table, then keep only the rows whose dataset_id column
# matches the id received from the upstream task.
bronze_all_df = spark.table(bronze_path)
bronze_df = bronze_all_df.where(bronze_all_df["dataset_id"] == dataset_id)

In [None]:
# Collect the filtered bronze rows into a pandas DataFrame so that
# compute_engagement can be applied to them.
# NOTE(review): toPandas() materialises every selected row on the driver;
# this assumes a single dataset fits in driver memory -- TODO confirm.
pdf = bronze_df.toPandas()

# The emoji in the log messages below were stored as mis-decoded UTF-8 bytes
# (mojibake); they are restored to the intended characters here.
logger.info(f"🏷️ Running viral labeling process for dataset_id {dataset_id}...")
labelled_df = compute_engagement(pdf)
logger.info("✅ Labeling complete.")
logger.info(f"✅ Labeled data with {len(labelled_df)} records.")

# Last expression of the cell: rendered as the cell output.
labelled_df

In [None]:
# Pass the labelled records to VideoProcessor.upload_video_df and keep the
# frame it returns for the silver write in the next cell.
vp = VideoProcessor()

# The emoji in this log message was stored as mis-decoded UTF-8 bytes
# (mojibake); it is restored to the intended character here.
logger.info(f"📹 Extracting videos for dataset_id {dataset_id}...")
transformed_df = vp.upload_video_df(labelled_df)

# Last expression of the cell: rendered as the cell output.
transformed_df

In [None]:
# Persist the transformed records to the silver Delta table.
silver_path = "workspace.test.nus_silver_instagram_transformed"

# Build a Spark DataFrame from the transformed frame (presumably a pandas
# DataFrame returned by the video upload step -- verify).
# NOTE(review): schema inference may fail if transformed_df has no rows;
# confirm whether an empty dataset can reach this cell.
spark_df = spark.createDataFrame(transformed_df)

# NOTE(review): append mode adds these rows to whatever the table already
# holds, so re-running this task for the same dataset_id would write the
# rows again -- confirm that retries are deduplicated downstream.
(
    spark_df.write
    .format("delta")
    .mode("append")
    .saveAsTable(silver_path)
)

# Publish the silver table name and the dataset_id as task values so that
# downstream tasks can look them up.
published_values = {"silver_path": silver_path, "dataset_id": dataset_id}
for value_key, value in published_values.items():
    dbutils.jobs.taskValues.set(key=value_key, value=value)

logger.info(f"Silver table written to {silver_path} for dataset_id={dataset_id}")
