# Bronze — Conveyor Multimodal Ops (Fabric Lakehouse)
Indexes raw conveyor videos in OneLake (Lakehouse Files) and writes Delta table `bronze_video_index`.

**Upload paths expected**
- `Files/bronze/videos/conveyer/raw_mp4/` (MP4/MKV)
- `Files/bronze/labels/conveyer/anomaly/` (optional JPG)
- `Files/bronze/labels/conveyer/normal/` (optional JPG)


In [None]:
from pyspark.sql import functions as F

# Lakehouse "Files" folders read by this notebook (relative paths).
# Raw conveyor recordings (MP4/MKV).
BRONZE_VIDEOS_SPARK = "Files/bronze/videos/conveyer/raw_mp4"
# Labeled JPG frames showing an anomaly.
ANOM_LABELS_SPARK = "Files/bronze/labels/conveyer/anomaly"
# Labeled JPG frames showing normal operation (optional).
NORM_LABELS_SPARK = "Files/bronze/labels/conveyer/normal"

# Echo the video folder so the run log shows which location is scanned.
print(f"BRONZE_VIDEOS_SPARK: {BRONZE_VIDEOS_SPARK}")

## Scan for videos in Bronze

In [None]:
def _list_video_paths(glob_pattern):
    """Return a one-column (`path`) DataFrame for the Bronze video folder.

    The folder is read with the `binaryFile` source, with `pathGlobFilter`
    set to `glob_pattern`; only the `path` column is kept.
    """
    reader = spark.read.format("binaryFile").option("pathGlobFilter", glob_pattern)
    return reader.load(BRONZE_VIDEOS_SPARK).select("path")


# One listing per supported container format.
mp4 = _list_video_paths("*.mp4")
mkv = _list_video_paths("*.mkv")

# Merge both listings by column name and collapse repeated paths.
videos = mp4.unionByName(mkv, allowMissingColumns=True).distinct()

# Preview the first 20 paths, then report the total number found.
display(videos.limit(20))
print("Found videos:", videos.count())

## Create `bronze_video_index`

In [None]:
# Build the Bronze video index: one row per video with a stable id and path.
_video_rows = (
    videos
    # File name = everything after the last "/" in the full path.
    .withColumn("video_file", F.regexp_extract("path", r"([^/]+)$", 1))
    # video_id = file name without its .mp4/.mkv extension.
    .withColumn("video_id", F.regexp_extract("video_file", r"^(.*)\.(mp4|mkv)$", 1))
    # Rewrite local-mount paths to Lakehouse-relative "Files/..." form.
    # The pattern is anchored at the start, so paths without that prefix
    # pass through unchanged and no separate startswith() guard is needed.
    .withColumn(
        "video_path",
        F.regexp_replace(F.col("path"), r"^/lakehouse/default/Files/", "Files/"),
    )
)

# Keep exactly one row per video_id. dropDuplicates(["video_id"]) would keep
# an arbitrary row when e.g. both a.mp4 and a.mkv exist, so the saved table
# could change between runs; taking the smallest path makes the choice
# reproducible.
bronze_video_index = (
    _video_rows
    .groupBy("video_id")
    .agg(F.min("video_path").alias("video_path"))
    .select("video_id", "video_path")
)

# Replace the Delta table with the freshly built index.
bronze_video_index.write.mode("overwrite").format("delta").saveAsTable("bronze_video_index")

# Preview and count from the saved table so the output reflects what was
# actually written, without re-running the file scan.
_written_index = spark.table("bronze_video_index")
display(_written_index.limit(50))
print("Wrote bronze_video_index:", _written_index.count())

## Sanity check labeled frames

In [None]:
def _preview_label_frames(label, folder):
    """Count and preview the JPG label frames in `folder`, best-effort.

    Args:
        label: Name used in the printed messages ("Anomaly" / "Normal").
        folder: Lakehouse path of the label folder to read.

    Returns:
        A one-column (`path`) DataFrame of the matching files, or None when
        the folder could not be read.
    """
    try:
        frames = (
            spark.read.format("binaryFile")
            .option("pathGlobFilter", "*.jpg")
            .load(folder)
            .select("path")
        )
        print(f"{label} label frames:", frames.count())
        display(frames.limit(10))
    except Exception as e:
        # Deliberately broad: label folders are optional, so any failure to
        # read one is reported and the sanity check carries on.
        print(f"{label} labels not found (ok):", e)
        return None
    return frames


# The notebook header lists both label folders as optional, so a missing
# anomaly folder gets the same best-effort handling as a missing normal
# folder instead of aborting the cell.
anom = _preview_label_frames("Anomaly", ANOM_LABELS_SPARK)
norm = _preview_label_frames("Normal", NORM_LABELS_SPARK)