In [1]:
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one; otherwise start a local
# session that uses every available core.
spark = SparkSession.builder.master("local[*]").getOrCreate()

Load the `.mp4` files with Spark's `binaryFile` data source, using the `pathGlobFilter` option to skip any other files in the directory.

In [2]:
import os
# One row per matching file: path, modificationTime, length and raw content.
df = (
    spark.read
    .format("binaryFile")
    .option("pathGlobFilter", "*.mp4")
    .load(os.path.join("/opt/workspace", "media/video"))
)
df.show(5)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/opt/workspa...|2023-03-16 15:17:52|297293|[00 00 00 20 66 7...|
|file:/opt/workspa...|2023-03-16 15:17:52|155292|[00 00 00 20 66 7...|
|file:/opt/workspa...|2023-03-16 15:17:52|130196|[00 00 00 20 66 7...|
|file:/opt/workspa...|2023-03-16 15:17:52|119643|[00 00 00 20 66 7...|
|file:/opt/workspa...|2023-03-16 15:17:52| 55801|[00 00 00 20 66 7...|
+--------------------+-------------------+------+--------------------+


Extract the frame count, frame rate (fps), and bitrate of each video file from its container metadata.

In [31]:
import pyspark.sql.functions as F
from pyspark.sql.pandas.functions import pandas_udf
import pyspark.sql.types as T
import pandas as pd

@pandas_udf(returnType=T.MapType(T.StringType(), T.IntegerType()))
def extract_number_of_frames(file_paths: pd.Series) -> pd.Series:
    """Read basic video metadata for every path in a batch using OpenCV.

    Despite the name, three properties are returned per file, not only the
    frame count.

    Args:
        file_paths: Series of file locations as strings. A leading ``file:``
            scheme (as seen in the ``path`` column this notebook passes in)
            is stripped before the path is handed to OpenCV.

    Returns:
        Series with one dict per input path, holding the keys
        ``"frame_count"``, ``"fps"`` and ``"bitrate"``. Each value is the
        corresponding OpenCV capture property truncated to ``int`` so that it
        fits the declared ``map<string, int>`` return type (a fractional
        frame rate such as 29.97 therefore becomes 29).
    """
    # Imported inside the UDF so the module is resolved in the process that
    # actually executes the function body.
    import cv2

    uri_scheme = "file:"

    def fn(path):
        # Strip only the leading scheme. A blanket str.replace would also
        # mangle paths that merely contain the text, e.g. ".../profile:1/".
        if path.startswith(uri_scheme):
            path = path[len(uri_scheme):]

        cap = cv2.VideoCapture(path)
        try:
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = int(cap.get(cv2.CAP_PROP_FPS))
            bitrate = int(cap.get(cv2.CAP_PROP_BITRATE))
        finally:
            # Always free the decoder/file handle; the worker process is
            # long-lived and would otherwise accumulate one per video.
            cap.release()

        return {"frame_count": frame_count, "fps": fps, "bitrate": bitrate}

    return pd.Series([fn(path) for path in file_paths.tolist()])


# Attach the per-file metadata map as a new column, then preview just that
# column without truncating the map contents.
features_df = df.withColumn(
    "video_features",
    extract_number_of_frames(F.col("path")),
)
features_df.select("video_features").show(5, truncate=False)

+-----------------------------------------------+
|video_features                                 |
+-----------------------------------------------+
|{frame_count -> 32, fps -> 10, bitrate -> 743} |
|{frame_count -> 13, fps -> 10, bitrate -> 955} |
|{frame_count -> 10, fps -> 10, bitrate -> 1041}|
|{frame_count -> 8, fps -> 10, bitrate -> 1196} |
|{frame_count -> 4, fps -> 10, bitrate -> 1116} |
+-----------------------------------------------+
