Requirements

In [9]:
!pip install ultralytics supervision easyocr opencv-python-headless



Imports

In [10]:
import numpy as np
from ultralytics import YOLO
import time
import cv2
import easyocr
import json


main function

In [11]:




def main(
    src_path,
    output_json,
    hist_diff_threshold=0.8,
    ocr_confidence_threshold=0.6,
    word_count=2,
    cut_frames_threshold=10,
    ocr_div=2,
):
    """Extract summary features from a video and save them as JSON.

    For every frame the function:
      * compares the 8x8x8 color histogram with the previous frame
        (correlation) to count hard cuts,
      * computes Farneback optical flow on a 320x180 grayscale copy and
        records the mean flow magnitude,
      * runs EasyOCR every ``ocr_interval`` frames and collects the
        distinct text snippets whose confidence exceeds the threshold,
      * runs YOLOv8 tracking (ByteTrack) and collects distinct track ids,
        split into people (class id 0) and everything else.

    The resulting summary is written to ``output_json`` and printed.

    Args:
        src_path: Path of the video file to analyze.
        output_json: Path of the JSON file to write the summary to.
        hist_diff_threshold: A frame is a cut candidate when the histogram
            correlation with the previous frame drops below this value.
        ocr_confidence_threshold: OCR detections with a confidence at or
            below this value are ignored.
        word_count: Kept for backward compatibility. The per-frame keyword
            preview it used to build was never emitted, so it has no effect
            on the output.
        cut_frames_threshold: Minimum number of frames that must have been
            processed since the previous cut (or the start of the video)
            before another cut is counted.
        ocr_div: OCR runs every ``fps // ocr_div`` frames (at least every
            frame when that quotient would be zero).

    Returns:
        None. If the video cannot be opened an error is printed and no JSON
        file is written.

    Raises:
        ValueError: If ``ocr_div`` is not positive.
    """
    if ocr_div <= 0:
        raise ValueError(f"ocr_div must be positive, got {ocr_div!r}")

    # Accumulators; sets give O(1) membership tests for track ids / keywords.
    people_ids = set()
    object_ids = set()
    keywords = []            # distinct OCR texts in first-seen order
    seen_keywords = set()
    motions = []             # one mean flow magnitude per processed frame
    cut_count = 0
    frames_since_cut = 0     # saturates at cut_frames_threshold
    text_frames_count = 0
    frames_processed = 0
    prev_hist = None
    prev_gray_small = None

    cap = cv2.VideoCapture(src_path)
    try:
        if not cap.isOpened():
            # Fail fast: do not load the models or write an all-zero summary.
            print("Error reading video file")
            return

        model = YOLO("yolov8m.pt")
        ocr = easyocr.Reader(['en'], gpu=True)

        # FPS may be reported as 0 (unknown) or be smaller than ocr_div; an
        # interval of 0 would make the modulo below raise ZeroDivisionError.
        fps = cap.get(cv2.CAP_PROP_FPS)
        ocr_interval = max(1, int(fps // ocr_div)) if fps > 0 else 1

        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    print("Video stream successfully ended. Exiting..")
                    break
                frames_processed += 1

                # --- Hard-cut detection via color histogram correlation ---
                current_hist = cv2.calcHist(
                    [frame], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256]
                )
                cv2.normalize(current_hist, current_hist)

                if prev_hist is not None:
                    hist_correlation = cv2.compareHist(
                        prev_hist, current_hist, cv2.HISTCMP_CORREL
                    )
                    if (
                        hist_correlation < hist_diff_threshold
                        and frames_since_cut >= cut_frames_threshold
                    ):
                        cut_count += 1
                        frames_since_cut = 0
                        # Do not measure motion across a cut.
                        prev_gray_small = None
                prev_hist = current_hist

                # --- Motion via dense optical flow on a downscaled frame ---
                gray_small = cv2.resize(
                    cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), (320, 180)
                )
                if prev_gray_small is not None:
                    flow = cv2.calcOpticalFlowFarneback(
                        prev_gray_small, gray_small, None, 0.5, 3, 15, 3, 5, 1.2, 0
                    )
                    magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
                    motions.append(float(np.mean(magnitude)))
                else:
                    # First frame, or first frame after a cut.
                    motions.append(0.0)
                prev_gray_small = gray_small

                # --- OCR on every ocr_interval-th frame ---
                if frames_processed % ocr_interval == 0:
                    text_found = False
                    for _bbox, text, confidence in ocr.readtext(frame):
                        if confidence > ocr_confidence_threshold:
                            text_found = True
                            keyword = text.strip()
                            if keyword not in seen_keywords:
                                seen_keywords.add(keyword)
                                keywords.append(keyword)
                    if text_found:
                        # A sampled frame stands in for the whole interval.
                        text_frames_count += ocr_interval

                # --- Object tracking ---
                result = model.track(
                    frame, persist=True, verbose=False, tracker="bytetrack.yaml"
                )[0]
                if result.boxes is not None and result.boxes.id is not None:
                    track_ids = result.boxes.id.cpu().numpy().astype(int)
                    class_ids = result.boxes.cls.cpu().numpy().astype(int)
                    for track_id, class_id in zip(track_ids, class_ids):
                        # Class id 0 is counted as a person, anything else as an object.
                        if class_id == 0:
                            people_ids.add(int(track_id))
                        else:
                            object_ids.add(int(track_id))

                if frames_since_cut < cut_frames_threshold:
                    frames_since_cut += 1
        except Exception as exc:
            # Best effort: keep the statistics gathered so far, but report the
            # real failure instead of pretending the video simply ended.
            print(f"Processing stopped early due to an error: {exc!r}")
    finally:
        cap.release()

    avg_motion = float(np.mean(motions)) if motions else 0.0

    if frames_processed > 0:
        text_present_ratio = (text_frames_count / frames_processed) * 100
    else:
        text_present_ratio = 0.0

    output = {
        "People versus objects ratio": f"{len(people_ids)} : {len(object_ids)}",
        "No. of hard cuts": cut_count,
        "Mean motion": f" {float(round(avg_motion,4))}",
        "Text Presence Ratio": f" {text_present_ratio:.2f}%",
    }

    if len(keywords) > 0:
        output["keywords"] = keywords

    with open(output_json, 'w') as f:
        json.dump(output, f, indent=4)

    print(json.dumps(output, indent=4))







# Set all your necessary thresholds and values
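- `src_path` -> Path to your local video file.
- `output_json` -> Path of the JSON file the extracted features are written to.
- `hist_diff_threshold` -> Histogram correlation between consecutive frames below which a frame is treated as a hard-cut candidate (a higher value makes cut detection more sensitive).
- `ocr_confidence_threshold` -> Confidence above which detected text is kept.
- `word_count` -> How many of the detected words to keep per OCR frame (currently this does not change the saved output).
- `cut_frames_threshold` -> Minimum number of frames that must pass after a hard cut before another one can be counted.
- `ocr_div` -> OCR runs every `fps // ocr_div` frames, i.e. roughly `ocr_div` times per second of video.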

In [12]:
if __name__ == "__main__":
    # Input video and destination of the extracted features.
    src_path = '/content/input_video.mp4'
    output_json = '/content/features.json'

    # Tunable thresholds (see the description above this cell).
    hist_diff_threshold = 0.8
    ocr_confidence_threshold = 0.6
    word_count = 2
    cut_frames_threshold = 10
    ocr_div = 2

    # Keyword arguments make the mapping of values to parameters explicit.
    main(
        src_path=src_path,
        output_json=output_json,
        hist_diff_threshold=hist_diff_threshold,
        ocr_confidence_threshold=ocr_confidence_threshold,
        word_count=word_count,
        cut_frames_threshold=cut_frames_threshold,
        ocr_div=ocr_div,
    )



[31m[1mrequirements:[0m Ultralytics requirement ['lap>=0.5.12'] not found, attempting AutoUpdate...
Using Python 3.12.12 environment at: /usr
Resolved 2 packages in 203ms
Prepared 1 package in 59ms
Installed 1 package in 5ms
 + lap==0.5.12

[31m[1mrequirements:[0m AutoUpdate success ✅ 0.7s

Video stream successfully ended. Exiting..
{
    "People versus objects ratio": "1 : 4",
    "No. of hard cuts": 2,
    "Mean motion": " 0.5498",
    "Text Presence Ratio": " 0.00% )"
}
