# Track 2: Human Action Recognition using NVIDIA VLM Workflow


API Key: <redacted> — supply your own NVIDIA API key at runtime; never publish a real key in the notebook.

In [None]:
!pip install gradio



In [None]:
import base64
import os
import re
from io import BytesIO

import gradio as gr
import requests
from moviepy.video.io.VideoFileClip import VideoFileClip
from PIL import Image


# Endpoint that query_action_detection posts its chat-style payload to.
API_URL = "https://ai.api.nvidia.com/v1/vlm/nvidia/neva-22b"
# The bearer token is read from the environment so that no credential is
# stored in the source. Set NVIDIA_API_KEY before running; when it is unset
# the key is an empty string and the API is expected to reject the request.
API_KEY = os.environ.get("NVIDIA_API_KEY", "")



def extract_frames(video_path, num_frames=16):
    """Sample evenly spaced frames from a video as PIL images.

    Frames are taken at times ``(i / num_frames) * duration`` for
    ``i = 0 .. num_frames - 1``, so the first sample is at t=0 and the last
    one falls before the end of the clip.

    Args:
        video_path: Path of the video file to open.
        num_frames: How many frames to sample.

    Returns:
        A list of ``PIL.Image`` objects, one per sampled frame.
    """
    video_clip = VideoFileClip(video_path)
    try:
        duration = video_clip.duration
        # The list is fully built inside the try block, so every frame has
        # been decoded before the clip is closed.
        return [
            Image.fromarray(video_clip.get_frame((i / num_frames) * duration))
            for i in range(num_frames)
        ]
    finally:
        # Release the underlying reader even when decoding fails; the
        # original never closed the clip.
        video_clip.close()


# Convert an image frame to a Base64-encoded string

def frame_to_base64(image_frame):
    """Serialize an image as PNG and return it as a Base64 text string.

    Args:
        image_frame: Object exposing ``save(stream, format=...)``, such as
            the PIL images produced by ``extract_frames``.

    Returns:
        The PNG bytes, Base64-encoded and decoded to a ``str``.
    """
    png_stream = BytesIO()
    image_frame.save(png_stream, format="PNG")
    png_bytes = png_stream.getvalue()
    encoded_bytes = base64.b64encode(png_bytes)
    return str(encoded_bytes, "utf-8")



# Upper bound, in seconds, on how long a single API call may block.
_REQUEST_TIMEOUT_SECONDS = 60


def _reply_says_yes(response_data):
    """Return True if the first choice's message text contains the word "yes"."""
    if not isinstance(response_data, dict):
        return False
    # ``or`` fallbacks cover both a missing key and an explicit null/empty
    # value, so an empty ``choices`` list no longer raises IndexError.
    choices = response_data.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""
    # Match "yes" as a whole word; a plain substring test would also accept
    # words such as "eyes".
    return re.search(r"\byes\b", content, flags=re.IGNORECASE) is not None


def query_action_detection(encoded_image, activity_description):
    """Ask the NVIDIA VLM endpoint whether an image shows the given activity.

    Args:
        encoded_image: Base64-encoded PNG data, embedded in the prompt as a
            ``data:image/png;base64`` URI.
        activity_description: Free-text description of the action to look for.

    Returns:
        True when the model's reply contains the word "yes"
        (case-insensitive), False otherwise.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Accept": "application/json",
    }
    payload = {
        "messages": [
            {
                "role": "user",
                "content": (
                    f'Do you see someone performing "{activity_description}" in this image? '
                    f'<img src="data:image/png;base64,{encoded_image}" />'
                ),
            }
        ],
        "max_tokens": 1024,
        "temperature": 0.2,
        "top_p": 0.7,
        "seed": 0,
        "stream": False,
    }

    response = requests.post(
        API_URL,
        headers=headers,
        json=payload,
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )
    # Surface auth/quota/server errors instead of silently counting them as
    # "action not detected".
    response.raise_for_status()

    return _reply_says_yes(response.json())




def compute_detection_accuracy(frames, activity_description):
    """Return the percentage of frames in which the activity was detected.

    Each frame is Base64-encoded and sent to ``query_action_detection``; the
    result is the share of frames that were answered positively.

    Args:
        frames: Sequence of image frames accepted by ``frame_to_base64``.
        activity_description: Description of the action to look for.

    Returns:
        A float in the range 0-100. An empty ``frames`` sequence yields 0.0
        rather than raising ``ZeroDivisionError``.
    """
    if not frames:
        return 0.0
    detected_count = sum(
        1
        for frame in frames
        if query_action_detection(frame_to_base64(frame), activity_description)
    )
    return (detected_count / len(frames)) * 100



def process_videos(video1_path, video2_path, action):
    """Score both videos for the requested action and format the results.

    Args:
        video1_path: Path of the synthetic video.
        video2_path: Path of the real video.
        action: Description of the action to detect.

    Returns:
        A 2-tuple of strings, one per output textbox, holding the detection
        percentage for the synthetic and the real video. On failure the
        first element is an error message and the second is ``None``.
    """
    # Validate up front so a missing upload or blank action produces a clear
    # message instead of an obscure error from the video or API layer.
    if not video1_path or not video2_path:
        return (
            "Error processing videos: please upload both a synthetic and a real video.",
            None,
        )
    if not action or not action.strip():
        return "Error processing videos: please enter the action to detect.", None
    action = action.strip()

    # This function is the UI boundary: any failure is reported in the
    # textbox rather than propagated to the caller.
    try:
        frames_video1 = extract_frames(video1_path)
        frames_video2 = extract_frames(video2_path)

        accuracy_video1 = compute_detection_accuracy(frames_video1, action)
        accuracy_video2 = compute_detection_accuracy(frames_video2, action)

        return (
            f"Detection Accuracy for Synthetic Video: {accuracy_video1:.2f}%",
            f"Detection Accuracy for Real Video: {accuracy_video2:.2f}%",
        )
    except Exception as e:
        return f"Error processing videos: {str(e)}", None



def build_interface():
    """Assemble the Gradio Blocks UI for comparing two videos.

    The layout is a heading, a row holding the two video uploads and the
    action textbox, two result textboxes, and a button that runs
    ``process_videos`` on the inputs.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    demo = gr.Blocks()
    with demo:
        gr.Markdown("### Compare Action Detection in Videos")

        # Inputs share a single row.
        with gr.Row():
            synthetic_video = gr.Video(label="Upload Synthetic Video")
            real_video = gr.Video(label="Upload Real Video")
            action_box = gr.Textbox(label="Enter the Action (e.g., jumping, running)")

        # One output textbox per video.
        synthetic_result = gr.Textbox(label="Synthetic Video Analysis")
        real_result = gr.Textbox(label="Real Video Analysis")

        analyze_button = gr.Button("Analyze Videos")

        # Wire the button to the processing function.
        click_inputs = [synthetic_video, real_video, action_box]
        click_outputs = [synthetic_result, real_result]
        analyze_button.click(
            fn=process_videos,
            inputs=click_inputs,
            outputs=click_outputs,
        )

    return demo


# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    interface = build_interface()
    interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9920ad2e4f601c0e5d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
