In [None]:
!pip install gradio
!pip install --upgrade -q accelerate bitsandbytes
!pip install git+https://github.com/huggingface/transformers.git
!pip install -q av

In [3]:
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
import torch

# Local snapshot directory of the LLaVA-NeXT-Video checkpoint.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_path = "/home/manish/final/LLaVA-NeXT-Video-7B-DPO-hf/models--llava-hf--LLaVA-NeXT-Video-7B-DPO-hf/snapshots/d30328aaf846b128f919353e18bd0cfe5fcdfd10"

# Load the weights in 4-bit via bitsandbytes, computing in float16.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# The processor and the model are both read from the same snapshot directory;
# `device_map="auto"` leaves device placement to the library.
processor = LlavaNextVideoProcessor.from_pretrained(model_path)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=quantization_config,
)

  from .autonotebook import tqdm as notebook_tqdm
Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.69s/it]


In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from pydantic import BaseModel
from typing import Dict
import torch
import warnings
import json

# Suppress every Python warning from this point on.
warnings.filterwarnings("ignore")

# Seed torch's default random number generator so sampling is repeatable.
torch.manual_seed(0)

# Model Loading for Logic Extraction
class ModelLoader:
    """Lazily builds and caches one text-generation pipeline on the class.

    The model, tokenizer and pipeline are stored as class attributes, so they
    are created at most once per process. The cache is not keyed by path:
    once a pipeline exists, later calls return it regardless of the
    ``model_path`` they pass.
    """

    _model = None
    _tokenizer = None
    _pipe = None

    @classmethod
    def load_model(cls, model_path):
        """Return the cached text-generation pipeline, building it if needed.

        Args:
            model_path: Path or identifier passed to ``from_pretrained`` for
                both the model and the tokenizer. Only used for pieces that
                are not cached yet.

        Returns:
            The ``pipeline("text-generation", ...)`` object.
        """
        # Each piece is guarded on its own. Previously only the model and
        # tokenizer were checked, so a failure while building the pipeline
        # left `_pipe` as None and every later call returned None.
        if cls._model is None:
            cls._model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="cuda",
                torch_dtype="auto",
                trust_remote_code=True,
            )
        if cls._tokenizer is None:
            cls._tokenizer = AutoTokenizer.from_pretrained(model_path)
        if cls._pipe is None:
            cls._pipe = pipeline(
                "text-generation",
                model=cls._model,
                tokenizer=cls._tokenizer,
            )
        return cls._pipe

# Keyword arguments forwarded to the text-generation pipeline on every call.
generation_args = dict(
    max_new_tokens=50,
    return_full_text=False,
    temperature=0.1,
    do_sample=True,
)

class LLMHelper:
    """Turns a free-text video description into a JSON string of binary flags."""

    def __init__(self, pipeline):
        """Store the callable used for generation.

        Args:
            pipeline: Callable invoked as ``pipeline(messages, **generation_args)``
                that returns a list whose first item has a ``'generated_text'`` key.
        """
        self.chatbot = pipeline

    @staticmethod
    def _extract_json_object(generated_text: str) -> str:
        """Return the first substring of ``generated_text`` that parses as a JSON object.

        Every ``{`` is tried in order as a possible start, so braces in
        surrounding chatter (before or after the object) do not corrupt the
        result. If the outermost object is malformed but a nested one parses,
        the nested one is returned. If nothing parses, the text between the
        first ``{`` and the last ``}`` is returned unchanged, which may be an
        empty string.
        """
        decoder = json.JSONDecoder()
        start = generated_text.find('{')
        while start != -1:
            try:
                # raw_decode stops at the end of the first complete JSON value
                # and reports the index where it ended.
                _, end = decoder.raw_decode(generated_text, start)
            except json.JSONDecodeError:
                start = generated_text.find('{', start + 1)
                continue
            return generated_text[start:end]

        # No decodable object: fall back to the plain first-'{' to last-'}' slice.
        start_index = generated_text.find('{')
        end_index = generated_text.rfind('}') + 1
        return generated_text[start_index:end_index]

    def generate_logic(self, llm_output: str):
        """Ask the model to express ``llm_output`` as a JSON string of four flags.

        Args:
            llm_output: Free-text description of the video; it is embedded in
                both the system message and the user prompt.

        Returns:
            The JSON object extracted from the model's reply, as a string. It
            may be an empty or non-JSON string if the reply contained no
            parseable object.
        """
        prompt = f"""
        Provide the response in json string for the below keys and context based on the description: '{llm_output}'.
        
        Screen.interaction_yes: This field indicates whether there was an interaction of the person with a screen during the activity. A value of 1 means there was screen interaction (Yes), and a value of 0 means there was no screen interaction (No).
        Hands.free: This field indicates whether the person's hands were free during the activity. A value of 1 means the person was not holding anything (Yes), indicating free hands. A value of 0 means the person was holding something (No), indicating the hands were not free.
        Indoors: This field indicates whether the activity took place indoors. A value of 1 means the activity occurred inside a building or enclosed space (Yes), and a value of 0 means the activity took place outside (No).
        Standing: This field indicates whether the person was standing during the activity. A value of 1 means the person was standing (Yes), and a value of 0 means the person was not standing (No).
        """

        messages = [
            {"role": "system", "content": "Please answer questions just based on this information: " + llm_output},
            {"role": "user", "content": prompt},
        ]

        response = self.chatbot(messages, **generation_args)
        generated_text = response[0]['generated_text']
        # Extract JSON from the generated text
        return self._extract_json_object(generated_text)

# Structured result of the logic-extraction step: one integer flag per
# question, using the 1 = yes / 0 = no convention from the prompt.
class VideoAnalysis(BaseModel):
    screen_interaction_yes: int
    hands_free: int
    indoors: int
    standing: int

    @classmethod
    def from_llm_output(cls, llm_output: str, generated_logic: str) -> 'VideoAnalysis':
        """Build a ``VideoAnalysis`` from the JSON string produced by the LLM.

        Args:
            llm_output: Original free-text description (accepted but not used here).
            generated_logic: JSON string keyed by the names requested in the prompt.

        Returns:
            A ``VideoAnalysis`` in which any key missing from the JSON is 0.
        """
        parsed = json.loads(generated_logic)

        # Model field name -> key name the prompt asks the LLM to emit.
        field_to_key = {
            "screen_interaction_yes": "Screen.interaction_yes",
            "hands_free": "Hands.free",
            "indoors": "Indoors",
            "standing": "Standing",
        }
        values = {field: parsed.get(key, 0) for field, key in field_to_key.items()}
        return cls(**values)

# Load the logic extraction model
# NOTE(review): this rebinds the module-level `model_path`, which an earlier
# cell set to the LLaVA snapshot directory. The path is absolute and
# machine-specific — adjust it before running on another machine.
model_path = "/home/manish/Super-Rapid-Annotator-Multimodal-Annotation-Tool/models/Phi3"
# ModelLoader caches the pipeline on the class, so re-running this cell reuses it.
logic_pipeline = ModelLoader.load_model(model_path)
# Helper that wraps the pipeline for extracting JSON flags from a description.
llm_helper = LLMHelper(logic_pipeline)



`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]


In [34]:
import gradio as gr
import numpy as np
import av
import os
import json

def read_video_pyav(container, indices):
    """Decode selected frames from a PyAV container into one RGB array.

    Args:
        container (av.container.input.InputContainer): Opened PyAV container.
        indices (List[int]): Ascending frame positions to keep.

    Returns:
        np.ndarray: The kept frames stacked along a new leading axis, each
        converted with ``to_ndarray(format="rgb24")``.
    """
    container.seek(0)
    first, last = indices[0], indices[-1]

    selected = []
    for position, frame in enumerate(container.decode(video=0)):
        # Nothing beyond the last requested index is needed.
        if position > last:
            break
        if position < first:
            continue
        if position in indices:
            selected.append(frame)

    arrays = [frame.to_ndarray(format="rgb24") for frame in selected]
    return np.stack(arrays)


def process_video(video_file, question, num_frames=8):
    """Sample frames from one video and ask the video-language model a question.

    Args:
        video_file: Uploaded file object exposing the path as ``.name``; a
            plain path string is also accepted.
        question: Text placed in the user turn alongside the video.
        num_frames: Number of evenly spaced frame positions to sample
            (default 8). Videos shorter than this contribute fewer frames.

    Returns:
        str: The decoded generation with everything up to and including the
        first ``"ASSISTANT: "`` marker removed, stripped of whitespace.

    Raises:
        ValueError: If ``num_frames`` is less than 1 or no video frames can
            be decoded.
    """
    if num_frames < 1:
        raise ValueError("num_frames must be at least 1")

    video_path = getattr(video_file, "name", video_file)

    # Open video and sample frames
    with av.open(video_path) as container:
        total_frames = container.streams.video[0].frames
        if total_frames <= 0:
            # Some containers carry no frame count in their metadata, and a
            # zero count makes the sampling step zero. Count by decoding
            # instead; read_video_pyav seeks back to the start afterwards.
            total_frames = sum(1 for _ in container.decode(video=0))
        if total_frames <= 0:
            raise ValueError(f"No decodable video frames found in {video_path!r}")
        indices = np.arange(0, total_frames, total_frames / num_frames).astype(int)
        video_clip = read_video_pyav(container, indices)

    # Prepare conversation
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"{question}"},
                {"type": "video"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    # Prepare inputs for the model (named `inputs` to avoid shadowing the builtin `input`)
    inputs = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)

    # Generate output. Decoding is greedy (do_sample=False); `top_p` only
    # applies when sampling, so it is not passed.
    generate_kwargs = {"max_new_tokens": 3000, "do_sample": False}
    output = model.generate(**inputs, **generate_kwargs)
    generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]

    # The decoded text contains the prompt too; keep only what follows the assistant marker.
    return generated_text.split("ASSISTANT: ", 1)[-1].strip()


def process_videos(video_files, question):
    """Run ``process_video`` on each video and combine the answers into one string.

    Args:
        video_files: Iterable of uploaded file objects. The display name comes
            from ``.name`` when present, otherwise the item itself is treated
            as a path. ``None`` (nothing uploaded) is treated as no videos.
        question: Question passed unchanged to ``process_video`` for every video.

    Returns:
        str: One ``**Video: <basename>**`` section per video, separated by
        ``---`` rules; an empty string when there are no videos.
    """
    answers = []
    # `or []` covers the None passed when the upload field is left empty.
    for video_file in video_files or []:
        video_name = os.path.basename(getattr(video_file, "name", video_file))
        answer = process_video(video_file, question)
        answers.append(f"**Video: {video_name}**\n{answer}\n")
    return "\n---\n".join(answers)





In [37]:
# Define Gradio interface for multiple videos
def gradio_interface(videos, indoors_outdoors, standing_sitting, hands_free, interacting_screen):
    """Build the prompt from the selected checkboxes and answer it for each video.

    Args:
        videos: Uploaded video files, passed on to ``process_videos``.
        indoors_outdoors: Include the indoor/outdoor instruction.
        standing_sitting: Include the posture instruction.
        hands_free: Include the hands-free instruction.
        interacting_screen: Include the screen-interaction instruction.

    Returns:
        Whatever ``process_videos`` returns for the assembled question.
    """
    # Prompt sentences are collected and joined with single spaces; plain
    # concatenation ran them together ("...made.Consider...").
    parts = ["For each question, analyze the given video carefully and base your answers on the observations made."]
    if indoors_outdoors:
        parts.append("Consider the broader environmental context shown in the video’s background. Are there signs of an open-air space, like greenery, structures, or people passing by? If so, it’s an outdoor setting. If the setting looks confined with furniture, walls, or home decorations, it’s an indoor environment.")
    if standing_sitting:
        parts.append("Evaluate the subject’s body posture and movement within the video. Are they standing upright with both feet planted firmly on the ground? If so, they are standing.")
    if hands_free:
        parts.append("Examine the subject’s right and left hands in the video to check if they are holding anything like a microphone, book, paper(White color), object, or any electronic device, try segmentations and decide if the hands are free or not.")
    if interacting_screen:
        parts.append("Assess the surroundings behind the subject in the video. Do they seem to interact with any visible screens, such as laptops, TVs, or digital billboards? If yes, then they are interacting with a screen. If not, they are not interacting with a screen.")
    # Closing instruction, always appended last.
    parts.append("By taking these factors into account when watching the video, please answer the questions accurately.")
    question = " ".join(parts)
    return process_videos(videos, question)


# Gradio UI: one multi-file upload followed by four unchecked checkboxes,
# listed in the same order as the boolean parameters of `gradio_interface`.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload Videos", file_count="multiple"),
        *(
            gr.Checkbox(label=caption, value=False)
            for caption in (
                "Indoors or Outdoors",
                "Standing or Sitting",
                "Hands Free or Not",
                "Interacting with Screen",
            )
        ),
    ],
    outputs=gr.Textbox(label="Generated Answers"),
    title="Video Question Answering",
    description="Upload multiple videos and select questions to get answers.",
)


if __name__ == "__main__":
    # NOTE(review): share=True also publishes the app on a public gradio.live
    # URL (see the cell output) — confirm that exposure is intended.
    iface.launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://a9658b152670d36655.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
