Run this code on Kaggle using a P100 GPU if you don't have high-end hardware resources.

In [None]:
!pip install git+https://github.com/huggingface/transformers accelerate flash_attn
!pip install qwen_vl_utils av
!pip install streamlit pyngrok

In [None]:
%%writefile app.py

import os
import tempfile

import streamlit as st
import torch

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Page title
st.title("FRAME-BASED VIDEO QUESTION ANSWERING SYSTEM")

@st.cache_resource(show_spinner=False)
def load_model_and_processor():
    """Load the Qwen2-VL model and its processor.

    The function is wrapped in ``st.cache_resource`` so the returned objects
    are reused instead of being reloaded every time the script reruns.

    Returns:
        tuple: ``(model, processor, device)`` where ``device`` is the device
        the model reports for itself (``model.device``); inputs should be
        moved there before calling ``model.generate``.
    """
    model_name = "Qwen/Qwen2-VL-2B-Instruct"

    # device_map="auto" already places the weights on the available hardware.
    # The model must not be moved again with .to() afterwards: that is
    # redundant on a single device and conflicts with the dispatch when the
    # map spans several devices or offloads weights.
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name, torch_dtype=torch.bfloat16, attn_implementation="sdpa", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(model_name)

    # Report the device the model actually ended up on rather than guessing
    # from torch.cuda.is_available().
    device = model.device

    return model, processor, device

# Fetch the model, processor and target device (cached via st.cache_resource)
model, processor, device = load_model_and_processor()

# File uploader (restricted to .mp4) and free-text question input
video_file = st.file_uploader("Upload a video", type=["mp4"])
question = st.text_input("Enter your question")

if st.button("Submit"):
    # A whitespace-only question is treated the same as an empty one.
    if video_file and question.strip():
        # Save the uploaded video to a temporary file, because the video entry
        # in the message below refers to it by path. getvalue() returns the
        # whole upload regardless of the stream's current read position.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
            temp_file.write(video_file.getvalue())
            temp_file_path = temp_file.name  # Capture the path

        try:
            # Define the message structure
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "video",
                            "video": temp_file_path,
                            "max_pixels": 512 * 512,
                            "fps": 1.0,
                        },
                        {"type": "text", "text": question},
                    ],
                }
            ]

            # Prepare the input for the model
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)

            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )

            # Move inputs to the same device as the model
            inputs = inputs.to(device)

            # Generate the response
            with torch.no_grad():  # To avoid unnecessary gradient computations
                generated_ids = model.generate(**inputs, max_new_tokens=512)

            # generate() returns prompt + continuation; drop the prompt tokens
            # so only the newly generated answer is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]

            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
        finally:
            # The file was created with delete=False, so nothing else removes
            # it; clean up whether or not inference succeeded.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        # Display the output text
        if output_text:
            st.write(output_text[0])
        else:
            st.write("No response generated.")
    else:
        st.write("Please upload a video and enter a question.")

In [None]:
!ngrok authtoken "Add your ngrok authentication token"
# Get your token from https://dashboard.ngrok.com/get-started/your-authtoken

In [None]:
# Run the Streamlit app and expose it via ngrok
import subprocess
from pyngrok import ngrok

# Port the ngrok tunnel is pointed at
STREAMLIT_PORT = 8501

# Launch Streamlit as a child process so this cell does not block
streamlit_command = ['streamlit', 'run', 'app.py']
process = subprocess.Popen(streamlit_command)

# Open an ngrok tunnel to that port and report where the app can be reached
public_url = ngrok.connect(STREAMLIT_PORT)
print(f"Streamlit app is live at: {public_url}")
