<a href="https://colab.research.google.com/github/m-hasan-n/Tutorials/blob/main/Video_Understanding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install moviepy transformers torchvision torchaudio

In [None]:
# Colab helper setup. The install must run before the matching import.
# NOTE(review): `drive.link_nbs()` presumably mounts Google Drive and makes the
# Drive notebooks folder importable -- confirm against the kora documentation.
# The paths used later under /content/drive appear to rely on Drive being mounted.
!pip install kora -q
from kora import drive
drive.link_nbs()

# NOTE(review): import_ipynb presumably allows `import some_notebook` for .ipynb
# files; no cell shown in this notebook imports another notebook -- confirm it is
# still needed.
!pip install import-ipynb
import import_ipynb

In [None]:
import os

import torch

# Folder holding the input video (a Google Drive path inside the Colab runtime).
data_dir = '/content/drive/MyDrive/MultiModalAI'
video_path = os.path.join(data_dir, "car_tyres.mp4")

# Use the GPU when one is attached; otherwise fall back to CPU so that the
# later `.to(device)` calls do not fail on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
!pip install -q ffmpeg-python

from transformers import pipeline
import torchaudio
import ffmpeg
import torch
import io

# Use ffmpeg to extract audio directly into memory as bytes
def extract_audio_from_video(video_path, sr=16000):
    """Decode a video's audio track into an in-memory mono waveform.

    ffmpeg converts the audio to single-channel WAV at ``sr`` Hz and writes it
    to stdout, so no temporary file is created. The bytes are then parsed with
    ``torchaudio.load``.

    Args:
        video_path: Path of the video file handed to ffmpeg.
        sr: Target sampling rate in Hz requested from ffmpeg (default 16000).

    Returns:
        A ``(audio_tensor, sample_rate)`` tuple. ``audio_tensor`` is the loaded
        waveform with dimension 0 squeezed out (the single channel requested via
        ``ac=1``); ``sample_rate`` is the rate reported by ``torchaudio.load``.

    Raises:
        ffmpeg.Error: If the ffmpeg process fails. Its captured stderr is
            printed first so the cause is visible.
    """
    import sys  # local import: only needed for error reporting

    try:
        out, _ = (
            ffmpeg
            .input(video_path)
            .output("pipe:", format="wav", ac=1, ar=sr)
            .run(capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as err:
        # stderr is captured, so without this the only message the user sees is
        # the generic "see stderr output for detail" from ffmpeg-python.
        if err.stderr:
            print(err.stderr.decode(errors="replace"), file=sys.stderr)
        raise

    audio_tensor, sample_rate = torchaudio.load(io.BytesIO(out))
    return audio_tensor.squeeze(0), sample_rate  # squeeze to remove extra channel dim

# Extract audio
audio_tensor, sample_rate = extract_audio_from_video(video_path)

# Load Whisper model via HF pipeline, on the same device as the other models
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)

# Transcribe. Passing the sampling rate alongside the samples lets the pipeline
# resample if it ever differs from what the model expects; a bare array would be
# assumed to already be at the model's rate.
transcription = asr(
    {"raw": audio_tensor.numpy(), "sampling_rate": sample_rate},
    return_timestamps=True,
)
print("Transcript:", transcription["text"])


In [None]:
from PIL import Image

# MoviePy 2.x removed the `moviepy.editor` module and exposes VideoFileClip at
# the package root; keep the `mp` alias working on both major versions.
try:
    import moviepy.editor as mp
except ImportError:
    import moviepy as mp

# Load video
clip = mp.VideoFileClip(video_path)

# Extract one frame every `skip` seconds and keep it in memory as an RGB PIL image.
skip=2
extrated_frames = []
for t in range(0, int(clip.duration), skip):
    frame = clip.get_frame(t)
    img = Image.fromarray(frame).convert('RGB')
    extrated_frames.append(img)

In [None]:
# Report how many key frames the chosen sampling interval produced.
print(
    f'For the video duration of {clip.duration}s, '
    f'extracting a frame every {skip}s '
    f'resulted in {len(extrated_frames)} key frames.'
)


In [None]:
#  Describe Video Frames with BLIP
from transformers import BlipProcessor, BlipForConditionalGeneration

# The processor and the captioning model are loaded from the same checkpoint;
# only the model is moved to the configured device.
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Caption the frames one at a time, in the order they were extracted.
frame_captions = []
for frame_image in extrated_frames:
    model_inputs = caption_processor(images=frame_image, return_tensors="pt").to(device)
    generated_ids = caption_model.generate(**model_inputs)
    frame_captions.append(
        caption_processor.decode(generated_ids[0], skip_special_tokens=True)
    )

In [None]:
# Show the captions collected for the sampled frames, in the order they were generated.
print("Frame Captions:", frame_captions)

In [None]:
# Summarize the image captions
# Keep only captions that are not near-duplicates of one already kept, judged by
# the cosine similarity of their sentence embeddings.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")
# Encode every caption exactly once; skip the call entirely when there is nothing to encode.
embeddings = model.encode(frame_captions, convert_to_tensor=True) if frame_captions else []
sim_threshold = 0.9
unique_captions = []
unique_embeddings = []  # embeddings of the captions kept so far, parallel to unique_captions
for i, emb in enumerate(embeddings):
    # Compare against the cached embeddings instead of re-encoding the kept
    # captions on every iteration (which also meant encoding an empty list on
    # the first pass).
    if not any(util.pytorch_cos_sim(emb, e) > sim_threshold for e in unique_embeddings):
        unique_captions.append(frame_captions[i])
        unique_embeddings.append(emb)

In [None]:

# Report how many captions survived the similarity filter, then list them.
print(len(unique_captions))
print("Unique Frame Captions:", unique_captions)

In [None]:
!pip install -q accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load Phi-2 model and tokenizer
model_id = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto"
)

# Construct prompt
prompt = f"""### Instruction:
Summarize the video using both the visual description and spoken transcript.

Visual Description: {'; '.join(unique_captions)}

Spoken Transcript: {transcription['text']}

Video Summary:"""

# Generate output
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=150)

output = pipe(prompt)[0]['generated_text']

In [17]:

# Show the 'generated_text' string that the text-generation cell stored in `output`.
print(output)

### Instruction:
Summarize the video using both the visual description and spoken transcript.

Visual Description: a man kneeling down to a car with leaves on the ground; a man is fixing a car tire; a man changing the tire on a car; a man is trying to get into a car; a white car with a black hood and hood; a man is opening the trunk of his car; a man is holding a cell phone in his hand; a man is holding a bottle of wine; a car is parked on the side of the road; a close up of a tire on a car; a person is using a small device to remove a piece of metal; a person is using a small device to test the compressor; a person is using a small orange hose to connect a small orange hose; a person is holding a small orange object; a person is using a small orange hose to fix a compressor; a man kneeling down to fix a car; a man is opening the door of his car; a person is holding a car key in their hand; a person is holding the steering wheel and pressing the button; a person pressing a button on a 

In [None]:
# Summarize with GPT
from transformers import AutoTokenizer, AutoModelForCausalLM

# Using GPT-2
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

# Use only the transcript text: `transcription` is the full pipeline result, and
# formatting it directly would paste its whole dict repr into the prompt.
video_description = f"""
This video contains the following scenes:
{'; '.join(unique_captions)}

The audio transcript is:
"{transcription['text']}"

Summarize what is likely happening in this video.
"""

# GPT-2 has a fixed context window, so prompt tokens plus generated tokens must
# fit inside it. Reserve room for the generation and truncate the prompt to the rest.
max_new_tokens = 300
max_prompt_tokens = gpt_model.config.n_positions - max_new_tokens
# Truncate from the left so the closing instruction at the end of the prompt survives.
gpt_tokenizer.truncation_side = "left"

# Encode input with attention mask
inputs = gpt_tokenizer(
    video_description,
    return_tensors="pt",
    truncation=True,
    max_length=max_prompt_tokens,
).to(device)
outputs = gpt_model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=max_new_tokens,
    do_sample=True,
    # GPT-2 defines no pad token; use EOS explicitly instead of relying on the
    # implicit fallback (and its warning).
    pad_token_id=gpt_tokenizer.eos_token_id,
)

In [None]:
# Decode the first generated sequence back to text (special tokens dropped) and print it.
print("🧠 GPT Summary:", gpt_tokenizer.decode(outputs[0], skip_special_tokens=True))
