# ***Installation***

In [1]:
!pip install --upgrade --quiet google-genai

In [2]:
!pip install SpeechRecognition
!pip install opencv-python speechrecognition numpy moviepy

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.13.0-py3-none-any.whl.metadata (30 kB)
Downloading SpeechRecognition-3.13.0-py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.13.0


# ***Main Code***

In [5]:
import google.generativeai as genai
from google.colab import files
import speech_recognition as sr
import moviepy.editor as mp
from PIL import Image
import requests
import cv2
import os
from google.colab import userdata


# Read the Gemini API key from the Colab `userdata` store under the name
# 'GEMINI_API_KEY' so the key never appears in the notebook itself.
API_KEY = userdata.get('GEMINI_API_KEY')
genai.configure(api_key=API_KEY)
# Shared model handle used for both frame descriptions and follow-up questions.
# Despite the variable name, this is a Gemini model, not a GPT model.
# NOTE(review): the model id looks like a dated experimental "thinking" release —
# confirm it is still served before relying on it.
gpt_model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp-1219')

def _download_video(url, destination, timeout=60):
    """Download ``url`` into the local file ``destination`` and return that path."""
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as a "video".
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)
    return destination


def _transcribe_audio(file_path, audio_path="audio.wav"):
    """Best-effort transcript of the video's audio track; "No audio" on any failure."""
    try:
        video_clip = mp.VideoFileClip(file_path)
        try:
            if not video_clip.audio:
                return "No audio"
            video_clip.audio.write_audiofile(audio_path)
        finally:
            # Release the reader processes/handles held by the clip.
            video_clip.close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_path) as source:
            audio_data = recognizer.record(source)
        return recognizer.recognize_google(audio_data)
    except Exception as e:
        # Transcription is optional: report why it was skipped, keep going.
        print(f"Audio transcription skipped: {e!r}")
        return "No audio"


def _describe_frames(video, total_frames, frame_rate, max_frames):
    """Ask the model to describe roughly one frame per second, up to ``max_frames``."""
    if total_frames <= 0:
        return []

    # A reported FPS of 0 would otherwise divide by zero; a clip shorter than one
    # second would otherwise yield no samples. In both cases describe the first frame.
    whole_seconds = total_frames // frame_rate if frame_rate > 0 else 0
    sample_count = min(max_frames, max(1, whole_seconds))
    frames_per_step = max(frame_rate, 0)

    descriptions = []
    for frame_num in range(sample_count):
        video.set(cv2.CAP_PROP_POS_FRAMES, frame_num * frames_per_step)
        success, frame = video.read()
        if not success:
            continue
        try:
            # OpenCV yields BGR arrays; PIL expects RGB.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            description = gpt_model.generate_content([
                "Describe this video frame:",
                image
            ]).text
            descriptions.append(description)
        except Exception as e:
            print(f"Frame {frame_num + 1} could not be described: {e!r}")
            descriptions.append("Error processing frame")
    return descriptions


def process_video(file_path, max_frames=5):
    """Analyse a video: basic metadata, sampled frame descriptions and audio transcript.

    Args:
        file_path: Local path to a video file, or a URL starting with "http".
            A URL is downloaded to "downloaded_video.mp4", which is removed
            again before returning.
        max_frames: Upper bound on the number of frames sent to the model
            (one frame per second of video, starting at frame 0).

    Returns:
        On success, a dict with the keys "total_frames", "frame_rate", "width",
        "height", "frame_descriptions" (list of str) and "audio_text" (str,
        "No audio" when no transcript could be produced).
        On failure, a dict with the single key "error" holding the message.
    """
    video = None
    downloaded_path = None
    try:
        if file_path.startswith("http"):
            # Keep the URL intact for the request; only afterwards switch to the
            # local copy (the original overwrote the URL before fetching it).
            downloaded_path = "downloaded_video.mp4"
            file_path = _download_video(file_path, downloaded_path)

        video = cv2.VideoCapture(file_path)
        if not video.isOpened():
            return {"error": "Failed to load video"}

        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_rate = int(video.get(cv2.CAP_PROP_FPS))
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

        audio_text = _transcribe_audio(file_path)
        frame_descriptions = _describe_frames(video, total_frames, frame_rate, max_frames)

        return {
            "total_frames": total_frames,
            "frame_rate": frame_rate,
            "width": width,
            "height": height,
            "frame_descriptions": frame_descriptions,
            "audio_text": audio_text
        }
    except Exception as e:
        return {"error": str(e)}
    finally:
        # Runs on every exit path so neither the capture nor temp files leak.
        if video is not None:
            video.release()
        # Only delete the video if this call downloaded it; a user file that
        # happens to share the name is left alone.
        for leftover in ("audio.wav", downloaded_path):
            if leftover and os.path.exists(leftover):
                try:
                    os.remove(leftover)
                except OSError:
                    pass

def main():
    """Upload a video, print its analysis, then answer questions about it.

    Prompts for a file upload, runs ``process_video`` on the first uploaded
    file and prints the metadata, frame descriptions and transcript. It then
    loops reading questions from the user and sends each one, together with
    the analysis, to the model until the user enters 'exit'.
    """
    uploaded_file = files.upload()
    if not uploaded_file:
        print("No file uploaded.")
        return

    # The upload result is keyed by file name; only the first file is analysed.
    result = process_video(next(iter(uploaded_file)))
    if "error" in result:
        print(f"Error: {result['error']}")
        return

    print(f"Video Info: {result['total_frames']} frames, {result['frame_rate']} FPS, "
          f"Dimensions: {result['width']}x{result['height']}")
    print("\nFrame Descriptions:")
    for i, description in enumerate(result['frame_descriptions'], 1):
        print(f"Frame {i}: {description}")
    print("\nAudio Transcript:\n", result['audio_text'])

    # The analysis never changes between questions, so build this part once.
    video_context = (
        f"Video Info: {result['total_frames']} frames, {result['frame_rate']} FPS, "
        f"Dimensions: {result['width']}x{result['height']}. "
        f"Frame Descriptions: {' '.join(result['frame_descriptions'])} "
        f"Audio Transcript: {result['audio_text']}"
    )

    while True:
        query = input("\n Enter 'exit' to close programme : ").strip()
        # Compare case-insensitively, but send the question with its original
        # casing so names and quoted text reach the model unchanged.
        if query.lower() == 'exit':
            break
        if not query:
            # Nothing was asked; do not spend a model call on an empty question.
            continue

        prompt = f"{video_context} User Query: {query}"

        try:
            response = gpt_model.generate_content(prompt).text
            print("\nResponse:", response)
        except Exception as e:
            print(f"Error: {e}")

# Start the interactive session when this code runs as the top-level module.
if __name__ == "__main__":
    main()


Saving A Lone Traveler's Mysterious Discovery short part.mp4 to A Lone Traveler's Mysterious Discovery short part.mp4
MoviePy - Writing audio in audio.wav




MoviePy - Done.
Video Info: 688 frames, 30 FPS, Dimensions: 720x1280

Frame Descriptions:
Frame 1: The user wants me to describe a video frame. The frame contains the word "If" in the center of an otherwise completely black background. I should describe the text and the background.
The video frame is entirely black except for the word "If" displayed in the center. The text is in a simple, sans-serif font and is a light gray color, contrasting against the dark background.
Frame 2: The user wants a description of the video frame. I need to describe the scene, the objects, and the people/animals present in the frame. I will pay attention to the details and try to provide a comprehensive description.
The video frame shows a person standing with a dog in what appears to be an abandoned or unfinished building. The environment is somewhat rough and industrial.

In the foreground, the concrete floor is visible with some cracks and uneven surfaces.

In the middle ground, a person with dark hair