In [7]:
!pip install transformers




In [8]:
!pip install torch torchvision torchaudio




In [9]:
!pip install --upgrade ipywidgets




In [10]:
#this is for only one image



from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Load the model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load an image
image_path = "E:\\cat.jpg"  # Change this to any image
image = Image.open(image_path).convert("RGB")

# Generate a caption
inputs = processor(image, return_tensors="pt")
output = model.generate(**inputs)
caption = processor.decode(output[0], skip_special_tokens=True)

print("Generated Caption:", caption)


Generated Caption: a man sitting at a table with a cat


In [11]:
!pip install colorama




In [14]:
!pip install googletrans==4.0.0-rc1


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,

In [24]:
import os
import cv2
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from googletrans import Translator

# Load BLIP Model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Initialize Translator
translator = Translator()

# Define paths
image_folder = "E:\\images"   # Folder with images
video_path = "E:\\videos.mp4"  # Path to video
frames_folder = "E:\\videos\\frames"
output_file = "captions_combined.txt"

# Ensure folders exist
os.makedirs(frames_folder, exist_ok=True)

# Function to generate captions for an image
def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(image, return_tensors="pt")
    output = model.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption

# Function to translate captions into multiple languages
def translate_caption(caption):
    languages = {
        "ta": "Tamil 🇮🇳", "kn": "Kannada 🇮🇳", "te": "Telugu 🇮🇳", 
        "ml": "Malayalam 🇮🇳", "hi": "Hindi 🇮🇳"
    }
    translated_captions = { "English": caption }
    
    for lang_code, lang_name in languages.items():
        translated_captions[lang_name] = translator.translate(caption, dest=lang_code).text
    
    return translated_captions

# Store captions
captions_list = []

### **📸 Process Images**
for image_name in os.listdir(image_folder):
    if image_name.lower().endswith((".jpg", ".png", ".jpeg")):
        image_path = os.path.join(image_folder, image_name)
        caption = generate_caption(image_path)
        translated_captions = translate_caption(caption)

        # Save captions
        captions_list.append(f"🖼️ {image_name}")
        for lang, trans_caption in translated_captions.items():
            captions_list.append(f"🌍 {lang}: {trans_caption}")
        captions_list.append("-" * 50)

### **🎥 Process Video Frames**
cap = cv2.VideoCapture(video_path)
fps = int(cap.get(cv2.CAP_PROP_FPS))  # Get FPS
frame_interval = 5 * fps  # Capture every 5 seconds

frame_count = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Extract frame at intervals
    if frame_count % frame_interval == 0:
        frame_filename = os.path.join(frames_folder, f"frame_{frame_count}.jpg")
        cv2.imwrite(frame_filename, frame)

        caption = generate_caption(frame_filename)
        translated_captions = translate_caption(caption)

        # Convert frame timestamp
        time_stamp = frame_count // fps
        minutes, seconds = divmod(time_stamp, 60)
        time_format = f"{minutes:02d}:{seconds:02d}"

        # Save captions
        captions_list.append(f"🕒 [{time_format}]")
        for lang, trans_caption in translated_captions.items():
            captions_list.append(f"🌍 {lang}: {trans_caption}")
        captions_list.append("-" * 50)

    frame_count += 1

cap.release()

# Save captions to file and print in terminal
with open(output_file, "w", encoding="utf-8") as f:
    for caption in captions_list:
        print(caption)  # ✅ Print in terminal
        f.write(caption + "\n")

print(f"\n✅ Captions saved in **{output_file}**")



🖼️ eight.jpg
🌍 English: a snowy road in the mountains
🌍 Tamil 🇮🇳: மலைகளில் ஒரு பனி சாலை
🌍 Kannada 🇮🇳: ಪರ್ವತಗಳಲ್ಲಿ ಹಿಮಭರಿತ ರಸ್ತೆ
🌍 Telugu 🇮🇳: పర్వతాలలో మంచుతో కూడిన రహదారి
🌍 Malayalam 🇮🇳: പർവതങ്ങളിലെ മഞ്ഞുവീഴ്ചയുള്ള റോഡ്
🌍 Hindi 🇮🇳: पहाड़ों में एक बर्फीली सड़क
--------------------------------------------------
🖼️ five.jpg
🌍 English: a bird sitting on a wall in the rain
🌍 Tamil 🇮🇳: மழையில் ஒரு சுவரில் அமர்ந்திருக்கும் பறவை
🌍 Kannada 🇮🇳: ಮಳೆಯಲ್ಲಿ ಗೋಡೆಯ ಮೇಲೆ ಕುಳಿತಿದ್ದ ಹಕ್ಕಿ
🌍 Telugu 🇮🇳: వర్షంలో గోడపై కూర్చున్న పక్షి
🌍 Malayalam 🇮🇳: മഴയിൽ ഒരു മതിൽ ഇരിക്കുന്ന പക്ഷി
🌍 Hindi 🇮🇳: बारिश में एक दीवार पर बैठा एक पक्षी
--------------------------------------------------
🖼️ four.jpg
🌍 English: a couple sitting on a bench looking out at the ocean
🌍 Tamil 🇮🇳: ஒரு பெஞ்சில் உட்கார்ந்திருக்கும் ஒரு ஜோடி கடலுக்கு வெளியே
🌍 Kannada 🇮🇳: ದಂಪತಿಗಳು ಸಾಗರವನ್ನು ನೋಡುತ್ತಿರುವ ಬೆಂಚ್ ಮೇಲೆ ಕುಳಿತಿದ್ದಾರೆ
🌍 Telugu 🇮🇳: ఒక జంట సముద్రం వైపు చూస్తున్న బెంచ్ మీద కూర్చున్నారు
🌍 Malayalam 🇮🇳: സമുദ്രത്തിൽ നോക്കുമ്പോൾ ഒരു ബെഞ്ചിൽ ഇര

In [28]:
import os
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import time

# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# User selects content type
print("\nSelect input type:")
print("1️⃣ Images")
print("2️⃣ Videos")
choice = input("Enter your choice (1/2): ")

# User selects language
languages = {
    "1": "English",
    "2": "Tamil",
    "3": "Kannada",
    "4": "Telugu",
    "5": "Malayalam",
    "6": "Hindi",
}
print("\nSelect a language:")
for key, lang in languages.items():
    print(f"{key}. {lang}")

lang_choice = input("\nEnter your language choice: ")

# Set up paths
image_folder = "images"
video_folder = "videos"
frames_folder = "frames"
output_file = "captions_combined.txt"
os.makedirs(frames_folder, exist_ok=True)  # Ensure frames folder exists

captions_list = []

# Process images
if choice == "1":
    print("\n🔹 Processing Images...\n")
    for image_name in os.listdir(image_folder):
        if image_name.lower().endswith((".jpg", ".png", ".jpeg")):
            image_path = os.path.join(image_folder, image_name)
            try:
                image = Image.open(image_path).convert("RGB")
                inputs = processor(image, return_tensors="pt")
                output = model.generate(**inputs)
                caption = processor.decode(output[0], skip_special_tokens=True)
                captions_list.append(f"{image_name}: {caption}")
                print(f"✅ {image_name}: {caption}")
            except Exception as e:
                print(f"❌ Error processing {image_name}: {e}")

# Process videos (Extract frames every 5 seconds)
elif choice == "2":
    print("\n🔹 Processing Video...\n")
    for video_name in os.listdir(video_folder):
        if video_name.lower().endswith((".mp4", ".avi", ".mov", ".mkv")):
            video_path = os.path.join(video_folder, video_name)
            cap = cv2.VideoCapture(video_path)
            fps = int(cap.get(cv2.CAP_PROP_FPS))
            frame_interval = 5 * fps  # Capture every 5 seconds

            frame_count = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_interval == 0:
                    frame_filename = os.path.join(frames_folder, f"frame_{frame_count}.jpg")
                    cv2.imwrite(frame_filename, frame)
                    print(f"📸 Extracted Frame: {frame_filename}")

                    # Generate caption for frame
                    try:
                        image = Image.open(frame_filename).convert("RGB")
                        inputs = processor(image, return_tensors="pt")
                        output = model.generate(**inputs)
                        caption = processor.decode(output[0], skip_special_tokens=True)
                        captions_list.append(f"{frame_filename}: {caption}")
                        print(f"✅ Caption: {caption}")
                    except Exception as e:
                        print(f"❌ Error processing {frame_filename}: {e}")

                frame_count += 1
            cap.release()

# Save captions
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n".join(captions_list))

print(f"\n✅ Captions saved in **{output_file}**")



Select input type:
1️⃣ Images
2️⃣ Videos


Enter your choice (1/2):  2



Select a language:
1. English
2. Tamil
3. Kannada
4. Telugu
5. Malayalam
6. Hindi



Enter your language choice:  1,3



🔹 Processing Video...


✅ Captions saved in **captions_combined.txt**


In [30]:
import os
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Load model
print("Loading BLIP model...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
print("Model loaded successfully!")

# Check images folder
image_folder = "E:\\images"
if not os.path.exists(image_folder):
    print(f"Error: Folder '{image_folder}' not found!")
else:
    print(f"✅ Found images folder: {image_folder}")

# List available images
image_files = [f for f in os.listdir(image_folder) if f.lower().endswith((".jpg", ".png", ".jpeg"))]
print(f"📂 Found {len(image_files)} images: {image_files}")

# Process first image for testing
if image_files:
    image_path = os.path.join(image_folder, image_files[0])
    print(f"🖼 Processing image: {image_path}")
    image = Image.open(image_path).convert("RGB")
    inputs = processor(image, return_tensors="pt")
    output = model.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)
    print(f"✅ Caption: {caption}")
else:
    print("❌ No images found!")


Loading BLIP model...
Model loaded successfully!
✅ Found images folder: E:\images
📂 Found 9 images: ['eight.jpg', 'five.jpg', 'four.jpg', 'nine.jpg', 'one.jpg', 'seven.jpg', 'six.jpg', 'ten.jpg', 'three.jpg']
🖼 Processing image: E:\images\eight.jpg
✅ Caption: a snowy road in the mountains


In [32]:
import os
print(os.listdir("videos"))  # Should list your video files
import cv2
cap = cv2.VideoCapture("E:\\videos/sample.mp4")
if not cap.isOpened():
    print("Error: Cannot open video file!")
else:
    print("Video opened successfully!")


['.ipynb_checkpoints', 'Captures', 'desktop.ini', 'practice1.ipynb', 'Screen Recordings', 'Shortcut to Videos (OneDrive - Personal).lnk']
Error: Cannot open video file!
