In [1]:
!pip install transformers




In [2]:
!pip install torch torchvision torchaudio




In [3]:
!pip install --upgrade ipywidgets




In [4]:
!pip install colorama




In [5]:
!pip install googletrans==4.0.0-rc1




In [6]:
import os
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from googletrans import Translator

# ---------------------------------------------------------------------------
# Model setup: the BLIP processor/model pair generates the English captions,
# and the googletrans client translates them afterwards.
# ---------------------------------------------------------------------------
print("🔄 Loading BLIP model...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
translator = Translator()
print("✅ Model loaded successfully!")

# Input locations and the output file that collects every caption.
image_folder = "E:\\images"
video_folder = "E:\\videos\\frames"
output_file = "captions_combined.txt"

# Menu key ("1".."6") -> (display name, language code passed to the translator).
languages = {
    str(position): entry
    for position, entry in enumerate(
        [
            ("English", "en"),
            ("Tamil", "ta"),
            ("Kannada", "kn"),
            ("Telugu", "te"),
            ("Malayalam", "ml"),
            ("Hindi", "hi"),
        ],
        start=1,
    )
}

# Show the language menu, one "<key>. <name>" line per entry.
print("\n📌 Choose your language:")
print("\n".join(f"{menu_key}. {entry[0]}" for menu_key, entry in languages.items()))

# The answer is a comma-separated list of menu keys; unknown keys are ignored.
lang_choice = input("\nEnter your choice (e.g., 1,3 for multiple languages): ").split(",")
selected_languages = [
    languages[token]
    for token in map(str.strip, lang_choice)
    if token in languages
]

# Nothing valid was entered: fall back to English only.
if len(selected_languages) == 0:
    print("❌ Invalid language selection! Defaulting to English.")
    selected_languages = [("English", "en")]

# Ask which media types to caption.
print(
    "\n📌 Choose what to process:",
    "1. Images only",
    "2. Videos only",
    "3. Both Images and Videos",
    sep="\n",
)

process_choice = input("Enter your choice (1/2/3): ").strip()
if process_choice not in ("1", "2", "3"):
    # Any unrecognised answer means "process everything".
    print("❌ Invalid choice! Defaulting to 'Both Images and Videos'.")
    process_choice = "3"

# Accumulates one formatted caption entry per processed file.
captions_list = []

def translate_caption(text, selected_languages):
    """Return the caption rendered in every requested language.

    Args:
        text: The English caption to translate.
        selected_languages: Iterable of ``(language name, language code)`` pairs.

    Returns:
        A dict mapping each language name to its caption. The ``"en"`` code
        maps to ``text`` unchanged; any other code is sent through the
        module-level ``translator``. If translating raises, a warning is
        printed and ``text`` is used for that language instead.
    """
    results = {}
    for name, code in selected_languages:
        try:
            # English needs no round trip through the translator.
            results[name] = text if code == "en" else translator.translate(text, dest=code).text
        except Exception as err:
            print(f"⚠ Translation failed for {name}: {err}")
            # Keep the untranslated caption so the language still has an entry.
            results[name] = text
    return results

# ** Process Images **
# For every image in image_folder: generate an English caption with BLIP,
# translate it into the selected languages, print it and queue it for saving.
if process_choice in ["1", "3"]:
    # isdir rather than exists: a regular file at this path would otherwise
    # pass the check and make os.listdir raise.
    if not os.path.isdir(image_folder):
        print(f"❌ Image folder '{image_folder}' not found!")
    else:
        print(f"\n✅ Found images folder: {image_folder}")
        # os.listdir returns entries in arbitrary order; sort so the caption
        # order is the same on every run.
        image_files = sorted(
            f for f in os.listdir(image_folder) if f.lower().endswith((".jpg", ".png", ".jpeg"))
        )

        if not image_files:
            print("⚠ No images found in the folder!")

        for image_name in image_files:
            image_path = os.path.join(image_folder, image_name)
            try:
                # The context manager closes the file handle deterministically;
                # convert() returns an independent RGB copy that outlives it.
                with Image.open(image_path) as raw_image:
                    image = raw_image.convert("RGB")

                inputs = processor(image, return_tensors="pt")
                output = model.generate(**inputs)
                caption_en = processor.decode(output[0], skip_special_tokens=True)

                translated_captions = translate_caption(caption_en, selected_languages)

                # One header line with the file name, then one line per language.
                caption_entry = f"🖼 {image_name}:\n" + "\n".join(
                    [f"🌍 {lang_name}: {translated_text}" for lang_name, translated_text in translated_captions.items()]
                ) + "\n"

                captions_list.append(caption_entry)
                print(caption_entry)

            except Exception as e:
                # A single unreadable image must not abort the whole batch.
                print(f"❌ Error processing {image_name}: {e}")

# ** Process Videos **
# For every video in video_folder: grab the middle frame, caption it with BLIP,
# translate the caption, print it and queue it for saving.
if process_choice in ["2", "3"]:
    # isdir rather than exists: a regular file at this path would otherwise
    # pass the check and make os.listdir raise.
    if not os.path.isdir(video_folder):
        print(f"❌ Video folder '{video_folder}' not found!")
    else:
        print(f"\n✅ Found videos folder: {video_folder}")
        # os.listdir returns entries in arbitrary order; sort so the caption
        # order is the same on every run.
        video_files = sorted(
            f for f in os.listdir(video_folder) if f.lower().endswith((".mp4", ".avi", ".mov"))
        )

        if not video_files:
            print("⚠ No videos found in the folder!")

        for video_name in video_files:
            video_path = os.path.join(video_folder, video_name)

            try:
                cap = cv2.VideoCapture(video_path)
                try:
                    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

                    # A non-positive frame count means the file could not be
                    # opened or its length is unknown.
                    if total_frames <= 0:
                        print(f"❌ Cannot read video: {video_name}")
                        continue

                    middle_frame = total_frames // 2  # Extract middle frame
                    cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame)

                    ret, frame = cap.read()
                finally:
                    # Release the capture on every path, including the early
                    # `continue` above and any exception.
                    cap.release()

                if not ret:
                    print(f"❌ Failed to extract frame from {video_name}")
                    continue

                # OpenCV delivers BGR; convert to RGB before building the PIL image.
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                inputs = processor(image, return_tensors="pt")
                output = model.generate(**inputs)
                caption_en = processor.decode(output[0], skip_special_tokens=True)

                translated_captions = translate_caption(caption_en, selected_languages)

                # One header line with the file name, then one line per language.
                caption_entry = f"🎥 {video_name}:\n" + "\n".join(
                    [f"🌍 {lang_name}: {translated_text}" for lang_name, translated_text in translated_captions.items()]
                ) + "\n"

                captions_list.append(caption_entry)
                print(caption_entry)

            except Exception as e:
                # Mirror the image loop: one bad video must not abort the run
                # and discard the captions collected so far.
                print(f"❌ Error processing {video_name}: {e}")

# ** Save Captions to File **
# Every entry is written followed by one extra newline, which leaves a blank
# line between consecutive entries in the file.
with open(output_file, "w", encoding="utf-8") as captions_out:
    captions_out.writelines(entry + "\n" for entry in captions_list)

print(f"\n✅ Captions saved in **{output_file}**")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


🔄 Loading BLIP model...
✅ Model loaded successfully!

📌 Choose your language:
1. English
2. Tamil
3. Kannada
4. Telugu
5. Malayalam
6. Hindi



Enter your choice (e.g., 1,3 for multiple languages):  4



📌 Choose what to process:
1. Images only
2. Videos only
3. Both Images and Videos


Enter your choice (1/2/3):  1



✅ Found images folder: E:\images
🖼 eight.jpg:
🌍 Telugu: పర్వతాలలో మంచుతో కూడిన రహదారి

🖼 five.jpg:
🌍 Telugu: వర్షంలో గోడపై కూర్చున్న పక్షి

🖼 four.jpg:
🌍 Telugu: ఒక జంట సముద్రం వైపు చూస్తున్న బెంచ్ మీద కూర్చున్నారు

🖼 nine.jpg:
🌍 Telugu: ఒక వ్యక్తి పర్వతాలలో ఒక మార్గం పైకి నడుస్తున్నాడు

🖼 one.jpg:
🌍 Telugu: చెట్టు స్టంప్ మీద ఒక పక్షి

🖼 seven.jpg:
🌍 Telugu: సముద్రం మీద ప్రకాశవంతమైన నారింజ సూర్యాస్తమయం

🖼 six.jpg:
🌍 Telugu: ఒక మహిళ గదిలో కుర్చీలో కూర్చుని

🖼 ten.jpg:
🌍 Telugu: దానిలో ఒక పువ్వుతో ఒక కప్పు

🖼 three.jpg:
🌍 Telugu: రెండు హంసలు నీటిలో ఈత కొడుతున్నాయి


✅ Captions saved in **captions_combined.txt**
