In [1]:
import torch  
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
!pip install PIL
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar
import time
import pyttsx3  # For reading out the story
from googletrans import Translator  # For translation

# Initialize the BLIP captioning model and processor
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load the GPT-2 model for story generation
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Initialize the text-to-speech engine
tts_engine = pyttsx3.init()

# Initialize the translator
translator = Translator()

# Function to load and preprocess an image
def load_image(path):
    image = Image.open(path).convert("RGB")
    return image

# Generate a caption for the image
def generate_caption(image):
    inputs = blip_processor(image, return_tensors="pt")
    caption_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    return caption

# Generate a story based on the caption
def generate_story(caption, max_length=100):
    prompt = f"{caption}. Once upon a time, "
    inputs = gpt_tokenizer.encode(prompt, return_tensors="pt")
    outputs = gpt_model.generate(
        inputs, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, temperature=0.7
    )
    story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story

# Translate the story to Kannada
def translate_to_kannada(text):
    translation = translator.translate(text, src="en", dest="kn")
    return translation.text

# Main function to create a story from an image
def image_to_story(path):
    start_time = time.time()  # Start timing
    image = load_image(path)
    caption = generate_caption(image)
    story = generate_story(caption)
    end_time = time.time()  # End timing

    # Calculate efficiency metrics
    time_taken = end_time - start_time
    story_length = len(story.split())  # Number of words in the story
    efficiency = story_length / time_taken if time_taken > 0 else 0

    return caption, story, time_taken, efficiency

# Function to handle the image upload and display the story
def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        # Display the selected image
        img = Image.open(file_path).resize((300, 300))
        img = ImageTk.PhotoImage(img)
        image_label.config(image=img)
        image_label.image = img

        # Generate caption, story, and efficiency metrics
        caption, story, time_taken, efficiency = image_to_story(file_path)

        # Translate the story to Kannada
        story_in_kannada = translate_to_kannada(story)

        # Display the caption and story
        caption_label.config(text=f"Caption: {caption}")
        story_text.delete("1.0", tk.END)
        story_text.insert(tk.END, story)

        # Display the translated story
        kannada_text.delete("1.0", tk.END)
        kannada_text.insert(tk.END, story_in_kannada)

        # Display efficiency metrics
        efficiency_label.config(
            text=f"Time Taken: {time_taken:.2f} seconds\nStory Length: {len(story.split())} words\nEfficiency: {efficiency:.2f} words/second"
        )

# Function to read out the English story
def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        tts_engine.say(story)
        tts_engine.runAndWait()

# Function to read out the Kannada story
def read_kannada_story():
    story_in_kannada = kannada_text.get("1.0", tk.END).strip()
    if story_in_kannada:
        tts_engine.say(story_in_kannada)
        tts_engine.runAndWait()

# Set up the GUI window
window = tk.Tk()
window.title("Image to Story Generator with Kannada Translation")
window.geometry("600x800")

# Create a scrollable canvas
main_frame = tk.Frame(window)
main_frame.pack(fill="both", expand=True)

canvas = tk.Canvas(main_frame)
scrollbar = Scrollbar(main_frame, orient="vertical", command=canvas.yview)
scrollable_frame = tk.Frame(canvas)

scrollable_frame.bind(
    "<Configure>",
    lambda e: canvas.configure(scrollregion=canvas.bbox("all"))
)

canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)

scrollbar.pack(side="right", fill="y")
canvas.pack(side="left", fill="both", expand=True)

# Add widgets to the scrollable frame
image_label = Label(scrollable_frame)
image_label.pack(pady=10)

upload_button = Button(scrollable_frame, text="Upload Image", command=upload_image)
upload_button.pack()

caption_label = Label(scrollable_frame, text="Caption: ", wraplength=400, justify="center")
caption_label.pack(pady=10)

story_text = Text(scrollable_frame, wrap="word", width=60, height=10)
story_text.pack(pady=10)

kannada_text = Text(scrollable_frame, wrap="word", width=60, height=10)
kannada_text.pack(pady=10)

efficiency_label = Label(scrollable_frame, text="Efficiency Metrics: ", wraplength=400, justify="center")
efficiency_label.pack(pady=10)

read_english_button = Button(scrollable_frame, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(scrollable_frame, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

# Run the GUI loop
window.mainloop()

ModuleNotFoundError: No module named 'torch'

In [8]:

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar
import time
import pyttsx3
from deep_translator import GoogleTranslator

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

tts_engine = pyttsx3.init()
translator = GoogleTranslator(source="en", target="kn")

# Load and preprocess image
def load_image(path):
    image = Image.open(path).convert("RGB")
    return image

# Generate caption for the image
def generate_caption(image):
    inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    return caption

# Generate a story based on an enriched prompt
def generate_story(caption, max_length=150):
    # Enrich the caption for better story context
    enriched_prompt = f"The image shows {caption}. In this scene, "
    enriched_prompt += "a magical story unfolds involving unexpected events, emotions, and resolutions."
    inputs = gpt_tokenizer.encode(enriched_prompt, return_tensors="pt", padding=True).to(device)

    outputs = gpt_model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=0.7,
    )
    story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story

# Translate the story to Kannada
def translate_to_kannada(text):
    return translator.translate(text)

# Main function to generate caption, story, and efficiency metrics
def image_to_story(path):
    start_time = time.time()
    image = load_image(path)
    caption = generate_caption(image)
    story = generate_story(caption)
    time_taken = time.time() - start_time
    story_length = len(story.split())
    efficiency = story_length / time_taken if time_taken > 0 else 0
    return caption, story, time_taken, efficiency

# Handle the upload image process
def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        img = Image.open(file_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        caption, story, time_taken, efficiency = image_to_story(file_path)
        kannada_story = translate_to_kannada(story)

        caption_label.config(text=f"Caption: {caption}")
        story_text.delete("1.0", tk.END)
        story_text.insert(tk.END, story)
        kannada_text.delete("1.0", tk.END)
        kannada_text.insert(tk.END, kannada_story)
        efficiency_label.config(
            text=f"Time Taken: {time_taken:.2f}s\nStory Length: {len(story.split())} words\nEfficiency: {efficiency:.2f} words/s"
        )

# Read English story
def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        tts_engine.say(story)
        tts_engine.runAndWait()

# Read Kannada story
def read_kannada_story():
    kannada_story = kannada_text.get("1.0", tk.END).strip()
    if kannada_story:
        tts_engine.say(kannada_story)
        tts_engine.runAndWait()

# GUI Setup
window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("600x800")

main_frame = tk.Frame(window)
main_frame.pack(fill="both", expand=True)

canvas = tk.Canvas(main_frame)
scrollbar = Scrollbar(main_frame, orient="vertical", command=canvas.yview)
scrollable_frame = tk.Frame(canvas)

scrollable_frame.bind(
    "<Configure>",
    lambda e: canvas.configure(scrollregion=canvas.bbox("all"))
)

canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)

scrollbar.pack(side="right", fill="y")
canvas.pack(side="left", fill="both", expand=True)

image_label = Label(scrollable_frame)
image_label.pack(pady=10)

upload_button = Button(scrollable_frame, text="Upload Image", command=upload_image)
upload_button.pack()

caption_label = Label(scrollable_frame, text="Caption: ", wraplength=400, justify="center")
caption_label.pack(pady=10)

story_text = Text(scrollable_frame, wrap="word", width=60, height=10)
story_text.pack(pady=10)

kannada_text = Text(scrollable_frame, wrap="word", width=60, height=10)
kannada_text.pack(pady=10)

efficiency_label = Label(scrollable_frame, text="Efficiency Metrics: ", wraplength=400, justify="center")
efficiency_label.pack(pady=10)

read_english_button = Button(scrollable_frame, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(scrollable_frame, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

window.mainloop()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [1]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar
import time
import pyttsx3
from deep_translator import GoogleTranslator
import sys
import os
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

translator = GoogleTranslator(source="en", target="kn")

tts_engine = pyttsx3.init()

def speak_kannada(text, filename="output.mp3"):
    tts = gTTS(text=text, lang="kn")
    tts.save(filename)
    audio = AudioSegment.from_file(filename, format="mp3")
    play(audio)

# Load and preprocess image
def load_image(path):
    image = Image.open(path).convert("RGB")
    return image

# Generate caption for the image
def generate_caption(image):
    inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    return caption

# Generate a story based on an enriched prompt
def generate_story(caption, max_length=150):
    enriched_prompt = f"The image shows {caption}. In this scene, a magical story unfolds involving unexpected events, emotions, and resolutions."
    inputs = gpt_tokenizer.encode(enriched_prompt, return_tensors="pt", padding=True).to(device)
    
    outputs = gpt_model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=0.7,
    )
    story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story

# Translate the story to Kannada
def translate_to_kannada(text):
    return translator.translate(text)

# Main function to generate caption, story, and efficiency metrics
def image_to_story(path):
    start_time = time.time()
    image = load_image(path)
    caption = generate_caption(image)
    story = generate_story(caption)
    time_taken = time.time() - start_time
    story_length = len(story.split())
    efficiency = story_length / time_taken if time_taken > 0 else 0
    return caption, story, time_taken, efficiency

# Handle the upload image process
def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        img = Image.open(file_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        caption, story, time_taken, efficiency = image_to_story(file_path)
        kannada_story = translate_to_kannada(story)

        caption_label.config(text=f"Caption: {caption}")
        story_text.delete("1.0", tk.END)
        story_text.insert(tk.END, story)
        kannada_text.delete("1.0", tk.END)
        kannada_text.insert(tk.END, kannada_story)
        efficiency_label.config(
            text=f"Time Taken: {time_taken:.2f}s\nStory Length: {len(story.split())} words\nEfficiency: {efficiency:.2f} words/s"
        )

# Read English story
def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        tts_engine.say(story)
        tts_engine.runAndWait()

# Read Kannada story
def read_kannada_story():
    kannada_story = kannada_text.get("1.0", tk.END).strip()
    if kannada_story:
        speak_kannada(kannada_story)

# GUI Setup
window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("600x800")

main_frame = tk.Frame(window)
main_frame.pack(fill="both", expand=True)

canvas = tk.Canvas(main_frame)
scrollbar = Scrollbar(main_frame, orient="vertical", command=canvas.yview)
scrollable_frame = tk.Frame(canvas)

scrollable_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)

scrollbar.pack(side="right", fill="y")
canvas.pack(side="left", fill="both", expand=True)

image_label = Label(scrollable_frame)
image_label.pack(pady=10)

upload_button = Button(scrollable_frame, text="Upload Image", command=upload_image)
upload_button.pack()

caption_label = Label(scrollable_frame, text="Caption: ", wraplength=400, justify="center")
caption_label.pack(pady=10)

story_text = Text(scrollable_frame, wrap="word", width=60, height=10)
story_text.pack(pady=10)

kannada_text = Text(scrollable_frame, wrap="word", width=60, height=10)
kannada_text.pack(pady=10)

efficiency_label = Label(scrollable_frame, text="Efficiency Metrics: ", wraplength=400, justify="center")
efficiency_label.pack(pady=10)

read_english_button = Button(scrollable_frame, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(scrollable_frame, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

window.mainloop()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [31]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar, Listbox
import time
import pyttsx3
from deep_translator import GoogleTranslator
import random
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

translator = GoogleTranslator(source="en", target="kn")
tts_engine = pyttsx3.init()

# Dictionary to store efficiency metrics
efficiency_metrics = {
    "image_processing_time": 0,
    "story_generation_time": 0,
    "translation_time": 0,
    "audio_generation_time": 0
}

def update_efficiency_metrics():
    """
    Updates efficiency metrics on the GUI.
    """
    metrics_text = (
        f"📸 Image Processing: {efficiency_metrics['image_processing_time']:.2f} sec\n"
        f"📖 Story Generation: {efficiency_metrics['story_generation_time']:.2f} sec\n"
        f"🌍 Translation: {efficiency_metrics['translation_time']:.2f} sec\n"
        f"🔊 Audio Generation: {efficiency_metrics['audio_generation_time']:.2f} sec"
    )
    efficiency_label.config(text=metrics_text)

def speak_kannada(text, filename="output.mp3"):
    tts = gTTS(text=text, lang="kn")
    tts.save(filename)
    audio = AudioSegment.from_file(filename, format="mp3")
    play(audio)

def load_image(path):
    return Image.open(path).convert("RGB")

def generate_caption(image):
    start_time = time.time()
    inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    efficiency_metrics["image_processing_time"] = time.time() - start_time
    update_efficiency_metrics()
    return caption

def generate_story(caption, max_length=150):
    start_time = time.time()

    story_styles = [
        "A magical adventure unfolds where",
        "A suspenseful tale emerges involving",
        "A heartwarming story takes place as",
        "An unexpected journey begins when",
        "A dramatic event changes everything when",
        "A mysterious secret is revealed when",
        "A brave hero faces a great challenge when",
        "A thrilling discovery is made as",
        "A peaceful moment turns into an epic tale when",
        "A legendary event occurs as"
    ]
    random_prompt = random.choice(story_styles)
    enriched_prompt = f"The image shows {caption}. {random_prompt} unexpected events, emotions, and resolutions."

    inputs = gpt_tokenizer.encode_plus(enriched_prompt, return_tensors="pt", max_length=50, truncation=True).to(device)
    outputs = gpt_model.generate(
        inputs["input_ids"], max_length=max_length, num_return_sequences=1,
        no_repeat_ngram_size=3, temperature=1.2, top_k=50, top_p=0.90,
        pad_token_id=gpt_tokenizer.eos_token_id
    )

    story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    efficiency_metrics["story_generation_time"] = time.time() - start_time
    update_efficiency_metrics()
    return story

def generate_multiple_stories(caption, num_stories=10):
    return [generate_story(caption) for _ in range(num_stories)]

def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        img = Image.open(file_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        caption = generate_caption(load_image(file_path))
        stories = generate_multiple_stories(caption)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)

def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        start_time = time.time()
        tts_engine.say(story)
        tts_engine.runAndWait()
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

def read_kannada_story():
    kannada_story = kannada_text.get("1.0", tk.END).strip()
    if kannada_story:
        start_time = time.time()
        speak_kannada(kannada_story)
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

# GUI Setup
window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("900x800")

main_frame = tk.Frame(window)
main_frame.pack(fill="both", expand=True)

canvas = tk.Canvas(main_frame)
scrollbar = Scrollbar(main_frame, orient="vertical", command=canvas.yview)
scrollable_frame = tk.Frame(canvas)
scrollable_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)

scrollbar.pack(side="right", fill="y")
canvas.pack(side="left", fill="both", expand=True)

image_label = Label(scrollable_frame)
image_label.pack(pady=10)

upload_button = Button(scrollable_frame, text="Upload Image", command=upload_image)
upload_button.pack()

story_listbox = Listbox(scrollable_frame, height=10)
story_listbox.pack(pady=10)

story_text = Text(scrollable_frame, wrap="word", width=70, height=10)
story_text.pack(pady=10)

kannada_text = Text(scrollable_frame, wrap="word", width=70, height=10)
kannada_text.pack(pady=10)

# 🏆 Efficiency Metrics Display
efficiency_label = Label(scrollable_frame, text="Efficiency Metrics: ", wraplength=400, justify="center", font=("Arial", 12, "bold"))
efficiency_label.pack(pady=10)

read_english_button = Button(scrollable_frame, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(scrollable_frame, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

window.mainloop()


In [10]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar, Listbox
import time
import pyttsx3
from deep_translator import GoogleTranslator
import random
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import cv2

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

translator = GoogleTranslator(source="en", target="kn")
tts_engine = pyttsx3.init()

# Dictionary to store efficiency metrics
efficiency_metrics = {
    "image_processing_time": 0,
    "story_generation_time": 0,
    "translation_time": 0,
    "audio_generation_time": 0
}

def update_efficiency_metrics():
    """
    Updates efficiency metrics on the GUI.
    """
    metrics_text = (
        f"📸 Image Processing: {efficiency_metrics['image_processing_time']:.2f} sec\n"
        f"📖 Story Generation: {efficiency_metrics['story_generation_time']:.2f} sec\n"
        f"🌍 Translation: {efficiency_metrics['translation_time']:.2f} sec\n"
        f"🔊 Audio Generation: {efficiency_metrics['audio_generation_time']:.2f} sec"
    )
    efficiency_label.config(text=metrics_text)

def speak_kannada(text, filename="output.mp3"):
    tts = gTTS(text=text, lang="kn")
    tts.save(filename)
    audio = AudioSegment.from_file(filename, format="mp3")
    play(audio)

def load_image(path):
    return Image.open(path).convert("RGB")

def generate_caption(image):
    start_time = time.time()
    inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    efficiency_metrics["image_processing_time"] = time.time() - start_time
    update_efficiency_metrics()
    return caption

def generate_story(caption, max_length=150):
    start_time = time.time()

    story_styles = [
        "A magical adventure unfolds where",
        "A suspenseful tale emerges involving",
        "A heartwarming story takes place as",
        "An unexpected journey begins when",
        "A dramatic event changes everything when",
        "A mysterious secret is revealed when",
        "A brave hero faces a great challenge when",
        "A thrilling discovery is made as",
        "A peaceful moment turns into an epic tale when",
        "A legendary event occurs as"
    ]
    random_prompt = random.choice(story_styles)
    enriched_prompt = f"The image shows {caption}. {random_prompt} unexpected events, emotions, and resolutions."

    inputs = gpt_tokenizer.encode_plus(enriched_prompt, return_tensors="pt", max_length=50, truncation=True).to(device)
    outputs = gpt_model.generate(
        inputs["input_ids"], max_length=max_length, num_return_sequences=1,
        no_repeat_ngram_size=3, temperature=1.2, top_k=50, top_p=0.90,
        pad_token_id=gpt_tokenizer.eos_token_id
    )

    story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    efficiency_metrics["story_generation_time"] = time.time() - start_time
    update_efficiency_metrics()
    return story

def generate_multiple_stories(caption, num_stories=10):
    return [generate_story(caption) for _ in range(num_stories)]

def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        img = Image.open(file_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        caption = generate_caption(load_image(file_path))
        caption_label.config(text=f"Caption: {caption}")
        stories = generate_multiple_stories(caption)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)
        
def capture_image():
    cap = cv2.VideoCapture(0)  # Open the webcam
    ret, frame = cap.read()  # Capture a frame
    cap.release()  # Release the webcam
    
    if ret:
        img_path = "captured_image.jpg"
        cv2.imwrite(img_path, frame)  # Save the captured image

        # Load and display the captured image
        img = Image.open(img_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        # Process the image
        caption = generate_caption(load_image(img_path))
        caption_label.config(text=f"Caption: {caption}")
        stories = generate_multiple_stories(caption)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)


def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        start_time = time.time()
        tts_engine.say(story)
        tts_engine.runAndWait()
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

def read_kannada_story():
    kannada_story = kannada_text.get("1.0", tk.END).strip()
    if kannada_story:
        start_time = time.time()
        speak_kannada(kannada_story)
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

# GUI Setup
window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("900x800")

main_frame = tk.Frame(window)
main_frame.pack(fill="both", expand=True)

canvas = tk.Canvas(main_frame)
scrollbar = Scrollbar(main_frame, orient="vertical", command=canvas.yview)
scrollable_frame = tk.Frame(canvas)
scrollable_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)

scrollbar.pack(side="right", fill="y")
canvas.pack(side="left", fill="both", expand=True)

image_label = Label(scrollable_frame)
image_label.pack(pady=10)

upload_button = Button(scrollable_frame, text="Upload Image", command=upload_image)
upload_button.pack()

capture_button = Button(scrollable_frame, text="Capture Image", command=capture_image)
capture_button.pack()

caption_label = Label(scrollable_frame, text="Caption: ", wraplength=400, justify="center")
caption_label.pack(pady=10)

story_listbox = Listbox(scrollable_frame, height=10)
story_listbox.pack(pady=10)

story_text = Text(scrollable_frame, wrap="word", width=70, height=10)
story_text.pack(pady=10)

kannada_text = Text(scrollable_frame, wrap="word", width=70, height=10)
kannada_text.pack(pady=10)

# 🏆 Efficiency Metrics Display
efficiency_label = Label(scrollable_frame, text="Efficiency Metrics: ", wraplength=400, justify="center", font=("Arial", 12, "bold"))
efficiency_label.pack(pady=10)

read_english_button = Button(scrollable_frame, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(scrollable_frame, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

window.mainloop()

In [15]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar, Listbox
import time
import pyttsx3
from deep_translator import GoogleTranslator
import random
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import cv2
import os 

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

translator = GoogleTranslator(source="en", target="kn")
tts_engine = pyttsx3.init()

# Dictionary to store efficiency metrics
efficiency_metrics = {
    "image_processing_time": 0,
    "story_generation_time": 0,
    "translation_time": 0,
    "audio_generation_time": 0
}

def update_efficiency_metrics():
    """
    Updates efficiency metrics on the GUI.
    """
    metrics_text = (
        f"📸 Image Processing: {efficiency_metrics['image_processing_time']:.2f} sec\n"
        f"📖 Story Generation: {efficiency_metrics['story_generation_time']:.2f} sec\n"
        f"🌍 Translation: {efficiency_metrics['translation_time']:.2f} sec\n"
        f"🔊 Audio Generation: {efficiency_metrics['audio_generation_time']:.2f} sec"
    )
    efficiency_label.config(text=metrics_text)

def speak_kannada(text, filename="output.mp3"):
    tts = gTTS(text=text, lang="kn")
    tts.save(filename)
    audio = AudioSegment.from_file(filename, format="mp3")
    play(audio)

def load_image(path):
    return Image.open(path).convert("RGB")

def generate_caption(image):
    start_time = time.time()
    inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    efficiency_metrics["image_processing_time"] = time.time() - start_time
    update_efficiency_metrics()
    return caption

def generate_story(caption, max_length=150):
    start_time = time.time()

    story_styles = [
        "A magical adventure unfolds where",
        "A suspenseful tale emerges involving",
        "A heartwarming story takes place as",
        "An unexpected journey begins when",
        "A dramatic event changes everything when",
        "A mysterious secret is revealed when",
        "A brave hero faces a great challenge when",
        "A thrilling discovery is made as",
        "A peaceful moment turns into an epic tale when",
        "A legendary event occurs as"
    ]
    random_prompt = random.choice(story_styles)
    enriched_prompt = f"The image shows {caption}. {random_prompt} unexpected events, emotions, and resolutions."

    inputs = gpt_tokenizer.encode_plus(enriched_prompt, return_tensors="pt", max_length=50, truncation=True).to(device)
    outputs = gpt_model.generate(
        inputs["input_ids"], max_length=max_length, num_return_sequences=1,
        no_repeat_ngram_size=3, temperature=1.2, top_k=50, top_p=0.90,
        pad_token_id=gpt_tokenizer.eos_token_id
    )

    story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    efficiency_metrics["story_generation_time"] = time.time() - start_time
    update_efficiency_metrics()
    return story

def generate_multiple_stories(caption, num_stories=10):
    return [generate_story(caption) for _ in range(num_stories)]

def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        img = Image.open(file_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        caption = generate_caption(load_image(file_path))
        caption_label.config(text=f"Caption: {caption}")
        stories = generate_multiple_stories(caption)

        print("Generated Stories:", stories)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)
        
        save_button = Button(scrollable_frame, text="Save All Stories", command=lambda: save_all_stories(stories))
        save_button.pack(pady=5)

def capture_image():
    cap = cv2.VideoCapture(0)  # Open the webcam
    ret, frame = cap.read()  # Capture a frame
    cap.release()  # Release the webcam
    
    if ret:
        img_path = "captured_image.jpg"
        cv2.imwrite(img_path, frame)  # Save the captured image

        # Load and display the captured image
        img = Image.open(img_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        # Process the image
        caption = generate_caption(load_image(img_path))
        caption_label.config(text=f"Caption: {caption}")
        stories = generate_multiple_stories(caption)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)


def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        start_time = time.time()
        tts_engine.say(story)
        tts_engine.runAndWait()
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

def read_kannada_story():
    kannada_story = kannada_text.get("1.0", tk.END).strip()
    if kannada_story:
        start_time = time.time()
        speak_kannada(kannada_story)
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

def save_story(story, kannada_story, story_index):
    """
    Saves the generated stories in one folder and audio files in another.
    """
    # Create directories to store the stories and audio files if they don't exist
    story_dir = "generated_stories"
    audio_dir = "generated_audio"
    os.makedirs(story_dir, exist_ok=True)
    os.makedirs(audio_dir, exist_ok=True)
    
    # Define file paths for text files
    english_text_path = os.path.join(story_dir, f"story_{story_index}_english.txt")
    kannada_text_path = os.path.join(story_dir, f"story_{story_index}_kannada.txt")
    
    # Define file paths for audio files
    english_audio_path = os.path.join(audio_dir, f"story_{story_index}_english.mp3")
    kannada_audio_path = os.path.join(audio_dir, f"story_{story_index}_kannada.mp3")
    
    # Save text files
    with open(english_text_path, "w", encoding="utf-8") as eng_file:
        eng_file.write(story)
    
    with open(kannada_text_path, "w", encoding="utf-8") as kan_file:
        kan_file.write(kannada_story)
    
    # Save audio files
    tts_eng = gTTS(text=story, lang="en")
    tts_eng.save(english_audio_path)
    
    tts_kan = gTTS(text=kannada_story, lang="kn")
    tts_kan.save(kannada_audio_path)

def save_all_stories(stories):
    """
    Saves all generated stories along with their Kannada translations and audio.
    """
    for i, story in enumerate(stories):
        start_time = time.time()
        kannada_story = translator.translate(story)
        efficiency_metrics["translation_time"] = time.time() - start_time
        update_efficiency_metrics()
        save_story(story, kannada_story, i + 1)


# GUI Setup
window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("900x800")

main_frame = tk.Frame(window)
main_frame.pack(fill="both", expand=True)

canvas = tk.Canvas(main_frame)
scrollbar = Scrollbar(main_frame, orient="vertical", command=canvas.yview)
scrollable_frame = tk.Frame(canvas)
scrollable_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)

scrollbar.pack(side="right", fill="y")
canvas.pack(side="left", fill="both", expand=True)

image_label = Label(scrollable_frame)
image_label.pack(pady=10)

upload_button = Button(scrollable_frame, text="Upload Image", command=upload_image)
upload_button.pack()

capture_button = Button(scrollable_frame, text="Capture Image", command=capture_image)
capture_button.pack()

caption_label = Label(scrollable_frame, text="Caption: ", wraplength=400, justify="center")
caption_label.pack(pady=10)

story_listbox = Listbox(scrollable_frame, height=10)
story_listbox.pack(pady=10)

story_text = Text(scrollable_frame, wrap="word", width=70, height=10)
story_text.pack(pady=10)

kannada_text = Text(scrollable_frame, wrap="word", width=70, height=10)
kannada_text.pack(pady=10)

# 🏆 Efficiency Metrics Display
efficiency_label = Label(scrollable_frame, text="Efficiency Metrics: ", wraplength=400, justify="center", font=("Arial", 12, "bold"))
efficiency_label.pack(pady=10)

read_english_button = Button(scrollable_frame, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(scrollable_frame, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

window.mainloop()

Generated Stories: ['The image shows a tiger in the woods. A heartwarming story takes place as unexpected events, emotions, and resolutions.\n\nThe story begins with a tiger named "Tiger" who is a member of the "Tigers of the Forest" (Tigers in the Forest). He is a young man who has been living in the forest for a long time. He is the only tiger in his family. He has been a member for over a year and has been in the forests for over two years. He was a member in the past and has always been a tiger. He had a heart of gold and was a tiger\'s protector. He wanted to be a tiger and was willing to sacrifice his life for the tiger.\n.\n', "The image shows a tiger in the woods. A dramatic event changes everything when unexpected events, emotions, and resolutions.\n\nThe tiger is a very powerful animal. It is a powerful animal that can be used to manipulate people. It can be a powerful creature that can manipulate people and people can be manipulated by it. It's a powerful beast that can cont

In [1]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar, Listbox
import time
import pyttsx3
from deep_translator import GoogleTranslator
import random
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import cv2
import os 

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

translator = GoogleTranslator(source="en", target="kn")
tts_engine = pyttsx3.init()

# Dictionary to store efficiency metrics
efficiency_metrics = {
    "image_processing_time": 0,
    "story_generation_time": 0,
    "translation_time": 0,
    "audio_generation_time": 0
}

def update_efficiency_metrics():
    """
    Updates efficiency metrics on the GUI.
    """
    metrics_text = (
        f"📸 Image Processing: {efficiency_metrics['image_processing_time']:.2f} sec\n"
        f"📖 Story Generation: {efficiency_metrics['story_generation_time']:.2f} sec\n"
        f"🌍 Translation: {efficiency_metrics['translation_time']:.2f} sec\n"
        f"🔊 Audio Generation: {efficiency_metrics['audio_generation_time']:.2f} sec"
    )
    efficiency_label.config(text=metrics_text)

def speak_kannada(text, filename="output.mp3"):
    tts = gTTS(text=text, lang="kn")
    tts.save(filename)
    audio = AudioSegment.from_file(filename, format="mp3")
    play(audio)

def load_image(path):
    return Image.open(path).convert("RGB")

def generate_caption(image):
    start_time = time.time()
    inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    efficiency_metrics["image_processing_time"] = time.time() - start_time
    update_efficiency_metrics()
    return caption

def generate_story(caption, max_length=150):
    start_time = time.time()

    story_styles = [
        "A magical adventure unfolds where",
        "A suspenseful tale emerges involving",
        "A heartwarming story takes place as",
        "An unexpected journey begins when",
        "A dramatic event changes everything when",
        "A mysterious secret is revealed when",
        "A brave hero faces a great challenge when",
        "A thrilling discovery is made as",
        "A peaceful moment turns into an epic tale when",
        "A legendary event occurs as"
    ]
    random_prompt = random.choice(story_styles)
    enriched_prompt = f"The image shows {caption}. {random_prompt} unexpected events, emotions, and resolutions."

    inputs = gpt_tokenizer.encode_plus(enriched_prompt, return_tensors="pt", max_length=50, truncation=True).to(device)
    outputs = gpt_model.generate(
        inputs["input_ids"], max_length=max_length, num_return_sequences=1,
        no_repeat_ngram_size=3, temperature=1.2, top_k=50, top_p=0.90,
        pad_token_id=gpt_tokenizer.eos_token_id
    )

    story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    efficiency_metrics["story_generation_time"] = time.time() - start_time
    update_efficiency_metrics()
    return story

def generate_multiple_stories(caption, num_stories=10):
    return [generate_story(caption) for _ in range(num_stories)]

def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        img = Image.open(file_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        caption = generate_caption(load_image(file_path))
        caption_label.config(text=f"Caption: {caption}")
        stories = generate_multiple_stories(caption)

        save_all_stories(stories)


        print("Generated Stories:", stories)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)
        

def capture_image():
    cap = cv2.VideoCapture(0)  # Open the webcam
    ret, frame = cap.read()  # Capture a frame
    cap.release()  # Release the webcam
    
    if ret:
        img_path = "captured_image.jpg"
        cv2.imwrite(img_path, frame)  # Save the captured image

        # Load and display the captured image
        img = Image.open(img_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        # Process the image
        caption = generate_caption(load_image(img_path))
        caption_label.config(text=f"Caption: {caption}")
        stories = generate_multiple_stories(caption)

        save_all_stories(stories)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)


def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        start_time = time.time()
        tts_engine.say(story)
        tts_engine.runAndWait()
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

def read_kannada_story():
    kannada_story = kannada_text.get("1.0", tk.END).strip()
    if kannada_story:
        start_time = time.time()
        speak_kannada(kannada_story)
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

def save_story(story, kannada_story, story_index):
    """
    Saves the generated stories (English & Kannada) in a single file,
    while storing the audio files separately.
    """
    # Create directories if they don't exist
    story_dir = "generated_stories"
    audio_dir = "generated_audio"
    os.makedirs(story_dir, exist_ok=True)
    os.makedirs(audio_dir, exist_ok=True)
    
    # Define file path for the text file (single file for both languages)
    story_text_path = os.path.join(story_dir, f"story_{story_index}.txt")
    
    # Define file paths for audio files
    english_audio_path = os.path.join(audio_dir, f"story_{story_index}_english.mp3")
    kannada_audio_path = os.path.join(audio_dir, f"story_{story_index}_kannada.mp3")
    
    # Save story (English & Kannada in a single file)
    with open(story_text_path, "w", encoding="utf-8") as file:
        file.write(f"📖 Story {story_index} (English):\n")
        file.write(story + "\n\n")
        file.write(f"🌍 Story {story_index} (Kannada):\n")
        file.write(kannada_story + "\n")
    
    # Save audio files
    tts_eng = gTTS(text=story, lang="en")
    tts_eng.save(english_audio_path)
    
    tts_kan = gTTS(text=kannada_story, lang="kn")
    tts_kan.save(kannada_audio_path)

    print(f"✅ Story {story_index} saved at {story_text_path}")
    print(f"🔊 English Audio: {english_audio_path}")
    print(f"🔊 Kannada Audio: {kannada_audio_path}")


def save_all_stories(stories):
    """
    Saves all generated stories with their Kannada translations in a single file per story.
    """
    for i, story in enumerate(stories):
        start_time = time.time()
        kannada_story = translator.translate(story)  # Translate to Kannada
        efficiency_metrics["translation_time"] = time.time() - start_time
        update_efficiency_metrics()
        save_story(story, kannada_story, i + 1)



# GUI Setup
window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("900x800")

main_frame = tk.Frame(window)
main_frame.pack(fill="both", expand=True)

canvas = tk.Canvas(main_frame)
scrollbar = Scrollbar(main_frame, orient="vertical", command=canvas.yview)
scrollable_frame = tk.Frame(canvas)
scrollable_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)

scrollbar.pack(side="right", fill="y")
canvas.pack(side="left", fill="both", expand=True)

image_label = Label(scrollable_frame)
image_label.pack(pady=10)

upload_button = Button(scrollable_frame, text="Upload Image", command=upload_image)
upload_button.pack()

capture_button = Button(scrollable_frame, text="Capture Image", command=capture_image)
capture_button.pack()

caption_label = Label(scrollable_frame, text="Caption: ", wraplength=400, justify="center")
caption_label.pack(pady=10)

story_listbox = Listbox(scrollable_frame, height=10)
story_listbox.pack(pady=10)

story_text = Text(scrollable_frame, wrap="word", width=70, height=10)
story_text.pack(pady=10)

kannada_text = Text(scrollable_frame, wrap="word", width=70, height=10)
kannada_text.pack(pady=10)

# 🏆 Efficiency Metrics Display
efficiency_label = Label(scrollable_frame, text="Efficiency Metrics: ", wraplength=400, justify="center", font=("Arial", 12, "bold"))
efficiency_label.pack(pady=10)

read_english_button = Button(scrollable_frame, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(scrollable_frame, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

window.mainloop()



In [None]:
import torch
import numpy as np
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, messagebox, Scrollbar, Frame, Label, Button, Text, Listbox, Spinbox , ACTIVE , ttk
import time
import pyttsx3
import matplotlib.pyplot as plt
from deep_translator import GoogleTranslator 
import cv2
import random
import threading
import os
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import sounddevice as sd
import wave
import pygame




# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

translator = GoogleTranslator(source="en", target="kn")
tts_engine = pyttsx3.init()

# Dictionary to store efficiency metrics
efficiency_metrics = {
    "image_processing_time": 0,
    "story_generation_time": 0,
    "translation_time": 0,
    "audio_generation_time": 0
}

def update_efficiency_metrics():
    """
    Updates efficiency metrics on the GUI.
    """
    metrics_text = (
        f"📸 Image Processing: {efficiency_metrics['image_processing_time']:.2f} sec\n"
        f"📖 Story Generation: {efficiency_metrics['story_generation_time']:.2f} sec\n"
        f"🌍 Translation: {efficiency_metrics['translation_time']:.2f} sec\n"
        f"🔊 Audio Generation: {efficiency_metrics['audio_generation_time']:.2f} sec"
    )
    efficiency_label.config(text=metrics_text)

SAVE_DIR = "saved_audio"
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

stories = []
translator = GoogleTranslator(source="auto", target="kn")
engine = pyttsx3.init()
# Define paths for saving stories and audio
STORY_SAVE_PATH = "generated_stories"
AUDIO_SAVE_PATH = "generated_audio"

# Ensure directories exist
os.makedirs(STORY_SAVE_PATH, exist_ok=True)
os.makedirs(AUDIO_SAVE_PATH, exist_ok=True)

LOCAL_IMAGE_FOLDER = "C:\\Users\\ansl6\\Downloads\\NAVEEN DS PROJECTS\\IR FINAL\\IMAGES"

def speak_kannada(text, filename="output.mp3"):
    tts = gTTS(text=text, lang="kn")
    tts.save(filename)
    audio = AudioSegment.from_file(filename, format="mp3")
    play(audio)

def load_image(path):
    return Image.open(path).convert("RGB")

def generate_caption(image):
    start_time = time.time()
    inputs = blip_processor(image, return_tensors="pt").to(device)
    with torch.no_grad():
        caption_ids = blip_model.generate(**inputs)
    efficiency_metrics["image_processing_time"] = time.time() - start_time
    update_efficiency_metrics()
    return blip_processor.decode(caption_ids[0], skip_special_tokens=True)

story_prompts = [
    "An unexpected journey begins when...", "A mysterious event changes everything...",
    "A hero rises in the face of danger...", "A magical world unfolds before them...",
    "A secret from the past resurfaces...", "A race against time begins...",
    "A lost artifact holds the key to...", "A twist of fate leads them to...",
    "A battle between good and evil ensues...", "An ancient prophecy reveals the truth..."
]

def generate_stories():
    start_time = time.time()

    global stories  
    caption = caption_label.cget("text").replace("Caption: ", "").strip()
    
    if not caption:
        messagebox.showerror("Error", "No caption generated. Upload or capture an image first.")
        return

    count = int(story_count_spinbox.get())
    stories.clear()  # Clear previous stories before generating new ones

    def generate_story_thread():
        new_stories = []  # Temporary list to hold new stories
        for _ in range(count):
            prompt = f"{random.choice(story_prompts)} {caption}"
            inputs = gpt_tokenizer.encode_plus(prompt, return_tensors="pt", max_length=100, truncation=True).to(device)
            
            with torch.no_grad():
                outputs = gpt_model.generate(
                    inputs["input_ids"], 
                    max_length=200,  
                    temperature=0.9,  
                    top_k=50,  
                    top_p=0.95,  
                    repetition_penalty=1.3
                )

            story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            if story not in new_stories:  
                new_stories.append(story)  

        stories.extend(new_stories)  # Update global stories list
        window.after(0, update_story_listbox, new_stories)  # Sync with UI

        efficiency_metrics["story_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

    threading.Thread(target=generate_story_thread, daemon=True).start()


def update_story_listbox(stories):
    story_listbox.delete(0, tk.END)
    for i, story in enumerate(stories):
        story_listbox.insert(tk.END, f"Story {i + 1}")
    
    def on_story_select(event):
        selected_index = story_listbox.curselection()
        if selected_index:
            index = selected_index[0]
            story_text.delete("1.0", tk.END)
            story_text.insert(tk.END, stories[index])
            kannada_text.delete("1.0", tk.END)
            kannada_text.insert(tk.END, translator.translate(stories[index]))
    
    story_listbox.bind("<<ListboxSelect>>", on_story_select)



def capture_image():
    cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)  # Use DirectShow for Windows

    if not cap.isOpened():
        messagebox.showerror("Error", "Failed to access the camera.")
        return

    time.sleep(2)  # Allow camera to adjust exposure

    for _ in range(10):  # Try capturing multiple times
        ret, frame = cap.read()
        if ret and np.mean(frame) > 10:  # Ensure image is not black
            break
        time.sleep(0.1)  # Small delay before retrying

    cap.release()  # Release the webcam

    if not ret or np.mean(frame) <= 10:
        messagebox.showerror("Error", "Failed to capture a clear image.")
        return

    img_path = "captured_image.jpg"
    cv2.imwrite(img_path, frame)  # Save image

    # Display image in UI
    img = Image.open(img_path).resize((300, 300))
    img_tk = ImageTk.PhotoImage(img)
    image_label.config(image=img_tk)
    image_label.image = img_tk

    # Process the image
    caption = generate_caption(load_image(img_path))
    caption_label.config(text=f"Caption: {caption}")
    stories = generate_stories(caption)

    save_story(stories)

    story_listbox.delete(0, tk.END)
    for i in range(10):
        story_listbox.insert(tk.END, f"Story {i + 1}")



    def on_story_select(event):
        selected_index = story_listbox.curselection()
        if selected_index:
            index = selected_index[0]
            story = stories[index]

            start_time = time.time()
            kannada_story = translator.translate(story)
            efficiency_metrics["translation_time"] = time.time() - start_time
            update_efficiency_metrics()

            story_text.delete("1.0", tk.END)
            story_text.insert(tk.END, story)

            kannada_text.delete("1.0", tk.END)
            kannada_text.insert(tk.END, kannada_story)

    story_listbox.bind("<<ListboxSelect>>", on_story_select)

# Upload image
def upload_image(source):
    file_types = [("Image Files", "*.png;*.jpg;*.jpeg;*.bmp;*.gif;*.tiff;*.avif")]

    if source == "device":
        file_path = filedialog.askopenfilename(title="Select an Image from Device", filetypes=file_types)
    else:
        file_path = filedialog.askopenfilename(initialdir=LOCAL_IMAGE_FOLDER, title="Select an Image from Project Folder", filetypes=file_types)

    if file_path:
        process_image(file_path)

def process_image(img_path):
    img = Image.open(img_path).resize((300, 300))
    img_tk = ImageTk.PhotoImage(img)
    image_label.config(image=img_tk)
    image_label.image = img_tk
    
    start_time = time.time()
    caption_label.config(text=f"Caption: {generate_caption(load_image(img_path))}")
    efficiency_metrics["translation_time"] = time.time() - start_time
    update_efficiency_metrics()

def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        start_time = time.time()
        tts_engine.say(story)
        tts_engine.runAndWait()
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

def read_kannada_story():
    story_in_kannada = kannada_text.get("1.0", tk.END).strip()
    
    if story_in_kannada:
        start_time = time.time()
        
        # Define the filename
        filename = os.path.join(SAVE_DIR, "kannada_story.mp3")
        
        # Convert text to speech and save
        tts = gTTS(text=story_in_kannada, lang="kn")
        tts.save(filename)
        
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()
        
        # Play the saved audio file
        pygame.mixer.init()
        pygame.mixer.music.load(filename)
        pygame.mixer.music.play()

        while pygame.mixer.music.get_busy():  # Wait until the audio finishes playing
            time.sleep(0.1)

def save_audio(language):
    selected_story = story_listbox.get(ACTIVE)  # Get the selected story
    if not selected_story:
        messagebox.showerror("Error", "No story selected.")
        return
    messagebox.showinfo("Saving Audio", f"Saving audio in {language}...")  # Replace with actual saving logic

def save_story(language):
    selected_story = story_listbox.get(ACTIVE)  # Get the selected story
    if not selected_story:
        messagebox.showerror("Error", "No story selected.")
        return
    messagebox.showinfo("Saving Story", f"Saving story in {language}...")  # Replace with actual saving logic


def estimate_recording_duration(text):
    """Estimate duration based on story length (assuming 150 words per minute)."""
    words_per_minute = 150
    word_count = len(text.split())
    return word_count / words_per_minute * 60  # Convert minutes to seconds

def text_to_speech(text, filename):
    """Converts text to speech and saves as an audio file."""
    engine.save_to_file(text, filename)
    engine.runAndWait()

def text_to_speech(text, filename, lang="en"):
    """Convert text to speech and save as an audio file."""
    tts = gTTS(text=text, lang=lang)
    mp3_filename = filename.replace(".wav", ".mp3")  # gTTS only supports MP3
    tts.save(mp3_filename)
    
    # Convert MP3 to WAV if needed
    if filename.endswith(".wav"):
        audio = AudioSegment.from_mp3(mp3_filename)
        audio.export(filename, format="wav")

def record_audio(language):
    """Records and saves English, Kannada, or both story audios."""
    selected_index = story_listbox.curselection()
    if not selected_index:
        messagebox.showerror("Error", "No story selected.")
        return

    selected_index = selected_index[0]
    selected_story = stories[selected_index]

    if language in ["english", "both"]:
        english_filename = os.path.join(AUDIO_SAVE_PATH, f"story_english_{selected_index}.wav")
        text_to_speech(selected_story, english_filename, lang="en")
        messagebox.showinfo("Success", f"English audio saved:\n{english_filename}")

    if language in ["kannada", "both"]:
        translator = GoogleTranslator(source="auto", target="kn")
        kannada_story = translator.translate(selected_story)
        
        if not kannada_story.strip():
            messagebox.showerror("Error", "Kannada translation failed.")
            return
        
        kannada_filename = os.path.join(AUDIO_SAVE_PATH, f"story_kannada_{selected_index}.wav")
        text_to_speech(kannada_story, kannada_filename, lang="kn")
        messagebox.showinfo("Success", f"Kannada audio saved:\n{kannada_filename}")

def save_story(language):
    """Saves the selected story in English, Kannada, or both."""
    global stories  

    selected_index = story_listbox.curselection()
    if not selected_index:
        messagebox.showerror("Error", "No story selected.")
        return

    selected_index = selected_index[0]  

    if selected_index >= len(stories):  
        messagebox.showerror("Error", "Error retrieving the full story. Please regenerate the stories.")
        return

    selected_story = stories[selected_index]  

    try:
        if language == "english":
            
            filename = os.path.join(STORY_SAVE_PATH, f"story_english.txt")
            with open(filename, "w", encoding="utf-8") as file:
                file.write(selected_story)
            
            messagebox.showinfo("Success", f"English story saved:\n{filename}")

        elif language == "kannada":
            
            kannada_story = translator.translate(selected_story)  # Translate using deep_translator
            filename = os.path.join(STORY_SAVE_PATH, f"story_kannada.txt")
            with open(filename, "w", encoding="utf-8") as file:
                file.write(kannada_story)
            
            messagebox.showinfo("Success", f"Kannada story saved:\n{filename}")

        elif language == "both":
            
            kannada_story = translator.translate(selected_story)  

            english_filename = os.path.join(STORY_SAVE_PATH, f"story_english.txt")
            kannada_filename = os.path.join(STORY_SAVE_PATH, f"story_kannada.txt")

            with open(english_filename, "w", encoding="utf-8") as file:
                file.write(selected_story)

            with open(kannada_filename, "w", encoding="utf-8") as file:
                file.write(kannada_story)
            
            messagebox.showinfo("Success", f"Both English & Kannada stories saved:\n{english_filename}\n{kannada_filename}")

    except Exception as e:
        messagebox.showerror("Error", f"Failed to save story: {e}")



# Create Main Window
window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("500x600")

# Create a Main Frame to Hold Canvas and Scrollbar
main_frame = tk.Frame(window)
main_frame.pack(fill=tk.BOTH, expand=True)

# Create a Canvas for Scrolling
canvas = tk.Canvas(main_frame)
canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

# Add a Scrollbar Linked to the Canvas
scrollbar = ttk.Scrollbar(main_frame, orient=tk.VERTICAL, command=canvas.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
canvas.configure(yscrollcommand=scrollbar.set)

# Create Scrollable Frame Inside Canvas
content_frame = tk.Frame(canvas)
canvas_window = canvas.create_window((0, 0), window=content_frame, anchor="nw")

# Function to Update Scroll Region
def update_scroll_region(event):
    canvas.configure(scrollregion=canvas.bbox("all"))

content_frame.bind("<Configure>", update_scroll_region)

# Enable Mouse Wheel Scrolling
def on_canvas_scroll(event):
    canvas.yview_scroll(-1 * (event.delta // 120), "units")

canvas.bind_all("<MouseWheel>", on_canvas_scroll)

# 🔹 **Widgets Inside Scrollable Frame**
image_label = tk.Label(content_frame)
image_label.pack()

caption_label = tk.Label(content_frame, text="Caption: ", font=("Arial", 12))
caption_label.pack()

# Upload Section
upload_frame = tk.Frame(content_frame)
upload_frame.pack(pady=5)

upload_button = tk.Button(upload_frame, text="Upload Image", command=lambda: toggle_upload_buttons())
upload_button.pack()

upload_from_device_button = tk.Button(upload_frame, text="From Device", command=lambda: upload_image("device"))
upload_from_folder_button = tk.Button(upload_frame, text="From Folder", command=lambda: upload_image("folder"))

def toggle_upload_buttons():
    """Toggle the visibility of upload options."""
    buttons = [upload_from_device_button, upload_from_folder_button]
    if buttons[0].winfo_ismapped():
        for btn in buttons:
            btn.pack_forget()
    else:
        for btn in buttons:
            btn.pack(pady=2)

tk.Button(content_frame, text="Capture Image", command=capture_image).pack()

# Story Generation Section
story_count_spinbox = tk.Spinbox(content_frame, from_=1, to=10)
story_count_spinbox.pack(pady=5)

tk.Button(content_frame, text="Generate Stories", command=generate_stories).pack()

story_listbox = tk.Listbox(content_frame, height=5)
story_listbox.pack()

story_text = tk.Text(content_frame, height=5, wrap=tk.WORD)
story_text.pack()

kannada_text = tk.Text(content_frame, height=5, wrap=tk.WORD)
kannada_text.pack()

            
def update_efficiency_label():
    if efficiency_label.winfo_exists():  # ✅ Check if the widget exists before updating
        efficiency_label.config(text="Updated Efficiency Metrics!")
# 🏆 Efficiency Metrics Display
efficiency_label = tk.Label(content_frame, text="Efficiency Metrics: ", wraplength=400, justify="center", font=("Arial", 12, "bold"))
efficiency_label.pack(pady=10)

tk.Button(content_frame, text="Read English Story", command=read_english_story).pack(pady=2)
tk.Button(content_frame, text="Read Kannada Story", command=read_kannada_story).pack(pady=2)

# Save Buttons Frame
save_frame = tk.Frame(content_frame)
save_frame.pack(pady=5)

# Save Audio Section
save_audio_button = tk.Button(save_frame, text="Save Audio", command=lambda: toggle_save_audio_buttons())
save_audio_button.pack()

record_english = tk.Button(save_frame, text="Record English", command=lambda: record_audio("english"))
record_kannada = tk.Button(save_frame, text="Record Kannada", command=lambda: record_audio("kannada"))
record_both = tk.Button(save_frame, text="Record Both", command=lambda: record_audio("both"))

def toggle_save_audio_buttons():
    """Toggle the visibility of save audio options."""
    buttons = [record_english, record_kannada, record_both]
    if buttons[0].winfo_ismapped():
        for btn in buttons:
            btn.pack_forget()
    else:
        for btn in buttons:
            btn.pack(pady=2)


# Save Story Section
save_story_button = tk.Button(save_frame, text="Save Story", command=lambda: toggle_save_story_buttons())
save_story_button.pack()

save_story_english = tk.Button(save_frame, text="English", command=lambda: save_story("english"))
save_story_kannada = tk.Button(save_frame, text="Kannada", command=lambda: save_story("kannada"))
save_story_both = tk.Button(save_frame, text="Both", command=lambda: save_story("both"))

def toggle_save_story_buttons():
    """Toggle the visibility of save story options."""
    buttons = [save_story_english, save_story_kannada, save_story_both]
    if buttons[0].winfo_ismapped():
        for btn in buttons:
            btn.pack_forget()
    else:
        for btn in buttons:
            btn.pack(pady=2)

# Run Tkinter Main Loop
window.mainloop()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask an