In [2]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar
import time
import pyttsx3
import os
from deep_translator import GoogleTranslator
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
import nltk

nltk.download('wordnet')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

translator = GoogleTranslator(source="en", target="kn")
tts_engine = pyttsx3.init()

def speak_kannada(text, filename):
    os.makedirs("audio_stories", exist_ok=True)
    filepath = os.path.join("audio_stories", filename)
    tts = gTTS(text=text, lang="kn")
    tts.save(filepath)
    audio = AudioSegment.from_file(filepath, format="mp3")
    play(audio)

def load_image(path):
    return Image.open(path).convert("RGB")

def generate_caption(image):
    inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**inputs)
    return blip_processor.decode(caption_ids[0], skip_special_tokens=True)

def generate_story(caption, max_length=150):
    enriched_prompt = f"{caption}. Once upon a time, "
    inputs = gpt_tokenizer.encode(enriched_prompt, return_tensors="pt").to(device)
    outputs = gpt_model.generate(inputs, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, temperature=0.8)
    return gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)

def translate_to_kannada(text):
    return translator.translate(text)

def calculate_metrics(story, caption):
    reference = caption.split()
    hypothesis = story.split()
    bleu = sentence_bleu([reference], hypothesis)
    meteor = meteor_score([reference], hypothesis)
    return bleu, meteor

def image_to_story(path):
    start_time = time.time()
    image = load_image(path)
    caption = generate_caption(image)
    story = generate_story(caption)
    kannada_story = translate_to_kannada(story)
    time_taken = time.time() - start_time
    efficiency = len(story.split()) / time_taken if time_taken > 0 else 0
    bleu, meteor = calculate_metrics(story, caption)
    
    os.makedirs("generated_stories", exist_ok=True)
    story_filename = os.path.join("generated_stories", "story.txt")
    with open(story_filename, "w", encoding="utf-8") as f:
        f.write(f"Caption: {caption}\n\n{story}\n\nKannada Story:\n{kannada_story}")
    
    speak_kannada(kannada_story, "audio_stories/kannada_story.mp3")
    return caption, story, kannada_story, time_taken, efficiency, bleu, meteor

def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        img = Image.open(file_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk
        
        caption, story, kannada_story, time_taken, efficiency, bleu, meteor = image_to_story(file_path)
        caption_label.config(text=f"Caption: {caption}")
        
        story_text.delete("1.0", tk.END)
        story_text.insert(tk.END, story)
        
        kannada_text.delete("1.0", tk.END)
        kannada_text.insert(tk.END, kannada_story)
        
        efficiency_label.config(text=f"Time Taken: {time_taken:.2f}s\nStory Length: {len(story.split())} words\nEfficiency: {efficiency:.2f} words/s\nBLEU: {bleu:.4f}\nMETEOR: {meteor:.4f}")

def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        tts_engine.say(story)
        tts_engine.runAndWait()

def read_kannada_story():
    kannada_story = kannada_text.get("1.0", tk.END).strip()
    if kannada_story:
        speak_kannada(kannada_story, "IR FINAL\\audio_stories\\kannada_story.mp3")

window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("600x800")

image_label = Label(window)
image_label.pack(pady=10)

upload_button = Button(window, text="Upload Image", command=upload_image)
upload_button.pack()

caption_label = Label(window, text="Caption: ", wraplength=400, justify="center")
caption_label.pack(pady=10)

story_text = Text(window, wrap="word", width=60, height=10)
story_text.pack(pady=10)

kannada_text = Text(window, wrap="word", width=60, height=10)
kannada_text.pack(pady=10)

efficiency_label = Label(window, text="Efficiency Metrics: ", wraplength=400, justify="center")
efficiency_label.pack(pady=10)

read_english_button = Button(window, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(window, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

# New Label for Evaluation Metrics
evaluation_label = Label(window, text="Evaluation Metrics: ", wraplength=400, justify="center", font=("Arial", 12, "bold"))
evaluation_label.pack(pady=10)

window.mainloop()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ansl6\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\ansl6\anaconda3\Lib\tkinter\__init__.py", line 1968, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\ansl6\AppData\Local\Temp\ipykernel_12476\958172938.py", line 89, in upload_image
    caption, story, kannada_story, time_taken, efficiency, bleu

In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, Label, Button, Text, Scrollbar, Listbox
import time
import pyttsx3
from deep_translator import GoogleTranslator
import random
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import cv2
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import spacy
import numpy as np

nlp = spacy.load("en_core_web_sm")
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

translator = GoogleTranslator(source="en", target="kn")
tts_engine = pyttsx3.init()


efficiency_metrics = {
    "image_processing_time": 0,
    "story_generation_time": 0,
    "translation_time": 0,
    "audio_generation_time": 0,
    "BLEU_score": 0,
    "METEOR_score": 0,
    "CIDEr_score": 0,
    "SPICE_score": 0
}

def compute_cider(reference, hypothesis):
    """
    A simplified CIDEr score calculation using TF-IDF weighting.
    """
    ref_words = reference.lower().split()
    hyp_words = hypothesis.lower().split()
    
    if not ref_words or not hyp_words:
        return 0.0
    
    # Compute Term Frequency (TF)
    ref_tf = {word: ref_words.count(word) / len(ref_words) for word in set(ref_words)}
    hyp_tf = {word: hyp_words.count(word) / len(hyp_words)}
    
    # Compute Inverse Document Frequency (IDF)
    all_words = set(ref_words + hyp_words)
    idf = {word: np.log(1 + (1 / (1 + ref_words.count(word) + 1e-6))) for word in all_words}
    
    # Compute TF-IDF score
    cider_score = sum(hyp_tf.get(word, 0) * idf.get(word, 0) for word in hyp_words)
    
    return round(cider_score, 4)

def evaluate_caption(reference, hypothesis):
    if not reference.strip() or not hypothesis.strip():
        return
    
    # BLEU Score with Smoothing
    smooth = SmoothingFunction().method1
    efficiency_metrics["BLEU_score"] = sentence_bleu([reference.split()], hypothesis.split(), smoothing_function=smooth)

    # METEOR Score
    efficiency_metrics["METEOR_score"] = meteor_score([reference], hypothesis)

    # CIDEr Score (Using simplified TF-IDF method)
    efficiency_metrics["CIDEr_score"] = compute_cider(reference, hypothesis)

    # SPICE Score (Using SpaCy Semantic Similarity)
    ref_doc = nlp(reference)
    hyp_doc = nlp(hypothesis)
    efficiency_metrics["SPICE_score"] = ref_doc.similarity(hyp_doc)

    update_efficiency_metrics()

def update_efficiency_metrics():
    metrics_text = (
        f"📸 Image Processing: {efficiency_metrics['image_processing_time']:.2f} sec\n"
        f"📖 Story Generation: {efficiency_metrics['story_generation_time']:.2f} sec\n"
        f"🌍 Translation: {efficiency_metrics['translation_time']:.2f} sec\n"
        f"🔊 Audio Generation: {efficiency_metrics['audio_generation_time']:.2f} sec\n"
        f"🔢 BLEU Score: {efficiency_metrics['BLEU_score']:.4f}\n"
        f"📊 METEOR Score: {efficiency_metrics['METEOR_score']:.4f}\n"
        f"📈 CIDEr Score: {efficiency_metrics['CIDEr_score']:.4f}\n"
        f"🧠 SPICE Score: {efficiency_metrics['SPICE_score']:.4f}"
    )
    efficiency_label.config(text=metrics_text)


def speak_kannada(text, filename="output.mp3"):
    tts = gTTS(text=text, lang="kn")
    tts.save(filename)
    audio = AudioSegment.from_file(filename, format="mp3")
    play(audio)

def load_image(path):
    return Image.open(path).convert("RGB")

def generate_caption(image):
    start_time = time.time()
    inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    efficiency_metrics["image_processing_time"] = time.time() - start_time
    update_efficiency_metrics()
    return caption

def generate_story(caption, max_length=150):
    start_time = time.time()

    story_styles = [
        "A magical adventure unfolds where",
        "A suspenseful tale emerges involving",
        "A heartwarming story takes place as",
        "An unexpected journey begins when",
        "A dramatic event changes everything when",
        "A mysterious secret is revealed when",
        "A brave hero faces a great challenge when",
        "A thrilling discovery is made as",
        "A peaceful moment turns into an epic tale when",
        "A legendary event occurs as"
    ]
    random_prompt = random.choice(story_styles)
    enriched_prompt = f"The image shows {caption}. {random_prompt} unexpected events, emotions, and resolutions."

    inputs = gpt_tokenizer.encode_plus(enriched_prompt, return_tensors="pt", max_length=50, truncation=True).to(device)
    outputs = gpt_model.generate(
        inputs["input_ids"], max_length=max_length, num_return_sequences=1,
        no_repeat_ngram_size=3, temperature=1.2, top_k=50, top_p=0.90,
        pad_token_id=gpt_tokenizer.eos_token_id
    )

    story = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    efficiency_metrics["story_generation_time"] = time.time() - start_time
    update_efficiency_metrics()
    return story

def generate_multiple_stories(caption, num_stories=10):
    return [generate_story(caption) for _ in range(num_stories)]

def upload_image():
    file_path = filedialog.askopenfilename()
    if file_path:
        img = Image.open(file_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        caption = generate_caption(load_image(file_path))
        caption_label.config(text=f"Caption: {caption}")
        stories = generate_multiple_stories(caption)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)
        
def capture_image():
    cap = cv2.VideoCapture(0)  # Open the webcam
    ret, frame = cap.read()  # Capture a frame
    cap.release()  # Release the webcam
    
    if ret:
        img_path = "captured_image.jpg"
        cv2.imwrite(img_path, frame)  # Save the captured image

        # Load and display the captured image
        img = Image.open(img_path).resize((300, 300))
        img_tk = ImageTk.PhotoImage(img)
        image_label.config(image=img_tk)
        image_label.image = img_tk

        # Process the image
        caption = generate_caption(load_image(img_path))
        caption_label.config(text=f"Caption: {caption}")
        stories = generate_multiple_stories(caption)

        story_listbox.delete(0, tk.END)
        for i in range(10):
            story_listbox.insert(tk.END, f"Story {i + 1}")

        efficiency_label.config(text="Generated 10 unique stories.")

        def on_story_select(event):
            selected_index = story_listbox.curselection()
            if selected_index:
                index = selected_index[0]
                story = stories[index]

                start_time = time.time()
                kannada_story = translator.translate(story)
                efficiency_metrics["translation_time"] = time.time() - start_time
                update_efficiency_metrics()

                story_text.delete("1.0", tk.END)
                story_text.insert(tk.END, story)

                kannada_text.delete("1.0", tk.END)
                kannada_text.insert(tk.END, kannada_story)

        story_listbox.bind("<<ListboxSelect>>", on_story_select)


def read_english_story():
    story = story_text.get("1.0", tk.END).strip()
    if story:
        start_time = time.time()
        tts_engine.say(story)
        tts_engine.runAndWait()
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

def read_kannada_story():
    kannada_story = kannada_text.get("1.0", tk.END).strip()
    if kannada_story:
        start_time = time.time()
        speak_kannada(kannada_story)
        efficiency_metrics["audio_generation_time"] = time.time() - start_time
        update_efficiency_metrics()

# GUI Setup
window = tk.Tk()
window.title("Image to Story Generator")
window.geometry("900x800")

main_frame = tk.Frame(window)
main_frame.pack(fill="both", expand=True)

canvas = tk.Canvas(main_frame)
scrollbar = Scrollbar(main_frame, orient="vertical", command=canvas.yview)
scrollable_frame = tk.Frame(canvas)
scrollable_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
canvas.configure(yscrollcommand=scrollbar.set)

scrollbar.pack(side="right", fill="y")
canvas.pack(side="left", fill="both", expand=True)

image_label = Label(scrollable_frame)
image_label.pack(pady=10)

upload_button = Button(scrollable_frame, text="Upload Image", command=upload_image)
upload_button.pack()

capture_button = Button(scrollable_frame, text="Capture Image", command=capture_image)
capture_button.pack()

caption_label = Label(scrollable_frame, text="Caption: ", wraplength=400, justify="center")
caption_label.pack(pady=10)



story_listbox = Listbox(scrollable_frame, height=10)
story_listbox.pack(pady=10)

story_text = Text(scrollable_frame, wrap="word", width=70, height=10)
story_text.pack(pady=10)

kannada_text = Text(scrollable_frame, wrap="word", width=70, height=10)
kannada_text.pack(pady=10)

# 🏆 Efficiency Metrics Display
efficiency_label = Label(scrollable_frame, text="Efficiency Metrics: ", wraplength=400, justify="center", font=("Arial", 12, "bold"))
efficiency_label.pack(pady=10)



read_english_button = Button(scrollable_frame, text="Read English Story", command=read_english_story)
read_english_button.pack(pady=5)

read_kannada_button = Button(scrollable_frame, text="Read Kannada Story", command=read_kannada_story)
read_kannada_button.pack(pady=5)

window.mainloop()