In [None]:
# Use BLIP to generate caption instead of fixed default_labels

import os
import tempfile

import gradio as gr
import torch
from deep_translator import GoogleTranslator
from gtts import gTTS
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration

In [None]:
# Run the models on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# CLIP checkpoint: processor plus model weights (weights are moved to `device`).
# NOTE(review): neither clip_model nor clip_processor is referenced by the
# functions or UI cells visible in this notebook — confirm they are still needed.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)

# BLIP captioning checkpoint: processor plus model weights (also moved to `device`).
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = blip_model.to(device)

# Display name -> language code handed to both the translator and the TTS engine.
# Insertion order matters: the UI rows and predict()'s outputs are both built by
# iterating this dict, so the two stay aligned only while the order is stable.
languages = {
    "English": "en",
    "Korean": "ko",
    "Japanese": "ja",
    "Chinese": "zh-CN",
    "Russian": "ru",
    "French": "fr",
    "Vietnamese": "vi",
    "Spanish": "es",
    "Portuguese": "pt",
}

In [None]:
# Translate
def translate_word(word, lang_code):
    """Translate ``word`` into ``lang_code`` with Google Translate (best effort).

    Args:
        word: Text to translate; the source language is auto-detected.
        lang_code: Target language code, passed to ``GoogleTranslator`` as ``target``.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns, or the Thai placeholder
        "(แปลไม่ได้)" ("cannot translate") when the translator raises.
    """
    try:
        translator = GoogleTranslator(source="auto", target=lang_code)
        return translator.translate(word)
    except Exception as err:
        # Deliberately best effort: one failed translation must not abort the
        # whole UI callback, so log it and hand back a visible placeholder.
        print("Translation error:", err)
        return "(แปลไม่ได้)"

# Text-to-speech
def speak_word(text, lang_code):
    """Synthesize ``text`` to a temporary MP3 file with gTTS (best effort).

    Args:
        text: Text to speak. Empty or ``None`` text yields ``None`` without
            contacting the TTS service.
        lang_code: Language code passed to ``gTTS`` as ``lang``.

    Returns:
        Path of the generated ``.mp3`` file, or ``None`` when there is nothing to
        speak or synthesis fails. The file is created with ``delete=False``, so
        it stays on disk after this function returns.
    """
    if not text:
        return None

    audio_path = None
    try:
        tts = gTTS(text=text, lang=lang_code)
        # Reserve a unique path and close the handle first, so gTTS writes to a
        # file that nothing else holds open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print("TTS error:", e)
        # delete=False means nobody else cleans up: drop the empty or partial
        # file so failed requests do not pile up in the temp directory.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                # Cleanup is itself best effort; the TTS error was already reported.
                pass
        return None

# Predict using BLIP + translate + TTS
def predict(image, selected_langs):
    """Caption ``image`` with BLIP, then translate and voice the caption.

    Args:
        image: Picture handed straight to ``blip_processor``. ``None`` (for
            example the button was pressed before anything was uploaded)
            produces a blank result instead of an exception.
        selected_langs: Names (keys of ``languages``) to translate into.
            ``None`` is treated as "no language selected".

    Returns:
        A flat list in this order: the Thai translation of the caption, then one
        translated text per entry of ``languages`` (``""`` when not selected),
        then one ``gr.update`` per entry of ``languages`` carrying the audio
        path and making the player visible only when a path exists.
    """
    if image is None:
        # Keep the output count identical to the normal path so it still lines
        # up with the components wired to this callback.
        return (
            [""]
            + ["" for _ in languages]
            + [gr.update(value=None, visible=False) for _ in languages]
        )

    selected = selected_langs or ()

    # Use BLIP to generate the image description (English caption).
    inputs = blip_processor(image, return_tensors="pt").to(device)
    out = blip_model.generate(**inputs)
    eng_caption = blip_processor.decode(out[0], skip_special_tokens=True)

    # Translate the caption to Thai for the prediction_text box.
    caption_thai = translate_word(eng_caption, "th")

    # Build the per-language outputs in the iteration order of `languages`.
    texts = []
    audio_updates = []
    for lang, code in languages.items():
        if lang in selected:
            translated = translate_word(eng_caption, code)
            audio_path = speak_word(translated, code)
        else:
            translated, audio_path = "", None
        texts.append(translated)
        audio_updates.append(gr.update(value=audio_path, visible=bool(audio_path)))

    return [caption_thai] + texts + audio_updates

In [None]:
# UI
with gr.Blocks() as app:
    # Header (Thai): upload image -> predict -> choose the languages you want ->
    # press translate and listen to the audio in the chosen languages.
    gr.Markdown(
        "# 📸 SnapTranslate By Pawit\n"
        "อัปโหลดภาพ → ทำนาย → เลือกภาษาที่ต้องการ → กดแปลภาษาและกดฟังเสียงภาษาที่ต้องการ"
    )

    with gr.Row():
        # Label: "Upload image".
        image_input = gr.Image(type="pil", label="📤 อัปโหลดรูปภาพ")
        # Label: "Choose the languages to translate into".
        lang_select = gr.CheckboxGroup(choices=list(languages.keys()), label="🌐 เลือกภาษาที่ต้องการแปล")

    # Button: "Translate image"; textbox label: "Description from the image".
    predict_btn = gr.Button("🧠 แปลภาพ")
    prediction_text = gr.Textbox(label="🔍 คำอธิบายจากภาพ")

    output_texts = {}
    audio_outputs = {}

    # One row per language: the translated text next to an audio player that
    # starts hidden and is made visible by predict() once audio exists.
    for lang in languages:
        with gr.Row():
            output_texts[lang] = gr.Textbox(label=f"{lang}:", visible=True)
            audio_outputs[lang] = gr.Audio(label="", visible=False)

    # The output order must mirror predict()'s return list:
    # caption, then every text box, then every audio player.
    predict_btn.click(
        fn=predict,
        inputs=[image_input, lang_select],
        outputs=[prediction_text] + list(output_texts.values()) + list(audio_outputs.values())
    )

# NOTE(review): share=True asks Gradio for a public share link — confirm that
# exposing this app outside the local machine is intended.
app.launch(debug=True, share=True)