In [None]:
import os
import tempfile
from io import BytesIO

import gradio as gr
import torch
from deep_translator import GoogleTranslator
from gtts import gTTS
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Load the pretrained CLIP processor and model.
# Inference runs on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)

# Candidate labels the model picks from when classifying an uploaded image.
default_labels = [
    "apple",
    "bottle",
    "book",
    "pen",
    "scissors",
    "keyboard",
    "banana",
    "cup",
    "phone",
    "chair",
    "table",
    "backpack",
]

# Supported target languages: display name -> language code.
# The code is passed to both translate_word() and speak_word() by predict().
languages = {
    "Thai": "th",
    "English": "en",
    "Korean": "ko",
    "Japanese": "ja",
    "Chinese": "zh-CN",
    "Russian": "ru",
    "French": "fr",
    "Vietnamese": "vi",
    "Spanish": "es",
    "Portuguese": "pt",
}

In [None]:
def translate_word(word, lang_code):
    """Translate `word` into the language identified by `lang_code`.

    The translator is created with source="auto" and target=`lang_code`.
    This is best-effort: any exception is printed and a fixed placeholder
    string is returned instead of propagating the error.
    """
    translator_options = {"source": "auto", "target": lang_code}
    try:
        translated = GoogleTranslator(**translator_options).translate(word)
    except Exception as err:
        print("Translation error:", err)
        translated = "(‡πÅ‡∏õ‡∏•‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ)"
    return translated

def speak_word(text, lang_code):
    """Synthesize `text` as speech in `lang_code` and save it to a temporary MP3.

    Args:
        text: The text to speak.
        lang_code: Language code handed to gTTS.

    Returns:
        The path of the generated ".mp3" file, or None when synthesis fails.
        The file is created with delete=False, so on success it stays on disk
        for the caller to use. On failure the error is printed and any
        partially created file is removed.
    """
    audio_path = None
    try:
        tts = gTTS(text=text, lang=lang_code)
        # Reserve a unique file name, then close the handle before gTTS
        # writes to that path, so the file is never open twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print("TTS error:", e)
        # delete=False means nothing else cleans up after a failed save.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return None

def predict(image, selected_langs):
    """Classify `image` against `default_labels`, then translate and voice the label.

    Args:
        image: The uploaded image, or None when nothing has been uploaded.
        selected_langs: Names (keys of `languages`) chosen for translation.
            None is treated as an empty selection.

    Returns:
        A flat list in this order:
          1. the predicted label,
          2. one translated string per entry of `languages`, in its iteration
             order ("" for languages that were not selected),
          3. one `gr.update` per entry of `languages` carrying the audio file
             path; the component is made visible only when a path exists.
        When `image` is None, every text is "" and every audio update is hidden.
    """
    chosen = selected_langs or []
    lang_names = list(languages)

    if image is None:
        # Nothing uploaded: clear every output instead of failing during inference.
        return (
            [""]
            + ["" for _ in lang_names]
            + [gr.update(value=None, visible=False) for _ in lang_names]
        )

    inputs = processor(text=default_labels, images=image, return_tensors="pt", padding=True).to(device)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        clip_out = model(**inputs)
    probs = clip_out.logits_per_image.softmax(dim=1)
    pred_label = default_labels[probs.argmax().item()]

    texts = []
    audio_updates = []
    for lang in lang_names:
        if lang in chosen:
            code = languages[lang]
            translated = translate_word(pred_label, code)
            audio_path = speak_word(translated, code)
        else:
            translated, audio_path = "", None
        texts.append(translated)
        # speak_word returns None on failure; keep the player hidden in that case.
        audio_updates.append(gr.update(value=audio_path, visible=bool(audio_path)))

    return [pred_label] + texts + audio_updates

In [None]:
# Build the Gradio UI: image upload + language selection -> predicted label,
# per-language translations, and per-language audio players.
with gr.Blocks() as app:
    gr.Markdown("# üì∏ SnapTranslate\n‡∏≠‡∏±‡∏õ‡πÇ‡∏´‡∏•‡∏î‡∏†‡∏≤‡∏û ‚Üí ‡∏ó‡∏≤‡∏¢‡∏ß‡∏±‡∏ï‡∏ñ‡∏∏ ‚Üí ‡πÅ‡∏õ‡∏•‡∏´‡∏•‡∏≤‡∏¢‡∏†‡∏≤‡∏©‡∏≤ ‚Üí ‡∏Å‡∏î‡∏ü‡∏±‡∏á‡πÄ‡∏™‡∏µ‡∏¢‡∏á")

    # Inputs: the image is delivered to predict() as a PIL image (type="pil");
    # the checkbox group offers every key of `languages`.
    with gr.Row():
        image_input = gr.Image(type="pil", label="üì§ ‡∏≠‡∏±‡∏õ‡πÇ‡∏´‡∏•‡∏î‡∏†‡∏≤‡∏û")
        lang_select = gr.CheckboxGroup(choices=list(languages.keys()), label="üåê ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏†‡∏≤‡∏©‡∏≤‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡πÅ‡∏õ‡∏•")

    predict_btn = gr.Button("üß† ‡πÅ‡∏õ‡∏•‡∏†‡∏≤‡∏û")
    prediction_text = gr.Textbox(label="üîç ‡∏ß‡∏±‡∏ï‡∏ñ‡∏∏‡∏ó‡∏µ‡πà AI ‡∏ó‡∏≤‡∏¢‡πÑ‡∏î‡πâ")

    # Output components keyed by language name, filled in `languages` order.
    output_texts = {}
    audio_outputs = {}

    # One row per language: a text box for the translation and an audio player
    # that starts hidden; predict() reveals it via gr.update when it has a file.
    for lang in languages:
        with gr.Row():
            output_texts[lang] = gr.Textbox(label=f"{lang}:", visible=True)
            audio_outputs[lang] = gr.Audio(label="", visible=False)

    # The outputs list must stay in the same order as predict()'s return value:
    # the label first, then every text box, then every audio player.
    predict_btn.click(
        fn=predict,
        inputs=[image_input, lang_select],
        outputs=[prediction_text] + list(output_texts.values()) + list(audio_outputs.values())
    )

# NOTE(review): share=True asks Gradio for a publicly reachable link and
# debug=True keeps the cell running — confirm both are intended outside a demo.
app.launch(debug=True, share=True)