In [1]:
import hashlib
import os
import tempfile

import gradio as gr
import torch
from deep_translator import GoogleTranslator
from gtts import gTTS
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
from ultralytics import YOLO


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ----- Model setup -----
# Use the GPU when torch reports that CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# YOLOv8 detector, constructed from the "yolov8s.pt" weights name.
yolo_model = YOLO("yolov8s.pt")

# BLIP captioning: the processor prepares inputs and decodes generated tokens,
# the model (moved to `device`) generates the caption tokens.
blip_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# Target languages: UI display name -> language code passed to the translator
# and to the text-to-speech engine. Insertion order matters: it fixes the order
# of the per-language output components built later in the notebook.
languages = {
    "‡∏≠‡∏±‡∏á‡∏Å‡∏§‡∏©": "en",
    "‡πÄ‡∏Å‡∏≤‡∏´‡∏•‡∏µ": "ko",
    "‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô": "ja",
    "‡∏à‡∏µ‡∏ô": "zh-CN",
    "‡∏£‡∏±‡∏™‡πÄ‡∏ã‡∏µ‡∏¢": "ru",
    "‡∏ù‡∏£‡∏±‡πà‡∏á‡πÄ‡∏®‡∏™": "fr",
    "‡πÄ‡∏ß‡∏µ‡∏¢‡∏î‡∏ô‡∏≤‡∏°": "vi",
    "‡∏™‡πÄ‡∏õ‡∏ô": "es",
    "‡πÇ‡∏õ‡∏£‡∏ï‡∏∏‡πÄ‡∏Å‡∏™": "pt",
}

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
# ----- Utilities -----
# Directory (under the current working directory) where generated speech
# clips are stored; created on first run if it is not already there.
output_dir = os.path.join(os.getcwd(), "tts_cache")
if not os.path.isdir(output_dir):
    os.makedirs(output_dir, exist_ok=True)

def translate_word(word, lang_code):
    """Translate ``word`` into the language identified by ``lang_code``.

    The source language is auto-detected by ``GoogleTranslator``. This is a
    best-effort helper: any exception raised while translating is printed and
    a fixed placeholder string is returned instead of propagating the error.

    Args:
        word: Text to translate.
        lang_code: Target language code handed to ``GoogleTranslator``.

    Returns:
        The translator's result, or the placeholder string on failure.
    """
    translator_options = {"source": "auto", "target": lang_code}
    try:
        result = GoogleTranslator(**translator_options).translate(word)
    except Exception as err:
        print("Translation error:", err)
        result = "(‡πÑ‡∏°‡πà‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡πÅ‡∏õ‡∏•‡πÑ‡∏î‡πâ)"
    return result

def speak_word(text, lang_code):
    """Synthesize ``text`` to an MP3 in ``output_dir`` and return its path.

    Clips are cached on disk under a name derived from the language code and
    a SHA-256 digest of the text. A content digest is used instead of the
    built-in ``hash()`` because string hashes are randomised per interpreter
    process, which made cached files unreachable after a kernel restart.

    The clip is first written to a temporary file in the same directory and
    then renamed into place, so a failed or interrupted synthesis never
    leaves a truncated file that later calls would mistake for a cache hit.

    Args:
        text: Text to speak.
        lang_code: Language code passed to ``gTTS``.

    Returns:
        Path of the MP3 file, or ``None`` when ``text`` is empty/blank or
        synthesis fails (the error is printed, never raised).
    """
    try:
        # Nothing to speak: skip the network round-trip entirely.
        if not text or not text.strip():
            return None

        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
        filename = os.path.join(output_dir, f"{lang_code}_{digest}.mp3")

        if not os.path.exists(filename):
            # Temp file lives in output_dir so os.replace stays on one filesystem.
            fd, tmp_path = tempfile.mkstemp(dir=output_dir, suffix=".part")
            os.close(fd)
            try:
                gTTS(text=text, lang=lang_code).save(tmp_path)
                os.replace(tmp_path, filename)
            finally:
                # Only still present when saving or renaming failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
        return filename
    except Exception as e:
        print("TTS error:", e)
        return None

# ----- Shared output assembly -----
def _build_language_outputs(source_text, selected_langs):
    """Translate and voice ``source_text`` for every selected language.

    Iterates over the module-level ``languages`` mapping in its own order and
    produces one entry per language in each of three groups, so the result
    lines up positionally with the per-language UI components.

    Args:
        source_text: Text to translate (caption or detected label).
        selected_langs: Display names of the languages the user ticked.

    Returns:
        A flat list: all text values, then all audio updates, then all row
        updates. Unselected languages get an empty string and hidden
        components. When speech synthesis fails (``speak_word`` returns
        ``None``) the translation is still shown but the audio player is
        hidden rather than displayed empty.
    """
    text_results = []
    audio_results = []
    row_results = []

    for lang, lang_code in languages.items():
        if lang in selected_langs:
            translated = translate_word(source_text, lang_code)
            audio_path = speak_word(translated, lang_code)

            text_results.append(translated)
            audio_results.append(
                gr.update(value=audio_path, visible=audio_path is not None)
            )
            row_results.append(gr.update(visible=True))
        else:
            text_results.append("")
            audio_results.append(gr.update(visible=False))
            row_results.append(gr.update(visible=False))

    return text_results + audio_results + row_results


# ----- BLIP mode -----
def predict_blip(image, selected_langs):
    """Caption ``image`` with the BLIP model and translate the caption.

    Args:
        image: Image handed to ``blip_processor`` (the UI supplies a PIL image).
        selected_langs: Display names of the target languages.

    Returns:
        A list whose first item is the caption translated with target code
        ``"th"``, followed by the per-language texts, audio updates and row
        updates from ``_build_language_outputs``.
    """
    inputs = blip_processor(image, return_tensors="pt").to(device)
    out = blip_model.generate(**inputs)
    eng_caption = blip_processor.decode(out[0], skip_special_tokens=True)

    caption_thai = translate_word(eng_caption, "th")
    return [caption_thai] + _build_language_outputs(eng_caption, selected_langs)


# ----- YOLO mode -----
def predict_yolo(image, selected_langs):
    """Detect the most confident object in ``image`` and translate its label.

    Args:
        image: Image handed to ``yolo_model``.
        selected_langs: Display names of the target languages.

    Returns:
        A list whose first item is the label translated with target code
        ``"th"``, followed by the per-language texts, audio updates and row
        updates from ``_build_language_outputs``.

    Raises:
        gr.Error: If the detector returns no boxes for the image.
    """
    results = yolo_model(image)[0]
    if len(results.boxes) == 0:
        raise gr.Error("‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏ß‡∏±‡∏ï‡∏ñ‡∏∏‡πÄ‡∏î‡πà‡∏ô‡πÉ‡∏ô‡∏†‡∏≤‡∏û")

    # Keep only the single detection with the highest confidence score.
    best_box = max(results.boxes, key=lambda box: box.conf.cpu().item())
    class_id = int(best_box.cls.cpu().item())
    label = yolo_model.model.names[class_id]

    caption_thai = translate_word(label, "th")
    return [caption_thai] + _build_language_outputs(label, selected_langs)


In [4]:
# ----- Build the UI -----
with gr.Blocks() as app:
    gr.Markdown("""
    # üì∏ SnapTranslate by Pawit
    üì≤ ‡∏≠‡∏±‡∏õ‡πÇ‡∏´‡∏•‡∏î‡∏£‡∏π‡∏õ‡∏†‡∏≤‡∏û ‚Üí ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÇ‡∏´‡∏°‡∏î ‚Üí ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏†‡∏≤‡∏©‡∏≤ ‚Üí ‡πÅ‡∏õ‡∏• ‚Üí ‡∏ü‡∏±‡∏á‡πÄ‡∏™‡∏µ‡∏¢‡∏á
    """)

    # Labels of the two analysis modes. They are shared by the radio choices
    # and by the dispatch in predict() so the two can never drift apart.
    mode_object = "‡∏ö‡∏£‡∏£‡∏¢‡∏≤‡∏¢‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ß‡∏±‡∏ï‡∏ñ‡∏∏/‡∏™‡∏¥‡πà‡∏á‡∏°‡∏µ‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï/‡∏≠‡∏∑‡πà‡∏ô‡πÜ"  # -> predict_yolo
    mode_scene = "‡∏ö‡∏£‡∏£‡∏¢‡∏≤‡∏¢‡∏ó‡∏∏‡∏Å‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÉ‡∏ô‡∏£‡∏π‡∏õ‡∏†‡∏≤‡∏û"  # -> predict_blip

    with gr.Row():
        image_input = gr.Image(type="pil", label="üìÑ ‡∏≠‡∏±‡∏õ‡πÇ‡∏´‡∏•‡∏î‡∏†‡∏≤‡∏û")
        mode = gr.Radio([mode_object, mode_scene], label="üß† ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÇ‡∏´‡∏°‡∏î‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏†‡∏≤‡∏û")

    lang_select = gr.CheckboxGroup(choices=list(languages.keys()), label="üåê ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏†‡∏≤‡∏©‡∏≤‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡πÅ‡∏õ‡∏•")
    predict_btn = gr.Button("üîç ‡πÅ‡∏õ‡∏•‡∏†‡∏≤‡∏û")
    clear_btn = gr.Button("üóëÔ∏è ‡∏•‡πâ‡∏≤‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•")
    prediction_text = gr.Textbox(label="üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå", lines=1, max_lines=2)

    # One hidden row (textbox + audio player) per language, keyed by the
    # language display name and created in the order of `languages`.
    output_texts = {}
    audio_outputs = {}
    row_components = {}

    for lang in languages:
        with gr.Row(visible=False) as row:
            output_texts[lang] = gr.Textbox(label=f"{lang}:", visible=True, scale=3)
            audio_outputs[lang] = gr.Audio(label="", visible=False, scale=1)
        row_components[lang] = row

    # Per-language components in the order both handlers return them:
    # all textboxes, then all audio players, then all rows.
    per_language_outputs = (
        list(output_texts.values())
        + list(audio_outputs.values())
        + list(row_components.values())
    )

    def predict(image, mode_select, langs):
        """Validate the inputs and run the analysis for the chosen mode.

        Raises:
            gr.Error: If the image, the mode or the language selection is missing.
        """
        if image is None or not mode_select or not langs:
            raise gr.Error("‚ùó ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏≠‡∏±‡∏õ‡πÇ‡∏´‡∏•‡∏î‡∏£‡∏π‡∏õ‡∏†‡∏≤‡∏û ‡πÅ‡∏•‡∏∞‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÇ‡∏´‡∏°‡∏î‡∏´‡∏£‡∏∑‡∏≠‡∏†‡∏≤‡∏©‡∏≤‡πÉ‡∏´‡πâ‡∏Ñ‡∏£‡∏ö‡∏ñ‡πâ‡∏ß‡∏ô")
        if mode_select == mode_object:
            return predict_yolo(image, langs)
        return predict_blip(image, langs)

    predict_btn.click(
        fn=predict,
        inputs=[image_input, mode, lang_select],
        outputs=[prediction_text] + per_language_outputs,
    )

    def clear_all():
        """Reset the image, result text, language selection and all result rows.

        The returned values are matched positionally to the ``outputs`` list
        of ``clear_btn.click`` below, so the order here must mirror it exactly:
        image, result text, language selection, then the per-language groups.
        """
        return (
            gr.update(value=None),     # image_input
            "",                        # prediction_text
            gr.update(value=[]),       # lang_select (clear language selection)
            *["" for _ in output_texts],                                    # output_texts
            *[gr.update(value=None, visible=False) for _ in audio_outputs],  # audio_outputs
            *[gr.update(visible=False) for _ in row_components],            # row_components
        )

    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[image_input, prediction_text, lang_select] + per_language_outputs,
    )


# share=True requests a public share link in addition to the local URL;
# debug=True is passed through to Gradio unchanged.
app.launch(debug=True, share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://79efb75cebb3fb78cb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



0: 640x448 1 person, 1 bottle, 66.6ms
Speed: 2.3ms preprocess, 66.6ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 448)
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://79efb75cebb3fb78cb.gradio.live


