In [1]:
!pip install gradio 
!pip install torch 
!pip install transformers 
!pip install langdetect

Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Colle

In [2]:
!pip install translate

Collecting translate
  Downloading translate-3.6.1-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting libretranslatepy==2.1.1 (from translate)
  Downloading libretranslatepy-2.1.1-py3-none-any.whl.metadata (233 bytes)
Downloading translate-3.6.1-py2.py3-none-any.whl (12 kB)
Downloading libretranslatepy-2.1.1-py3-none-any.whl (3.2 kB)
Installing collected packages: libretranslatepy, translate
Successfully installed libretranslatepy-2.1.1 translate-3.6.1


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from langdetect import detect
from translate import Translator
import gradio as gr

In [5]:
# Run on the first CUDA GPU when PyTorch reports one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [6]:
# Hub identifier of the checkpoint shared by the tokenizer and the model.
model_name = "microsoft/phi-3-mini-4k-instruct"

# Fetches (or reads from the local cache) the tokenizer files for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal-LM weights first, then move the module to the selected device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [24]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): this cell's recorded execution count (24) is out of sequence
# with the surrounding cells; confirm it still runs before the chat function
# is used on a fresh "Restart & Run All".
tokenizer.pad_token = tokenizer.eos_token

In [10]:
def _translate_text(text, from_lang, to_lang):
    """Translate ``text`` from ``from_lang`` to ``to_lang`` via the ``translate`` package."""
    return Translator(from_lang=from_lang, to_lang=to_lang).translate(text)


def _cut_after_stop_phrase(text, stop_phrases):
    """Return ``text`` truncated right after the stop phrase that ends earliest in it.

    The text is returned unchanged when it contains none of the phrases.
    """
    cut_points = [
        text.index(phrase) + len(phrase) for phrase in stop_phrases if phrase in text
    ]
    if not cut_points:
        return text
    return text[: min(cut_points)].strip()


def chat_with_ai(user_input):
    """Answer a medical question, translating to and from English when needed.

    Pipeline: detect the question's language, translate it to English if
    necessary, prompt the language model, cut the answer at the first stop
    phrase, and translate the answer back to the original language.

    Args:
        user_input: The question typed by the user (any language).

    Returns:
        The answer as a string. If the input is empty or whitespace-only a
        prompt to enter a question is returned; if a translation step fails
        an explanatory message is returned instead of raising.
    """
    # Whitespace-only text carries no question and would make language
    # detection fail, so treat it like an empty input.
    if not user_input or not user_input.strip():
        return "Please enter a medical question."

    # Imported here so the function does not depend on an extra top-level import.
    from langdetect import LangDetectException

    # Detect language. Text without detectable features (digits, punctuation
    # only) makes langdetect raise; in that case skip translation entirely.
    try:
        original_lang = detect(user_input)
    except LangDetectException:
        original_lang = "en"

    # Translate to English if not already in English.
    if original_lang != "en":
        try:
            user_input = _translate_text(user_input, original_lang, "en")
        except Exception as e:
            return f"Translation failed: {str(e)}. Please try again."

    # Prompt asking for a concise, medical-only response.
    prompt = (
        "You are a medical AI assistant. Provide a concise, accurate, and strictly medical response. "
        "Stop after a complete, brief answer. Avoid repetition, stories, or unnecessary details.\n\n"
        f"User Question: {user_input}\n"
        "Medical Answer:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Phrases that mark the end of an answer. They are matched as whole
    # strings: matching only each phrase's final token would stop generation
    # at any occurrence of e.g. "care" or a single newline.
    stop_phrases = ["supportive care", "lifestyle changes", "medical attention", "\n\n"]

    outputs = model.generate(
        **inputs,
        max_new_tokens=150,        # Enough for a complete answer
        temperature=0.1,           # Low randomness for precision
        top_p=0.9,                 # Nucleus sampling cutoff
        do_sample=True,            # Required for temperature/top_p to apply
        stop_strings=stop_phrases,
        tokenizer=tokenizer,       # generate() needs it to match stop_strings
    )

    # Decode only the newly generated tokens. Removing the prompt with
    # str.replace relies on decode() reproducing the prompt text exactly.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Drop anything produced after the earliest stop phrase.
    response = _cut_after_stop_phrase(response, stop_phrases)

    # Translate back to the original language if it was not English.
    # NOTE(review): the translation backend may limit the length of a single
    # query; long answers might need to be translated in chunks - confirm.
    if original_lang != "en":
        try:
            response = _translate_text(response, "en", original_lang)
        except Exception as e:
            return f"Translation back failed: {str(e)}. Response in English: {response}"

    return response

In [11]:
from transformers import StoppingCriteria, StoppingCriteriaList

class CustomStoppingCriteria(StoppingCriteria):
    """Stop generation for a sequence once its newest token is a stop token.

    Args:
        stop_token_ids: Iterable of token ids that end generation.
    """

    def __init__(self, stop_token_ids):
        self.stop_token_ids = stop_token_ids

    def __call__(self, input_ids, scores, **kwargs):
        """Return a boolean tensor with one entry per sequence in the batch.

        Every row of ``input_ids`` is checked, not only the first one, so
        batched generation stops each sequence independently. For a batch of
        one the result is truthy exactly when the last token is a stop token.
        """
        stop_ids = torch.tensor(
            list(self.stop_token_ids),
            dtype=input_ids.dtype,
            device=input_ids.device,
        )
        # Compare the most recent token of every row against the stop ids.
        return torch.isin(input_ids[:, -1], stop_ids)

In [12]:
# Stylesheet passed to gr.Blocks for the page, header, inputs and disclaimer.
_APP_CSS = """
    body {background-color: #FFFFFF; font-family: Arial, sans-serif;}
    .header {background-color: #4A4A9D; color: white; padding: 20px; text-align: left; border-radius: 10px;}
    #chat_section {margin-top: 20px;}
    .input-box {width: 100%; margin-bottom: 10px;}
    .submit-btn {background-color: #4A4A9D; color: white; border-radius: 5px; padding: 5px 15px; margin-top: 10px;}
    .response-box {border: 1px solid #CCC; padding: 10px; border-radius: 5px;margin-top: 10px;}
    .disclaimer {margin-top: 15px; color: #888; font-size: 12px; text-align: center;}
"""

# Banner shown at the top of the page.
_HEADER_HTML = """
    <div class='header'>
        <h2>Medical AI Chatbot</h2>
        <p>Your assistant for understanding medical concerns. Type your question below.</p>
    </div>
    """

# Notice shown underneath the chat section.
_DISCLAIMER_HTML = """
    <div class='disclaimer'>
        Disclaimer: This chatbot is for informational purposes only. Consult a healthcare professional for medical advice.
    </div>
    """

with gr.Blocks(css=_APP_CSS) as demo:
    gr.Markdown(_HEADER_HTML)

    # One row: question box and submit button in a column, answer box beside it.
    with gr.Row(elem_id="chat_section"):
        with gr.Column():
            user_input = gr.Textbox(
                lines=2,
                placeholder="Enter your medical question...",
                elem_classes="input-box",
                label="Your Question",
            )
            submit_btn = gr.Button("Submit", elem_classes="submit-btn")
        response_output = gr.Textbox(
            lines=5,
            label="AI Response",
            interactive=False,
            elem_classes="response-box",
        )

    # Clicking the button sends the question to chat_with_ai and shows its return value.
    submit_btn.click(chat_with_ai, inputs=user_input, outputs=response_output)

    gr.Markdown(_DISCLAIMER_HTML)

In [13]:
def launch_interface(server_port=7860, share=True):
    """Launch the Gradio app, retrying without a fixed port if the first try fails.

    Args:
        server_port: Port requested for the first launch attempt.
        share: Passed through to ``demo.launch``; when true Gradio also
            creates a public share link.
    """
    try:
        demo.launch(server_port=server_port, share=share)
    except OSError:
        print(f"Port {server_port} busy, trying an alternative port...")
        # No explicit port on the retry, so Gradio chooses one itself.
        demo.launch(share=share)

# Run the app
if __name__ == "__main__":
    launch_interface()

Port 7860 busy, trying an alternative port...
* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://567e5c661c08c27fa8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
