In [1]:
!pip install langchain langchain-community langgraph sentence-transformers transformers accelerate bitsandbytes gliner medspacy spacy fastapi uvicorn faiss-cpu --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.6/244.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.9/69.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [6]:
# ======================
# Install Dependencies
# ======================
!pip install langchain langchain-community langchain-core langgraph sentence-transformers transformers accelerate bitsandbytes gliner annoy --quiet

# ======================
# Imports
# ======================
import pickle
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Annoy
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.runnables import RunnableConfig
from langgraph.graph import StateGraph, END
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.memory import ConversationEntityMemory, InMemoryEntityStore, VectorStoreRetrieverMemory
from gliner import GLiNER

# ======================
# ---- Pickle Setup ----
# ======================
# Pickle files used to persist state between runs. Both paths are relative, so
# they are resolved against the kernel's current working directory.
MEMORY_FILE = "conversation_memory.pkl"  # conversation history (load_memory / save_memory)
ENTITY_FILE = "entity_memory.pkl"  # extracted entities (load_entities / save_entities)

def load_pickle(path, default):
    """Load a pickled object from ``path``, falling back to ``default``.

    Args:
        path: Location of the pickle file.
        default: Value returned when the file is missing or unreadable.

    Returns:
        The unpickled object, or ``default`` when the file does not exist,
        is empty, or contains truncated/corrupt pickle data.

    Security:
        ``pickle.load`` can execute arbitrary code embedded in the file.
        Only point this at files written by this notebook itself.
    """
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        return default
    except (EOFError, pickle.UnpicklingError):
        # An interrupted write leaves an empty or truncated file; treat it like
        # a missing file instead of crashing on every subsequent start.
        return default

def save_pickle(path, obj):
    """Pickle ``obj`` to ``path`` without ever leaving a half-written file.

    The data is first written to a sibling ``<path>.tmp`` file and then moved
    over the destination with ``os.replace``. If pickling fails, the previous
    contents of ``path`` are left untouched and the temporary file is removed.

    Args:
        path: Destination file.
        obj: Any picklable object.

    Raises:
        Whatever ``pickle.dump`` or the filesystem raises; nothing is swallowed.
    """
    import os  # local import: the notebook's import cell does not bring in `os`

    tmp_path = f"{os.fspath(path)}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(obj, f)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up when the dump (or the replace) failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def load_memory():
    """Return the persisted conversation history, or an empty list if none exists."""
    history = load_pickle(MEMORY_FILE, default=[])
    return history

def save_memory(memory):
    """Persist the conversation history ``memory`` to ``MEMORY_FILE``."""
    save_pickle(path=MEMORY_FILE, obj=memory)

def load_entities():
    """Return the persisted entity mapping, or an empty dict if none exists."""
    stored_entities = load_pickle(ENTITY_FILE, default={})
    return stored_entities

def save_entities(entities):
    """Persist the entity mapping ``entities`` to ``ENTITY_FILE``."""
    save_pickle(path=ENTITY_FILE, obj=entities)


# ======================
# ---- State ----
# ======================
class State(dict):
    """Dict-based state schema handed to ``StateGraph`` and passed between nodes."""
    # Conversation history; the nodes in this notebook store HumanMessage /
    # AIMessage objects here and read their `.content`.
    messages: list
    # Entity mapping; the extract node writes label/name keys mapped to lists.
    entities: dict


# ======================
# ---- HuggingFace LLM ----
# ======================
# Hugging Face checkpoint used as the chat model for the whole notebook.
model_name = "unsloth/phi-3-mini-4k-instruct-bnb-4bit"

# The tokenizer is also used later by call_llm() to apply the chat template.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" lets the library decide device placement; weights are
# requested in float16.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

# Text-generation pipeline: sampling is enabled (do_sample=True) at
# temperature 0.7, and each call may generate up to 512 new tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True,
)

# LangChain wrapper so the pipeline can be used via `llm.invoke(...)` and
# handed to ConversationEntityMemory.
llm = HuggingFacePipeline(pipeline=pipe)


# ======================
# ---- Structured Entity Memory ----
# ======================
# In-process entity store; extract_entities() reads its `.store` attribute.
entity_store = InMemoryEntityStore()
# LLM-backed entity memory bound to the store above and to the `llm` wrapper.
# NOTE(review): both lines are echoed in this cell's warning output —
# presumably LangChain deprecation warnings; confirm before upgrading.
entity_memory = ConversationEntityMemory(llm=llm, entity_store=entity_store)


# ======================
# ---- GLiNER Entity Extractor ----
# ======================
# Pretrained GLiNER checkpoint used by extract_gliner_entities().
gliner_model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")

# Label set passed to GLiNER on every prediction: a mix of general labels
# (PERSON, GPE, ORG, DATE) and medical ones.
GLINER_LABELS = [
    "PERSON", "GPE", "ORG", "DATE", "MEDICAL",
    "CONDITION", "SYMPTOM", "MEDICATION", "PROCEDURE", "ANATOMY", "INJURY"
]

def extract_gliner_entities(text: str, threshold: float = 0.5):
    """Extract entities from ``text`` with GLiNER, grouped by label.

    Args:
        text: Free text to analyse.
        threshold: Value forwarded to ``predict_entities`` as its ``threshold``
            argument. Defaults to the previously hard-coded 0.5.

    Returns:
        A dict mapping each predicted label to a sorted list of the distinct
        entity strings found for it. Empty or whitespace-only text yields
        ``{}`` without calling the model.
    """
    if not text or not text.strip():
        # Nothing to analyse; skip the model call entirely.
        return {}

    preds = gliner_model.predict_entities(text, GLINER_LABELS, threshold=threshold)
    grouped = {}
    for ent in preds:
        # A set de-duplicates repeated mentions of the same surface string.
        grouped.setdefault(ent["label"], set()).add(ent["text"])
    return {label: sorted(values) for label, values in grouped.items()}


# ======================
# ---- Vector Memory (FAISS in-memory only) ----
# ======================
# Sentence-embedding model shared by the knowledge-base index and the
# conversation vector store.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Placeholders: add_message() creates all three lazily the first time it has
# a human message to index; until then call_llm() skips memory retrieval.
vectordb = None
retriever = None
vector_memory = None

# ======================
# ---- Knowledge Base ----
# ======================
# Static question -> answer table. The keys are embedded and searched
# semantically; the matching value is injected into the LLM prompt as context.
KNOWLEDGE_BASE = {
    # General Health FAQs
    "symptoms of flu": "Common flu symptoms include fever, cough, sore throat, body aches, headaches, and fatigue.",
    "symptoms of a common cold": "Common cold symptoms include a runny or stuffy nose, sore throat, sneezing, and a mild cough.",
    "what is dehydration": "Dehydration occurs when your body loses more fluid than you take in. Symptoms include thirst, dark yellow urine, dizziness, and fatigue.",
    "how to treat a minor burn": "For a minor burn, run cool water over the area for several minutes. Do not use ice. After cooling, you can apply lotion or an antibiotic ointment and cover it with a clean bandage.",

    # Healthy Lifestyle Tips
    "tips for better sleep": "To improve sleep, stick to a regular sleep schedule, create a restful environment, limit caffeine and large meals before bed, and get some exercise during the day.",
    "benefits of a balanced diet": "A balanced diet provides essential nutrients, helps maintain a healthy weight, supports your immune system, and reduces the risk of chronic diseases.",
    "how much water to drink": "Most adults should aim to drink about 8 glasses (around 2 liters) of water per day, but this can vary based on activity level and climate.",

    # Hospital Information
    "hospital working hours": "Our hospital is open 24/7 for emergency services. The outpatient department (OPD) is open from 9 AM to 5 PM, Monday to Saturday.",
    "how to book an appointment": "You can book an appointment by using the online portal on IITGn Medical Center's website (https://hcrs.iitgn.ac.in/slotbooking/).",
    "what is the hospital contact number": "For emergencies, please call +91 - 70 69 79 5000. For general inquiries, our landline number is 079-2395-1116.",
    "where is the hospital located": "We are located at Central Arcade First Floor."
}

# ======================
# ---- Searchable Knowledge Index ----
# ======================
from langchain_community.vectorstores import FAISS

# Build the semantic index only when there is something to index.
if not KNOWLEDGE_BASE:
    knowledge_base_index = None
    print("⚠️ Knowledge Base is empty. Skipping index creation.")
else:
    # Questions are the searchable texts; each answer rides along as metadata
    # at the same position so a hit can be mapped back to its answer.
    knowledge_base_texts = [question for question in KNOWLEDGE_BASE]
    knowledge_base_metadatas = [
        {"answer": KNOWLEDGE_BASE[question]} for question in knowledge_base_texts
    ]

    # Embed the questions with the shared embedder and index them in FAISS.
    knowledge_base_index = FAISS.from_texts(
        texts=knowledge_base_texts,
        embedding=embedder,
        metadatas=knowledge_base_metadatas,
    )
    print("✅ Knowledge Base Index created successfully.")


def find_relevant_knowledge(user_input: str, max_distance: float = 0.5) -> str:
    """Return the knowledge-base answer closest to ``user_input``, if close enough.

    Args:
        user_input: The user's question.
        max_distance: Upper bound (exclusive) on the score of the best hit.
            Defaults to the previously hard-coded 0.5.

    Returns:
        The stored answer of the best-matching question, or ``""`` when the
        query is blank, no index exists, nothing is found, or the best hit is
        not close enough.
    """
    if not user_input or not user_input.strip():
        return ""
    if knowledge_base_index is None:
        return ""

    # The score returned here is a distance: lower means more similar.
    # NOTE(review): per the LangChain FAISS docs this is an L2 distance and is
    # not bounded to [0, 1] — confirm the threshold against real queries.
    results = knowledge_base_index.similarity_search_with_score(user_input, k=1)
    if not results:
        return ""

    document, distance = results[0]
    if distance < max_distance:
        return document.metadata.get("answer", "")
    return ""

# ======================
# ---- Nodes ----
# ======================
def add_message(state: State, config: RunnableConfig):
    """Persist the conversation and entities, and index the latest user turn.

    Conversation history and entities are written to their pickle files; the
    vector store used for retrieval lives only in memory.

    Args:
        state: Graph state with ``messages`` and (optionally) ``entities``.
        config: Runnable config supplied by LangGraph (unused).

    Returns:
        A state update whose ``messages`` is the full persisted history and
        whose ``entities`` is passed through unchanged.
    """
    global vectordb, retriever, vector_memory

    messages = list(state.get("messages", []))
    entities = state.get("entities", {})

    persisted = load_memory()
    if messages[:len(persisted)] == persisted:
        # The caller already seeded the state with the stored history (as the
        # Gradio UI does). Re-appending it would double the history each turn.
        history = messages
    else:
        # The state only carries new messages; append them to what is stored.
        history = persisted + messages
    save_memory(history)
    save_entities(entities)

    # This node runs after the chat node, so the final message is the AI reply.
    # Search backwards for the most recent human message to index it.
    new_text = next(
        (m.content for m in reversed(messages) if isinstance(m, HumanMessage)),
        None,
    )

    if new_text:
        if vectordb is None:
            # First user turn: create the store and the memory wrapper around it.
            vectordb = FAISS.from_texts([new_text], embedding=embedder)
            retriever = vectordb.as_retriever(search_kwargs={"k": 3})
            vector_memory = VectorStoreRetrieverMemory(retriever=retriever)
        else:
            vectordb.add_texts([new_text])

    return {"messages": history, "entities": entities}

def extract_entities(state: State, config: RunnableConfig):
    """Hybrid entity extraction: GLiNER on the user turn plus the LLM entity store.

    Args:
        state: Graph state; ``messages`` is expected to end with the latest
            exchange (human message, optionally followed by the AI reply).
        config: Runnable config supplied by LangGraph (unused).

    Returns:
        ``state`` unchanged when there are no messages; otherwise a state update
        with the same ``messages`` and ``entities`` merged from the persisted
        entities, the LLM entity store and GLiNER. Each value is a sorted list.
    """
    messages = state.get("messages", [])
    if not messages:
        return state

    def _as_values(value):
        # The entity store maps names to a single summary string (or None),
        # while GLiNER and persisted entities map to lists. Normalise so that
        # set() never splits a string into characters or fails on None.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    # This node runs after the chat node, so the final message is normally the
    # AI reply; the user's text is the most recent HumanMessage before it.
    last_output = messages[-1].content if isinstance(messages[-1], AIMessage) else ""
    last_input = next(
        (m.content for m in reversed(messages) if isinstance(m, HumanMessage)),
        "",
    )

    # NOTE(review): per the LangChain docs, save_context() only refreshes the
    # entities cached by a prior load_memory_variables() call, which is never
    # made here — so entity_store.store may stay empty. Confirm whether the
    # LLM half of this hybrid is meant to be active.
    entity_memory.save_context({"input": last_input}, {"output": last_output})
    llm_entities = entity_store.store
    gliner_entities = extract_gliner_entities(last_input) if last_input else {}

    merged = load_entities()
    for source in (llm_entities, gliner_entities):
        for key, value in source.items():
            combined = set(_as_values(merged.get(key))) | set(_as_values(value))
            # Sorted so the prompt and the pickle are stable across runs.
            merged[key] = sorted(combined, key=str)
    return {"messages": messages, "entities": merged}


def call_llm(state: State, config: RunnableConfig):
    """Generate the assistant's reply for the current user turn.

    Gathers context (knowledge-base answer, retrieved conversation snippets,
    known entities), renders it into a system message, applies the tokenizer's
    chat template and invokes the HuggingFace LLM.

    Args:
        state: Graph state; reads ``messages``, ``entities`` and, when present,
            ``last_input``.
        config: Runnable config supplied by LangGraph (unused).

    Returns:
        A state update with the AI reply appended to ``messages`` and
        ``entities`` passed through.
    """
    history = state.get("messages", [])
    known_entities = state.get("entities", {})
    # Fall back to the newest message when the caller supplied no explicit input.
    fallback_input = history[-1].content if history else ""
    user_input = state.get("last_input", fallback_input)

    # Context source 1: closest knowledge-base answer ("" when nothing matches).
    kb_answer = find_relevant_knowledge(user_input)

    # Context source 2: earlier user turns, once the vector memory exists.
    if vector_memory:
        recalled = vector_memory.load_memory_variables({"prompt": user_input}).get("history", "")
    else:
        recalled = ""

    # Persona and instructions first, then the gathered context.
    system_message = "".join([
        "You are a helpful assistant for the IIT Gandhinagar Health Centre.\n",
        "Answer the user's question based on the following context. If the answer is not in the context, use your general knowledge but maintain the persona of a health center assistant.\n",
        "When a user provides personal details like age, use them to confirm or refine the general information if possible. For example, if general advice applies to their demographic, confirm that for them. Maintain a helpful tone.\n",
        "Do not give direct medical advice. Avoid simply repeating 'consult a doctor' unless the user is asking for a diagnosis or treatment for a serious issue.\n\n",
        "--- CONTEXT ---\n",
        f"Knowledge Base Info: {kb_answer}\n",
        f"Previous Conversation Snippets: {recalled}\n",
        f"Known Entities: {known_entities}\n",
        "--- END CONTEXT ---",
    ])

    chat = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_input},
    ]

    # Render the messages with the model's own chat template, as plain text.
    formatted_prompt = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    raw_output = llm.invoke(formatted_prompt)
    # Keep only what follows the last assistant marker, if one is present.
    assistant_reply = raw_output.rpartition("<|assistant|>")[2].strip()

    return {
        "messages": [*history, AIMessage(content=assistant_reply)],
        "entities": known_entities,
    }


# ======================
# ---- Graph ----
# ======================
# Linear three-step graph over the shared State: chat -> extract -> save_message.
workflow = StateGraph(State)
workflow.add_node("chat", call_llm)  # generate the assistant reply
workflow.add_node("extract", extract_entities)  # update the entity mapping
workflow.add_node("save_message", add_message)  # persist history and entities

# Execution starts at "chat" and ends after "save_message".
workflow.set_entry_point("chat")
workflow.add_edge("chat", "extract")
workflow.add_edge("extract", "save_message")
workflow.add_edge("save_message", END)

# Compiled runnable; callers use `app.invoke(state)`.
app = workflow.compile()


# ======================
# ---- Usage ----
# ======================
# user_input = """chiefComplaint Follow-up case of pain and swelling of the left wrist
# after he allegedly slipped and fell while washing for prayer on February 4, 2025,
# with a fracture of the distal radius.
# Other complaint: Pain in the left wrist after lifting a heavy weight at work on April 6, 2025.
# Follow-up case of pain and swelling of the left wrist after he allegedly slipped and fell."""
# state = {"messages": [HumanMessage(content=user_input)], "entities": load_entities()}
# output = app.invoke(state)

# print("AI:", output["messages"][-1].content)
# print("Entities Extracted:", output["entities"])

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m645.1/647.5 kB[0m [31m27.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for annoy (setup.py) ... [?25l[?25hdone


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)
  entity_store = InMemoryEntityStore()
  entity_memory = ConversationEntityMemory(llm=llm, entity_store=entity_store)


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/781M [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/781M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

  embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Knowledge Base Index created successfully.


In [8]:
# # ======================
# # ---- Interactive Conversation Loop ----
# # ======================
# state = {"messages": load_memory(), "entities": load_entities()}

# print("💬 Starting conversation with the agent (type 'exit' to quit)")

# while True:
#     user_input = input("You: ")
#     if user_input.lower() in ["exit", "quit"]:
#         break

#     # Append user message
#     state["messages"].append(HumanMessage(content=user_input))

#     # Run through graph
#     # ✅ Always pass the latest input explicitly for vector memory
#     state = app.invoke(
#         {**state, "last_input": user_input}  # added key for retrieval
#     )

#     # Get AI response
#     ai_response = state["messages"][-1].content
#     print("AI:", ai_response)
#     print("Entities Extracted:", state["entities"])


In [9]:
# ======================
# ---- Gradio UI ----
# ======================
import gradio as gr

# Load the persisted history and entities once, when this cell runs.
# chat_interface() keeps this module-level dict up to date after every turn.
initial_state = {"messages": load_memory(), "entities": load_entities()}

def chat_interface(user_input, history):
    """Gradio submit handler: run one chat turn through the graph.

    Args:
        user_input: Text from the textbox.
        history: Chatbot history as a list of ``(user_msg, ai_msg)`` tuples;
            ``None`` is treated as an empty history.

    Returns:
        A tuple ``("", updated_history)``: the empty string clears the textbox
        and the list is the new chatbot content. Blank submissions return the
        history unchanged without calling the model.
    """
    history = list(history or [])
    if not user_input or not user_input.strip():
        # Do not spend an LLM call (or pollute the stored memory) on empty input.
        return "", history

    # Build the turn's message list without mutating the shared state, so a
    # failed invoke() does not leave a dangling human message behind.
    pending_messages = initial_state["messages"] + [HumanMessage(content=user_input)]

    final_state = app.invoke({
        "messages": pending_messages,
        "entities": initial_state["entities"],
        "last_input": user_input,
    })

    # Commit to the persistent state only after the graph succeeded.
    initial_state["messages"] = final_state["messages"]
    initial_state["entities"] = final_state["entities"]

    ai_response = final_state["messages"][-1].content

    # The chatbot component is fed (user_msg, ai_msg) tuples.
    history.append((user_input, ai_response))
    return "", history


# Build the Gradio app: a chat window, a textbox, a clear button and a disclaimer.
with gr.Blocks() as demo:
    gr.Markdown("## 🏥 IITGn Healthcare Information Assistant")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Ask a question about health or hospital services")
    clear = gr.Button("Clear Conversation")

    # Static disclaimer shown below the controls.
    gr.Markdown(
        "Disclaimer: This is for educational purposes only and not a substitute for professional medical advice."
    )

    # Submitting the textbox runs one chat turn; the handler's outputs clear
    # the textbox and refresh the chat window.
    msg.submit(chat_interface, [msg, chatbot], [msg, chatbot])
    # Resets only the visible chat window; `initial_state` (and therefore the
    # conversation memory the model sees) is not touched by this handler.
    clear.click(lambda: None, None, chatbot, queue=False)

# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(debug=True)

  chatbot = gr.Chatbot()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2f89c1837a19632e23.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://2f89c1837a19632e23.gradio.live


