In [2]:
import os
from getpass import getpass

# Prefer a key that is already exported in the environment so the notebook can be
# re-run unattended ("Restart Kernel -> Run All"); otherwise fall back to a hidden,
# clearly labelled interactive prompt. The key is never written into the notebook.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")

 ········


In [3]:
%pip install langchain-community tqdm langchain-google-genai ipywidgets langchain-text-splitters lxml

Collecting langchain-community
  Using cached langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)
Collecting ipywidgets
  Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-text-splitters
  Using cached langchain_text_splitters-0.3.6-py3-none-any.whl.metadata (1.9 kB)
Collecting lxml
  Using cached lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting langchain-core<1.0.0,>=0.3.37 (from langchain-community)
  Downloading langchain_core-0.3.40-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.19 (from langchain-community)
  Using cached langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain-community)
  Using cached SQLAlchemy-2.0.38-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from 

In [4]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding client for the "models/text-embedding-004" model, authenticated with the
# API key collected at the top of the notebook.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=GOOGLE_API_KEY,
    model="models/text-embedding-004",
)

In [7]:
from pydantic import BaseModel, ValidationError, Field
from typing import List, Dict, Any, Set

class Problem(BaseModel):
    """One complaint/symptom reported by the patient.

    All three fields are required strings (none declares a default).
    """
    # Symptom name (the prompt in this notebook fills it with e.g. "fever").
    name: str
    # How long the symptom has lasted, as free text.
    duration: str
    # Free-text description of the symptom.
    description: str


class PatientSchema(BaseModel):
    """Patient intake record that the extraction flow is trying to complete.

    Every field is required except ``conditions``, which defaults to an empty list.
    A field declared through ``Field`` without a default is required, so the explicit
    ``...`` sentinel is not needed.
    """

    # --- Personal details ---
    name: str = Field(description="Patient's name")
    # NOTE(review): the prompt examples in this notebook emit "" for an unknown age;
    # that would fail integer validation rather than be reported as a "missing"
    # field. Confirm the intended handling.
    age: int = Field(description="Patient's age")
    gender: str = Field(description="Patient's gender")
    address: str = Field(description="Patient's address")
    identity: str = Field(description="Patient's identity document")
    phone: str = Field(description="Patient's phone number")

    # --- Medical details ---
    problems: List[Problem] = Field(description="List of patient's problems")
    conditions: List[Any] = Field(description="List of patient's conditions", default=[])

    # --- Model-written summary fields ---
    description: str = Field(description="AI-generated description")
    recommended_doctor: str = Field(description="AI-generated doctor recommendation")

In [11]:
def get_missing_fields(data: Dict[str, Any], model = PatientSchema) -> Set[str]:
    """Return the top-level fields of ``data`` that ``model`` reports as missing.

    Args:
        data: Candidate record to validate.
        model: Class exposing ``model_validate``; defaults to ``PatientSchema``.

    Returns:
        An empty set when validation succeeds. Otherwise the first ``loc`` element of
        every validation error whose type is ``"missing"``. A field missing inside a
        nested object is therefore reported under its top-level parent, and errors of
        any other type (e.g. a wrong value type) are ignored.
    """
    try:
        model.model_validate(data)
    except ValidationError as exc:
        return {
            issue["loc"][0]
            for issue in exc.errors()
            if issue["type"] == "missing"
        }
    return set()

In [12]:
# A deliberately partial patient record (only three keys) used to demonstrate
# get_missing_fields. The result is a set, so the printed order is not stable.
INCOMPLETE_JSON_SCHEMA = {"name": "Rani Sharma", "gender": "Female", "address": "Udaipur"}

missing = get_missing_fields(INCOMPLETE_JSON_SCHEMA)
print(f"Missing fields: {missing}")

Missing fields: {'description', 'identity', 'recommended_doctor', 'age', 'phone', 'problems'}


In [44]:
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

# Few-shot examples for the extraction prompt.
#
# Example values are substituted verbatim into the `{input}` / `{output}` slots of the
# example template; they are not parsed as templates themselves. The JSON below
# therefore uses single braces (doubled braces would reach the model literally) and
# must be strictly valid JSON, with no trailing commas, because the model imitates
# whatever it is shown.
examples = [
    {
        "input": "Hello, my name is Ishan. I am living here in Hyderabad and I am suffering from fever, headache, and cold. I also have diabetes. This is missing the age field.",
        "output": """
{
    "think": "First, I will extract the information from the initial sentence.",
    "info": {
      "name": "Ishan",
      "age": "",
      "gender": "Male",
      "address": "Hyderabad",
      "identity": "",
      "phone": "",
      "problems": [
        {
          "name": "fever",
          "duration": "",
          "description": "High fever"
        },
        {
          "name": "headache",
          "duration": "",
          "description": "Headache localized in the forehead"
        },
        {
          "name": "cold",
          "duration": "",
          "description": ""
        }
      ],
      "conditions": ["diabetes"],
      "description": "[AI GENERATED] A male patient presenting with fever, headache, and cold symptoms, with a known diagnosis of diabetes.",
      "recommended_doctor": "[AI GENERATED] General Physician or Infectious Disease Specialist"
    },
    "next_question": "What is your age?"
}
""",
        # Presumably the field that `next_question` targets; it is not referenced by
        # the example template shown in this notebook.
        "field": "age",
    }
]

In [71]:
# Template for a single worked example: the human turn carries the example input and
# the AI turn carries the expected structured output.
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "This is one example: {input}"),
        ("ai", "{output}"),
    ]
)

# Few-shot wrapper built from `examples` and the per-example template above.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

In [82]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Main extraction prompt.
#
# Template variables:
#   text - the user's latest message
#   form - JSON string with the fields collected so far ("{}" on the first turn)
#
# Literal braces in the system text are doubled because this string is parsed as a
# template. The worked example is kept strictly valid JSON (no trailing commas) and
# uses only the keys defined in rule 6, since the model copies the shape it is shown.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
"""You are an information extraction model for electronic medical records. 
Your task is to extract relevant medical and personal information from a given sentence and then interactively ask questions to fill in any missing fields. 
Always follow these rules:
1. First, extract as much information as possible from the initial sentence.
2. Identify missing fields and ask the user questions to fill them in, one at a time.
3. Ask questions in a polite and helpful manner.
4. Continue asking questions until all fields are filled or the user declines to answer.
5. Do not assume or hallucinate information not present in the sentence or user responses.
6. Format the output as follows:

{{
    "think": "[Your thought process and explanations]",
    "info": {{
      "name": "extracted name",
      "age": "extracted age",
      "gender": "extracted or inferred gender",
      "address": "extracted address",
      "identity": "extracted identity",
      "phone": "extracted phone number",
      "problems": [
        {{
          "name": "symptom name",
          "duration": "duration of symptom",
          "description": "description regarding the symptom"
        }}
      ],
      "conditions": ["list of pre-existing conditions"],
      "description": "[AI GENERATED] Medically sounding description",
      "recommended_doctor": "[AI GENERATED] Suggested medical specialty"
    }},
    "next_question": "[Your next question]"
}}

Example interaction:

Initial input: "Hello, my name is Ishan. I am living here in Hyderabad and I am suffering from fever, headache, and cold. I also have diabetes."

{{
    "think": "First, I will extract the information from the initial sentence. The following fields are missing: age, phone, symptom description, and identity.",
    "info": {{
      "name": "Ishan",
      "age": "",
      "gender": "Male",
      "address": "Hyderabad",
      "identity": "",
      "phone": "",
      "problems": [
        {{
          "name": "fever",
          "duration": "",
          "description": "High fever"
        }},
        {{
          "name": "headache",
          "duration": "",
          "description": "Headache localized in the forehead"
        }},
        {{
          "name": "cold",
          "duration": "",
          "description": ""
        }}
      ],
      "conditions": ["diabetes"],
      "description": "[AI GENERATED] A male patient presenting with fever, headache, and cold symptoms, with a known diagnosis of diabetes.",
      "recommended_doctor": "[AI GENERATED] General Physician or Infectious Disease Specialist"
    }},
    "next_question": "Please share your age"
}}

Now process the user's message. It is followed by the current form: a JSON object (possibly empty) with the fields collected so far. Update that form with any new information instead of starting over.
"""
        ),
        ("human", "{text} \n This is the current form: {form}"),
    ]
)

In [83]:
import json

# Render the prompt once with a sample message and an empty form ("{}") to inspect
# exactly what will be sent to the model.
# NOTE(review): this rebinds `example_prompt`, which an earlier cell uses for the
# few-shot example template; consider a distinct name if both are needed later.
example_prompt = prompt.invoke(
    {
        "text": "hey! I am Ishan! A student from Hyderabad",
        "form": json.dumps({}),
    }
)

for rendered in example_prompt.messages:
    print(f"{rendered.type}: {rendered}")

system: content='You are an information extraction model for electronic medical records. \nYour task is to extract relevant medical and personal information from a given sentence and then interactively ask questions to fill in any missing fields. \nAlways follow these rules:\n1. First, extract as much information as possible from the initial sentence.\n2. Identify missing fields and ask the user questions to fill them in, one at a time.\n3. Ask questions in a polite and helpful manner.\n4. Continue asking questions until all fields are filled or the user declines to answer.\n5. Do not assume or hallucinate information not present in the sentence or user responses.\n6. Format the output as follows:\n\n{\n    "think": "[Your thought process and explanations]",\n    "info": {\n      "name": "extracted name",\n      "age": "extracted age",\n      "gender": "extracted or inferred gender",\n      "address": "extracted address",\n      "identity": "extracted identity",\n      "phone": "extrac

In [31]:
from langchain_core.globals import set_llm_cache
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.caches import InMemoryCache

# Register an in-memory cache as the global LLM cache.
set_llm_cache(InMemoryCache())

# Text-completion client for "gemini-2.0-flash", authenticated with the notebook's key.
llm = GoogleGenerativeAI(
    google_api_key=GOOGLE_API_KEY,
    model="gemini-2.0-flash",
)

In [84]:
# First turn: a greeting with no medical details and an empty form.
first_turn_prompt = prompt.invoke(
    {
        "text": "hey! I am Ishan! A student from Hyderabad",
        "form": json.dumps({}),
    }
)
print(llm.invoke(first_turn_prompt))

```json
{
    "think": "Okay, I will extract the available information from the input sentence and then formulate a question to gather more details.",
    "info": {
      "name": "Ishan",
      "age": "",
      "gender": "Male",
      "address": "Hyderabad",
      "identity": "student",
      "phone": "",
      "problems": [],
      "conditions": [],
      "description": "[AI GENERATED] A male student from Hyderabad.",
      "recommended_doctor": "[AI GENERATED] General Practitioner"
    },
    "next_question": "Hi Ishan, nice to meet you! How old are you?"
}
```


In [87]:
# Hand-written stand-in for the model's reply after the age and phone questions have
# been answered. The whole reply (including "think" and "next_question") is serialised
# and passed back as the current form.
x = {
    "think": "I will extract the information from the user's response and update the JSON with the provided details.",
    "info": {
        "name": "Ishan",
        "age": "22",
        "gender": "Male",
        "address": "Hyderabad",
        "identity": "student",
        "phone": "9014678452",
        "problems": [],
        "conditions": [],
        "description": "[AI GENERATED] A 22-year-old male student from Hyderabad.",
        "recommended_doctor": "[AI GENERATED] General Practitioner",
    },
    "next_question": "Could you please describe the problems you are facing?",
}
curr = json.dumps(x)

# Follow-up turn: the user now describes symptoms; the form carries the earlier state.
followup_prompt = prompt.invoke(
    {
        "text": "I am facing headache and stomach pain since eating biryani yesterday!",
        "form": curr,
    }
)
print(llm.invoke(followup_prompt))

Okay, I understand. I will continue the interaction based on the provided context.

**Current state:**

```json
{
    "think": "I will extract the information from the user's response and update the JSON with the provided details.",
    "info": {
        "name": "Ishan",
        "age": "22",
        "gender": "Male",
        "address": "Hyderabad",
        "identity": "student",
        "phone": "9014678452",
        "problems": [],
        "conditions": [],
        "description": "[AI GENERATED] A 22-year-old male student from Hyderabad.",
        "recommended_doctor": "[AI GENERATED] General Practitioner"
    },
    "next_question": "Could you please describe the problems you are facing?"
}
```

**User Input:** I am facing headache and stomach pain since eating biryani yesterday!

```json
{
    "think": "I will extract the information from the user's response and update the JSON with the provided details.",
    "info": {
        "name": "Ishan",
        "age": "22",
        "gender":

In [30]:
from typing_extensions import TypedDict


class State(TypedDict):
    """Typed dictionary with a ``question`` string and an ``answer`` string.

    NOTE(review): nothing in the visible part of the notebook uses this type yet;
    presumably it is the state passed between steps of a question/answer flow.
    Confirm against the cells that follow.
    """
    question: str
    answer: str