In [22]:
import os
import json
import time
import spacy
from jsonschema import validate
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core import Settings

In [2]:
GOOGLE_API_KEY = "AIzaSyCCEdgqiDAwkJO9c8UzxtVYK48-N0uW2qs"  # add your GOOGLE API key here
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [3]:
generation_config = {
  "candidate_count": 1,
  "max_output_tokens": 256,
  "temperature": 0.2
}

safety_settings=[
  {
    "category": "HARM_CATEGORY_DANGEROUS",
    "threshold": "BLOCK_NONE",
  },
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_NONE",
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_NONE",
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_NONE",
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_NONE",
  },
]

llm = Gemini(model_name="models/gemini-1.0-pro-latest", 
             generation_config=generation_config, safety_settings=safety_settings)

In [4]:
import pandas as pd
doc_df = pd.read_csv("./data/gpt-4.csv")
doc_df.head()

Unnamed: 0,data,conversation
0,This 60-year-old male was hospitalized due to ...,"Doctor: Good morning, how are you feeling toda..."
1,A 39-year-old man was hospitalized due to an i...,"Doctor: Hello, how are you feeling today?\nPat..."
2,One week after a positive COVID-19 result this...,"Doctor: Hello, how are you feeling today?\nPat..."
3,This 69-year-old male was admitted to the ICU ...,"Doctor: Hello, how are you feeling today?\nPat..."
4,This 57-year-old male was admitted to the ICU ...,"Doctor: Good morning, how are you feeling toda..."


In [8]:
from llama_index.core.prompts import PromptTemplate
prompt = PromptTemplate("""Given the following conversation summary between a doctor and a patient, extract the relevant information and format it as a JSON object with the specified keys. If any information is not present in the conversation, set the corresponding value to null. Keep the extracted phrases concise, using less than 3 words each. Avoid repeating information.

JSON keys:
- symptoms: list of strings
- diseases: list of strings
- medical_tests_prescribed: list of strings
- precautions_advised: list of strings
- medicines_prescribed: list of strings

Conversation summary: {topic}""")

In [18]:
schema = {
    "type": "object",
    "properties": {
        "symptoms": {
            "type": "array",
            "items": {"type": "string"}
        },
        "diseases": {
            "type": "array",
            "items": {"type": "string"}
        },
        "medical_tests_prescribed": {
            "type": "array",
            "items": {"type": "string"}
        },
        "precautions_advised": {
            "type": "array",
            "items": {"type": "string"}
        },
        "medicines_prescribed": {
            "type": "array",
            "items": {"type": "string"}
        }
    }
}

In [25]:
def preprocess_text(text):
    # Remove the leading and trailing backticks and "json" string
    string = text.strip('`').replace('json', '')

    # Unescape the newline characters
    string = string.replace('\\n', '\n')

    # Parse the cleaned string to JSON
    parsed_json = None
    try:
        parsed_json = json.loads(string)
        validate(instance=parsed_json, schema=schema)
    except Exception:
        print("Invalid JSON object")
        parsed_json = None
    return parsed_json

In [21]:
responses = []

In [24]:
len(responses)

67

In [26]:
# Iterate over the rows of the dataframe
for index, row in doc_df.head(1000)[100:].iterrows():
    # Get the document text
    text = row["data"]
    # Generate the response
    response = None
    try:
        response = llm.predict(prompt, topic=text)
    except Exception as e:
        print("Something went wrong", e)
        time.sleep(2)
        continue
    
    # Preprocess the response
    parsed_json = preprocess_text(response)
    if parsed_json is not None:
        responses.append(parsed_json)

Invalid JSON object
Something went wrong block_reason: OTHER
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Something went wrong Response was terminated early: MAX_TOKENS
Invalid JSON object
Something went wrong Response was ter

In [27]:
len(responses)

940

In [35]:
responses[780]

{'symptoms': ['gross hematuria', 'dysuria'],
 'diseases': ['avascular necrosis',
  'panniculitis',
  'diffuse systemic sclerosis',
  'pulmonary fibrosis'],
 'medical_tests_prescribed': ['urinalysis',
  'urine culture',
  'ultrasonography of the kidney, ureter, and bladder system',
  'cystoscopy',
  'hematoxylin and eosin stain',
  'stain of von Kossa'],
 'precautions_advised': [],
 'medicines_prescribed': ['methotrexate',
  'prednisolone',
  'cyclophosphamide',
  'ciprofloxacin',
  'trimethoprimâ€“sulfamethoxazole']}

In [37]:
# Save the responses to a JSON file
with open("gpt-4-responses.json", "w") as f:
    json.dump(responses, f, indent=4)