In [1]:
# Define project information
# PROJECT_ID = "graphical-fort-306505" # @param {type:"string"}
# LOCATION = "us-central1" # @param {type:"string"}
PROJECT_ID = "viki-dev-app-wsky" # @param {type:"string"}
LOCATION = "us-east4" # @param {type:"string"}

# Initialize Vertex AI
import vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)

2023-11-13 17:12:42.798919: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Utils
import time
from typing import List

# Langchain
import langchain
from pydantic import BaseModel

print(f"LangChain version: {langchain.__version__}")

# Vertex AI
from google.cloud import aiplatform
from langchain.chat_models import ChatVertexAI
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.schema import HumanMessage, SystemMessage

print(f"Vertex AI SDK version: {aiplatform.__version__}")

LangChain version: 0.0.229
Vertex AI SDK version: 1.33.1


In [3]:
# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    period = 60 / max_per_minute
    print("Waiting")
    while True:
        before = time.time()
        yield
        after = time.time()
        elapsed = after - before
        sleep_time = max(0, period - elapsed)
        if sleep_time > 0:
            print(".", end="")
            time.sleep(sleep_time)


class CustomVertexAIEmbeddings(VertexAIEmbeddings, BaseModel):
    requests_per_minute: int
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        limiter = rate_limit(self.requests_per_minute)
        results = []
        docs = list(texts)

        while docs:
            # Working in batches because the API accepts maximum 5
            # documents per request to get embeddings
            head, docs = (
                docs[: self.num_instances_per_batch],
                docs[self.num_instances_per_batch :],
            )
            chunk = self.client.get_embeddings(head)
            results.extend(chunk)
            next(limiter)

        return [r.values for r in results]

In [4]:
# LLM model
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=1024,
    temperature=0.0,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

# Chat
chat = ChatVertexAI()

# Embedding
EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
)

##Extracting text from pdf using cloud vision

In [5]:
import os
from google.cloud import vision
from google.cloud.vision_v1 import types

# adunai: we don't need this since we mount ~/.config/gcloud into container.
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/graphical-fort-306505-a3c9bca468bb.json"

# We need to update the gcp credential above with our own project API key for the Cloud vision.

In [6]:
transcript="""
"What is the your name?"

Patient - John Wick

"What is your date of birth?"

Patient - aug 10, 1955

 He takes Hydrocodone 10 mg orally every four hours  and also meperidine HCL 75 mg intramuscular every three hours

“Are you of Hispanic, Latino/a, or Spanish origin?” 

Patient – no  

“What is your race?” 

Patient- Asian

“What is your preferred language?” 

Patient- English 

“Do you need or want an interpreter to communicate with a doctor or health care staff?” 


Patient- no 

“Has lack of transportation kept you from medical appointments, meetings, work, or from getting things needed for daily living?” 

Patient- my son gets me everywhere I need to be 

Clinician- (B1300)  How often do you need to have someone help you when you read instructions, pamphlets, or other written material from your doctor or pharmacy? 

Patient response- Oh sometimes I need my son to help me understand the paperwork from the doctor.  

“BIMS” 

Clinician- (C0200)  

Repetition of Three Words Ask patient "I am going to say three words for you to remember. Please repeat the words after I have said all three. The words are: sock, blue, and bed. Now tell me the three words." 

Patient- sock, blue, bed 

(C0300) Temporal Orientation (Orientation to year, month, and day): 

 
"Please tell me what year it is right now" 

 
Patient – 2022 

"What month are we in right now?" 

Patient- September 

What day of the week is today?" 

Patient- Saturday 

(C0400) Recall: 

Clinician "Let's go back to an earlier question. What were those three words that I asked you to repeat?" If unable to remember a word, give cue (something to wear, a color, a piece of furniture) for that word. 


Patient- sock, blue, bed 

Upon completion of the BIMS- Clinician CAM assessment  

The patient has no evidence of acute change in mental status.  She is able to pay attention and think clearly.  The patient did not display any altered level of consciousness. 

Over the last 2 weeks, have you been bothered by any of the following problems?" 

“Little interest or pleasure in doing things” 

Patient- yes 

"About how often have you been bothered by this?” 

Patient- probably most every day 

“What about, Feeling down, depressed, or hopeless” 

Patient- MMM Sometimes 

"About how often have you been bothered by this?” 

Patient- probably just a couple days 

“ok, what about Trouble falling or staying asleep, or sleeping too much” 

Patient – no I sleep fine 

“Feeling tired or having little energy” 

Patient- oh yes for sure 

"About how often have you been bothered by this? 


Patient- I’m tired every day 

“Poor appetite or overeating”	 

Patient- nope I eat fine, can’t you tell, hahaha  

“Feeling bad about yourself - or that you are a failure or have let yourself or your family down” 

Patient- no  

“Trouble concentrating on things, such as reading the newspaper or watching television” 

Patient- no 

“Moving or speaking so slowly that other people could have noticed. Or the opposite - being so fidgety or restless that you have been moving around a lot more than usual” 

Patient- no 

“Thoughts that you would be better off dead, or of hurting yourself in some way” 

Patient- definitely no 
 
 (D0700) – “How often do you feel lonely or isolated from those around you?”  

Patient- Sometimes, I wish I got to see my family more often 
"""

In [7]:
transcript

'\n"What is the your name?"\n\nPatient - John Wick\n\n"What is your date of birth?"\n\nPatient - aug 10, 1955\n\n He takes Hydrocodone 10 mg orally every four hours  and also meperidine HCL 75 mg intramuscular every three hours\n\n“Are you of Hispanic, Latino/a, or Spanish origin?” \n\nPatient – no  \n\n“What is your race?” \n\nPatient- Asian\n\n“What is your preferred language?” \n\nPatient- English \n\n“Do you need or want an interpreter to communicate with a doctor or health care staff?” \n\n\nPatient- no \n\n“Has lack of transportation kept you from medical appointments, meetings, work, or from getting things needed for daily living?” \n\nPatient- my son gets me everywhere I need to be \n\nClinician- (B1300) \u202fHow often do you need to have someone help you when you read instructions, pamphlets, or other written material from your doctor or pharmacy? \n\nPatient response- Oh sometimes I need my son to help me understand the paperwork from the doctor.  \n\n“BIMS” \n\nClinician- (

In [14]:
# split the documents into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=0)
docs = text_splitter.split_text(transcript)
print(f"# of documents = {len(docs)}")

# of documents = 31


In [15]:
embeddings
docs

['"What is the your name?"\n\nPatient - John Wick\n\n"What is your date of birth?"\n\nPatient - aug 10, 1955',
 'He takes Hydrocodone 10 mg orally every four hours  and also meperidine HCL 75 mg intramuscular every three hours',
 '“Are you of Hispanic, Latino/a, or Spanish origin?” \n\nPatient – no  \n\n“What is your race?” \n\nPatient- Asian\n\n“What is your preferred language?”',
 'Patient- English \n\n“Do you need or want an interpreter to communicate with a doctor or health care staff?” \n\n\nPatient- no',
 '“Has lack of transportation kept you from medical appointments, meetings, work, or from getting things needed for daily living?”',
 'Patient- my son gets me everywhere I need to be',
 'Clinician- (B1300) \u202fHow often do you need to have someone help you when you read instructions, pamphlets, or other written material from your doctor',
 'or pharmacy?',
 'Patient response- Oh sometimes I need my son to help me understand the paperwork from the doctor.  \n\n“BIMS” \n\nClinicia

In [16]:
# Store docs in local vectorstore as index
# it may take a while since API is rate limited
from langchain.vectorstores import Chroma

db = Chroma.from_texts(docs, embeddings)

from langchain.vectorstores import MatchingEngine

db = 

# Sometimes we might get error for the panda version. We can ignore and try to re-run this command multiple times.

Waiting
......

In [17]:
# Expose index to the retriever
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

In [19]:
print(retriever.get_relevant_documents("Feeling down, depressed, or hopeless"))

[Document(page_content='“Feeling bad about yourself - or that you are a failure or have let yourself or your family down” \n\nPatient- no', metadata={}), Document(page_content='"About how often have you been bothered by this?” \n\nPatient- probably most every day \n\n“What about, Feeling down, depressed, or hopeless”', metadata={})]


In [12]:

# Create chain to answer questions
from langchain.chains import RetrievalQA

# Uses LLM to synthesize results from the search index.
# We use Vertex PaLM Text API for LLM
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
)

In [13]:
import os
import json

from google.cloud import aiplatform
import tensorflow_hub as hub
import tensorflow_text

In [14]:
#module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
#model = hub.load(module_url)
# Create a folder for the TF hub module.
#!mkdir /tmp/moduleA
# Download the module, and uncompress it to the destination folder. You might want to do this manually.
#!curl -L "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed" | tar -zxvC /tmp/moduleA
# Test to make sure it works.
#import tensorflow_hub as hub
#hub.Module("/tmp/moduleA")
from sentence_transformers import SentenceTransformer, util, InputExample, losses
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
#Load the model(here we use minilm)
model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_embeddings_sentence = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
emb2 = sentence_embeddings_sentence.embed_documents(["This is not a test document."])
emb1 = model.encode("hello")

In [15]:
initial_config = {
    "id": "banana_id",
    "embedding": emb2,
}

In [16]:
initial_config

{'id': 'banana_id',
 'embedding': [-0.06277170777320862,
  0.05495879426598549,
  0.05216483399271965,
  0.08579001575708389,
  -0.0827488899230957,
  -0.07457300275564194,
  0.06855475157499313,
  0.018396364524960518,
  -0.08201134204864502,
  -0.037384819239377975,
  0.012124882079660892,
  0.003518301760777831,
  -0.004134269431233406,
  -0.04378444701433182,
  0.021807271987199783,
  -0.005102747585624456,
  0.019546594470739365,
  -0.04234872758388519,
  -0.11035966873168945,
  0.0054245395585894585,
  -0.05573470890522003,
  0.02805245853960514,
  -0.023158662021160126,
  0.02848135121166706,
  -0.05370962247252464,
  -0.052601560950279236,
  0.033939260989427567,
  0.045388638973236084,
  0.023718398064374924,
  -0.0731208324432373,
  0.054777760058641434,
  0.01704731211066246,
  0.0813603326678276,
  -0.0028627156279981136,
  0.011958093382418156,
  0.07355857640504837,
  -0.09423740208148956,
  -0.0813620388507843,
  0.04001539200544357,
  0.0006921731401234865,
  -0.0133933

In [10]:
import json
with open("data.json", "w") as f:
    json.dump(initial_config, f)
!gsutil cp "data.json" "gs://cloud-ai-platform-1d61b8cd-e7f6-4bb1-b676-5c1b800a4d4b/matchingengine/"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Copying file://data.json [Content-Type=application/json]...
/ [1 files][  8.3 KiB/  8.3 KiB]                                                
Operation completed over 1 objects/8.3 KiB.                                      


In [19]:
#my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
#    display_name="Clinical Saver Matching Index",
#    contents_delta_uri="gs://cloud-ai-platform-1d61b8cd-e7f6-4bb1-b676-5c1b800a4d4b/matchingengine/",
#    dimensions=384,
#    approximate_neighbors_count=150,
#    distance_measure_type="DOT_PRODUCT_DISTANCE",
#)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/145042810266/locations/us-central1/indexes/3695348629792358400/operations/8390331931153137664


KeyboardInterrupt: 

In [18]:
#my_index_endpoint = my_index_endpoint.deploy_index(
#    index=my_index, deployed_index_id="4000467504546709504"
#)

#my_index_endpoint.deployed_indexes

NameError: name 'my_index_endpoint' is not defined

In [15]:

from langchain.vectorstores import MatchingEngine
vector_store = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region="us-central1",
    gcs_bucket_name="cloud-ai-platform-1d61b8cd-e7f6-4bb1-b676-5c1b800a4d4b",
    index_id="3070051969029636096",
    endpoint_id="4000467504546709504",
    embedding=sentence_embeddings_sentence
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [33]:
vector_store.add_texts(texts=docs)



Updating MatchingEngineIndex index: projects/145042810266/locations/us-central1/indexes/3070051969029636096
Update MatchingEngineIndex index backing LRO: projects/145042810266/locations/us-central1/indexes/3070051969029636096/operations/4576486734212104192
MatchingEngineIndex index Updated. Resource name: projects/145042810266/locations/us-central1/indexes/3070051969029636096


['061510e6-cde7-40bf-91e1-209722727614',
 'ad1cc54b-1bcd-48a0-8f3a-d632a727ce84',
 'f543ff34-7b0e-4864-adaf-5794a343347e']

In [16]:
result = vector_store.similarity_search("John Wick", k=2)

_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "DNS resolution failed for :10000: unparseable host:port"
	debug_error_string = "UNKNOWN:DNS resolution failed for :10000: unparseable host:port {created_time:"2023-11-03T10:03:54.036686-05:00", grpc_status:14}"
>

In [22]:
query = """
Give original extract of Medication List
"""
result = qa({"query": query})
print(result)
print(result.get("result"))
result = result.get("result")

{'query': '\nGive original extract of Medication List\n', 'result': 'The patient takes Hydrocodone 10 mg orally every four hours and also meperidine HCL 75 mg intramuscular every three hours.', 'source_documents': [Document(page_content='"What is the your name?"\n\nPatient - John Wick\n\n"What is your date of birth?"\n\nPatient - aug 10, 1955\n\n He takes Hydrocodone 10 mg orally every four hours  and also meperidine HCL 75 mg intramuscular every three hours\n\n“Are you of Hispanic, Latino/a, or Spanish origin?” \n\nPatient – no  \n\n“What is your race?” \n\nPatient- Asian\n\n“What is your preferred language?” \n\nPatient- English \n\n“Do you need or want an interpreter to communicate with a doctor or health care staff?” \n\n\nPatient- no \n\n“Has lack of transportation kept you from medical appointments, meetings, work, or from getting things needed for daily living?” \n\nPatient- my son gets me everywhere I need to be \n\nClinician- (B1300) \u202fHow often do you need to have someone

In [53]:
json_schema={
"patient_name": "",
"date_of_birth": "",
"race": "",
"patient_allergies": "",
"section_2225": [
{
"input_medication_name": "",
"input_consumption_type": "",
"input_dosage": "",
"input_frequency": "",
"boolean_checkbox_1643": ""
},
{
"input_medication_name": "",
"input_consumption_type": "",
"input_dosage": "",
"input_frequency": "",
"boolean_checkbox_1643": ""
}
],
"section_patient_mood": {
"patient_mood_sleep_label": "",
"patient_mood_sleep_score": "",
"patient_mood_sleep_frequency": "",
"patient_mood_interest_label": "",
"patient_mood_interest_symptom_presence": "",
"patient_mood_interest_symptom_frequency": "",
"depressed_feeling_label": "",
"patient_mood_depression_score": "",
"patient_mood_depression_frequency": "",
"patient_mood_tired_label": "",
"patient_mood_tired_symptom_frequency": "",
"patient_mood_tired_symptom_presence": "",
"header_symptom_presence": "",
"header_symptom_frequency": "",
"readonly_1698259191": ""
}
}

In [60]:
original_text_json_schema={
"patient_name_source_text": "",
"date_of_birth_source_text": "",
"race_source_text": "",
"patient_allergies_source_text": "",
"section_2225": [
{
"input_medication_name": "",
"input_consumption_type": "",
"input_dosage": "",
"input_frequency": "",
"boolean_checkbox_1643": ""
},
{
"input_medication_name": "",
"input_consumption_type": "",
"input_dosage": "",
"input_frequency": "",
"boolean_checkbox_1643": ""
}
],
"section_patient_mood": {
"patient_mood_sleep_label": "",
"patient_mood_sleep_score_source_text": "",
"patient_mood_sleep_frequency_source_text": "",
"patient_mood_interest_label": "",
"patient_mood_interest_symptom_presence_source_text": "",
"patient_mood_interest_symptom_frequency_source_text": "",
"depressed_feeling_label": "",
"patient_mood_depression_score": "",
"patient_mood_depression_frequency": "",
"patient_mood_tired_label": "",
"patient_mood_tired_symptom_frequency": "",
"patient_mood_tired_symptom_presence": "",
"header_symptom_presence": "",
"header_symptom_frequency": "",
"readonly_1698259191": ""
}
}

In [61]:
query = """
You are medical transcriber.Your job is extract relevant content and return a json response. Use the JSON template below. Please make sure new key's are not added
    inthe json schema. if its array datatype, you can create more than one items in the array. Also any date value to 
    use yyyy-mm-dd format. 
    {
"patient_name": "",
"date_of_birth": "",
"race": "",
"patient_allergies": null,
"section_2225": [
{
"input_medication_name": "",
"input_consumption_type": "",
"input_dosage": "",
"input_frequency": "",
"boolean_checkbox_1643": null
},
{
"input_medication_name": "",
"input_consumption_type": "",
"input_dosage": "",
"input_frequency": "",
"boolean_checkbox_1643": null
}
],
"section_patient_mood": {
"patient_mood_sleep_label": "",
"patient_mood_sleep_score": "",
"patient_mood_sleep_frequency": null,
"patient_mood_interest_label": "",
"patient_mood_interest_symptom_presence": "",
"patient_mood_interest_symptom_frequency": "",
"depressed_feeling_label": "",
"patient_mood_depression_score": "",
"patient_mood_depression_frequency": "",
"patient_mood_tired_label": "",
"patient_mood_tired_symptom_frequency": "",
"patient_mood_tired_symptom_presence": "",
"header_symptom_presence": null,
"header_symptom_frequency": null,
"readonly_1698259191": null
}
}
    use below mechanism to calculate symptom presence:
    0. No (enter 0 for sympton frequency)
    1. Yes (enter 0-3 for symptom frequency)
    9. No Response
    use below instructions to calculate symptom frequency:
    0. Never or 1 day
    1. 2-6 days (several days)
    2. 7-11 days (half or more of the days)
    3. 12-14 days (nearly every day)
"""
result = qa({"query": query})
print(result)
print(result.get("result"))
result = result.get("result")

{
"patient_name": "John Wick",
"date_of_birth": "1955-08-10",
"race": "Asian",
"patient_allergies": null,
"section_2225": [
{
"input_medication_name": "Hydrocodone 10 mg",
"input_consumption_type": "Oral",
"input_dosage": "10 mg",
"input_frequency": "Every 4 hours",
"boolean_checkbox_1643": null
},
{
"input_medication_name": "Meperidine HCL 75 mg",
"input_consumption_type": "Intramuscular",
"input_dosage": "75 mg",
"input_frequency": "Every 3 hours",
"boolean_checkbox_1643": null
}
],
"section_patient_mood": {
"patient_mood_sleep_label": "Trouble falling or staying asleep, or sleeping too much",
"patient_mood_sleep_score": 0,
"patient_mood_sleep_frequency": 0,
"patient_mood_interest_label": "Little interest or pleasure in doing things",
"patient_mood_interest_symptom_presence": 1,
"patient_mood_interest_symptom_frequency": 3,
"depressed_feeling_label": "Feeling down, depressed, or hopeless",
"patient_mood_depression_score": 1,
"patient_mood_depression_frequency": 2,
"patient_mood_tired

In [62]:
query2 = f"""
Please give source text corresponding to the each field in {result} in same format as
{original_text_json_schema}
"""
result2 = qa({"query": query2})
print(result2.get("result"))

{'patient_name_source_text': 'John Wick',
'date_of_birth_source_text': 'aug 10, 1955',
'race_source_text': 'Asian',
'patient_allergies_source_text': '',
'section_2225': [{'input_medication_name': 'Hydrocodone 10 mg',
'input_consumption_type': 'Oral',
'input_dosage': '10 mg',
'input_frequency': 'Every 4 hours',
'boolean_checkbox_1643': null},
{'input_medication_name': 'Meperidine HCL 75 mg',
'input_consumption_type': 'Intramuscular',
'input_dosage': '75 mg',
'input_frequency': 'Every 3 hours',
'boolean_checkbox_1643': null}],
'section_patient_mood': {'patient_mood_sleep_label': 'Trouble falling or staying asleep, or sleeping too much',
'patient_mood_sleep_score_source_text': '0',
'patient_mood_sleep_frequency_source_text': '0',
'patient_mood_interest_label': 'Little interest or pleasure in doing things',
'patient_mood_interest_symptom_presence_source_text': '1',
'patient_mood_interest_symptom_frequency_source_text': '3',
'depressed_feeling_label': 'Feeling down, depressed, or hopeless',