<a href="https://colab.research.google.com/github/nguyen1oc/RAG_Tutorial/blob/main/RAG_TUTORIAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INSTALL LIBRARIES

In [1]:
!pip install datasets transformers



# IMPORT LIBRARIES

In [2]:
import datasets

In [3]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

# DATASET

Source: https://www.kaggle.com/datasets/chaitanyakck/medical-text — the cells below expect the dataset's `train.txt` file in the working directory.

In [4]:
  !pip install langchain



In [5]:
# Read the whole training file into memory as a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("train.txt", "r", encoding="utf-8") as f:
  data = f.read()

In [6]:
data[:100]

'4\tCatheterization laboratory events and hospital outcome with direct angioplasty for acute myocardia'

In [7]:
from langchain.docstore.document import Document as LangChainDocument

In [8]:
# Wrap the entire file contents in one LangChain Document so it can be handed
# to a document splitter as a single-element list later on.
raw_database = LangChainDocument(page_content=data)

In [9]:
# Candidate split points handed to the text splitter, ordered from the
# coarsest boundary down to the finest. Several entries are written in regex
# syntax (quantifiers, escaped asterisks).
MARKDOWN_SEPARATORS = [
    "\n#{1,6}",        # heading marker: one to six '#' at the start of a line
    "```\n",           # code-fence line
    "\n\\*\\*\\*+\n",  # rule drawn with three or more asterisks
    "\n---+\n",        # rule drawn with three or more dashes
    "\n___+\n",        # rule drawn with three or more underscores
    "\n\n",            # blank line
    "\n",              # line break
    " ",               # single space
    "",                # empty string (last resort)
]

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [11]:
data[:1000]

'4\tCatheterization laboratory events and hospital outcome with direct angioplasty for acute myocardial infarction To assess the safety of direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory and hospital events were assessed in consecutively treated patients with infarctions involving the left anterior descending (n = 100 patients), right (n = 100), and circumflex (n = 50) coronary arteries. The groups of patients were similar for age (left anterior descending coronary artery, 59 years; right coronary artery, 58 years; circumflex coronary artery, 62 years), patients with multivessel disease (left anterior descending coronary artery, 55%; right coronary artery, 55%; circumflex coronary artery, 64%), and patients with initial grade 0/1 antegrade flow (left anterior descending coronary artery, 79%; right coronary artery, 84%; circumflex coronary artery, 90%). Cardiogenic shock was present in eight patients with infarction of the left anterior des

In [12]:
data[1000:2000]

'nding coronary artery, four with infarction of the right coronary artery, and four with infarction of the circumflex coronary artery. Major catheterization laboratory events (cardioversion, cardiopulmonary resuscitation, dopamine or intra-aortic balloon pump support for hypotension, and urgent surgery) occurred in 10 patients with infarction of the left anterior descending coronary artery, eight with infarction of the right coronary artery, and four with infarction of the circumflex coronary artery (16 of 16 shock and six of 234 nonshock patients, p less than 0.001). There was one in-laboratory death (shock patient with infarction of the left anterior descending coronary artery). \n5\tRenal abscess in children. Three cases of renal abscesses in children are described to illustrate the variable presenting features. An additional 23 pediatric cases, reported over the past ten years, were reviewed for clinical features and therapy. Fever, loin pain, and leukocytosis were common presentin

In [13]:
# Recursive splitter: chunk_size=1000 with chunk_overlap=100 (measured with the
# splitter's default length function, i.e. characters).
#
# MARKDOWN_SEPARATORS contains regex syntax ("#{1,6}", "---+", escaped "*").
# With the default is_separator_regex=False those entries are escaped and
# matched literally, so they would never fire; enabling the flag makes them
# behave as the patterns they were written to be.
splitter = RecursiveCharacterTextSplitter(
    separators=MARKDOWN_SEPARATORS,
    chunk_size=1000,
    chunk_overlap=100,
    is_separator_regex=True,
)

In [14]:
# Run the splitter over the single raw document; the result is indexed and
# sliced like a list of chunk Documents in the cells below.
processed_data= splitter.split_documents([raw_database])

In [15]:
processed_data[0]

Document(metadata={}, page_content='4\tCatheterization laboratory events and hospital outcome with direct angioplasty for acute myocardial infarction To assess the safety of direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory and hospital events were assessed in consecutively treated patients with infarctions involving the left anterior descending (n = 100 patients), right (n = 100), and circumflex (n = 50) coronary arteries. The groups of patients were similar for age (left anterior descending coronary artery, 59 years; right coronary artery, 58 years; circumflex coronary artery, 62 years), patients with multivessel disease (left anterior descending coronary artery, 55%; right coronary artery, 55%; circumflex coronary artery, 64%), and patients with initial grade 0/1 antegrade flow (left anterior descending coronary artery, 79%; right coronary artery, 84%; circumflex coronary artery, 90%). Cardiogenic shock was present in eight patients with 

In [16]:
processed_data[1:5]

[Document(metadata={}, page_content='artery, 90%). Cardiogenic shock was present in eight patients with infarction of the left anterior descending coronary artery, four with infarction of the right coronary artery, and four with infarction of the circumflex coronary artery. Major catheterization laboratory events (cardioversion, cardiopulmonary resuscitation, dopamine or intra-aortic balloon pump support for hypotension, and urgent surgery) occurred in 10 patients with infarction of the left anterior descending coronary artery, eight with infarction of the right coronary artery, and four with infarction of the circumflex coronary artery (16 of 16 shock and six of 234 nonshock patients, p less than 0.001). There was one in-laboratory death (shock patient with infarction of the left anterior descending coronary artery).'),
 Document(metadata={}, page_content='5\tRenal abscess in children. Three cases of renal abscesses in children are described to illustrate the variable presenting featu

In [17]:
!pip install langchain_community #allow to import huggingface embedding
!pip install sentence-transformers #encoding and processing sentences and text (specific type of models)



# TOKENIZING/VECTORIZING


In [18]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Hugging Face model id passed to HuggingFaceEmbeddings when the embedding
# model is built.
model_name = "thenlper/gte-small"

In [19]:
import torch

# Use the GPU when one is present, otherwise fall back to the CPU so this cell
# also runs on a CPU-only runtime instead of failing on a hard-coded "cuda".
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": embedding_device},
    # NOTE(review): multi_process=True appears to start a fresh worker pool on
    # every embed call in langchain_community's implementation — confirm
    # against the installed version. This notebook embeds a handful of short
    # texts one at a time, so single-process encoding is used.
    multi_process=False,
    # Ask the encoder to normalize the returned embeddings.
    encode_kwargs={"normalize_embeddings": True},
)


  embedding_model = HuggingFaceEmbeddings(


In [20]:
len(embedding_model.embed_query("Hey i am testing this and check how it work :)"))

384

In [21]:
!pip install pinecone



# STORING DATASET IN VECTOR DB

Using: Pinecone

In [22]:
from pinecone import Pinecone

import os

# Read the Pinecone key from the environment so no credential is written into
# the notebook source, and fail early with a clear message when it is missing.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
  raise RuntimeError(
      "Set the PINECONE_API_KEY environment variable before running this cell."
  )

pc = Pinecone(api_key=PINECONE_API_KEY)

# Handle to the index named "lab-testing-rag"; the cells below upsert into it
# and query it.
index = pc.Index("lab-testing-rag")

In [23]:
# Build the upsert payload: one record per chunk, carrying an id, the
# embedding vector, and the chunk text as metadata (rag_query reads
# metadata["text"] back at query time).
data_to_add = []

# tqdm wraps the sliced list (which has a length) rather than the enumerate()
# iterator, so the progress bar can show a total and a percentage.
# Only the first five chunks are indexed here.
for i, entry in enumerate(tqdm(processed_data[:5])):
  text = entry.page_content
  vector = embedding_model.embed_query(text)
  data_to_add.append({
      "id": f"vec_{i}",
      "values": vector,
      "metadata": {"text": text},
  })

0it [00:00, ?it/s]

In [24]:
# Upload the prepared records to the index under namespace "ns1", the same
# namespace rag_query searches by default.
index.upsert(data_to_add, namespace="ns1")

{'upserted_count': 5}

# LOADING AN LLM

In [30]:
!pip install -q -U google-generativeai

import google.generativeai as genai
from google.colab import userdata

In [34]:
# Fetch the Gemini API key from Colab's userdata store, where it is saved
# under the name 'API_KEY'. Outside Colab, `os.getenv('GOOGLE_API_KEY')` can be
# used to read it from an environment variable instead.
GOOGLE_API_KEY = userdata.get('API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Model handle used for answer generation.
model = genai.GenerativeModel(model_name='gemini-1.5-flash-latest')

In [39]:
def rag_query(query, index, embedding_model, genai_model, namespace="ns1", top_k=3):
  """Answer ``query`` using retrieval-augmented generation.

  The query is embedded, the ``top_k`` closest entries are fetched from
  ``index`` together with their metadata, the stored ``metadata['text']``
  passages are joined with newlines into a context, and the generative model
  is prompted with that context followed by the question.

  Args:
    query: Question text.
    index: Vector index exposing
      ``query(vector=, top_k=, include_metadata=, namespace=)`` whose result
      has a ``'matches'`` sequence; each match must carry
      ``['metadata']['text']``.
    embedding_model: Object exposing ``embed_query(text)``.
    genai_model: Object exposing ``generate_content(prompt)`` whose result has
      a ``text`` attribute.
    namespace: Index namespace to search.
    top_k: Number of passages to retrieve. Defaults to 3, the value that was
      previously hard-coded.

  Returns:
    The ``text`` attribute of the model's response.

  Raises:
    ValueError: If ``top_k`` is smaller than 1.
  """
  if top_k < 1:
    raise ValueError("top_k must be a positive integer")

  query_vector = embedding_model.embed_query(query)
  search_results = index.query(
      vector=query_vector,
      top_k=top_k,
      include_metadata=True,
      namespace=namespace)

  # The chunk text was stored as metadata at upsert time; retrieve it here.
  context = "\n".join(match['metadata']['text'] for match in search_results['matches'])

  prompt = f"Given the following context:\n{context}\n\nAnswer the following question:\n{query}"

  response = genai_model.generate_content(prompt)

  return response.text

In [38]:
# Example usage: send one question through the retrieve-then-generate pipeline
# using the index, embedding model and Gemini model created above.
# The bare `response` on the last line makes the notebook display the answer.
user_query = "What is Cardiogenic shock?"
response = rag_query(user_query, index, embedding_model, model)
response

'Based on the provided text, cardiogenic shock is a condition present in some patients with acute myocardial infarction (heart attack).  The text shows it occurred in a subset of patients with infarctions in different coronary arteries (left anterior descending, right coronary artery, and circumflex coronary artery), and was associated with a significantly higher rate of major catheterization laboratory events (like cardioversion, CPR, and the use of devices to support blood pressure).  The text does not give a formal definition but clearly implies it is a serious complication of heart attack involving circulatory failure.\n'