<a href="https://colab.research.google.com/github/pfunk5150/rag-from-scratch/blob/main/Insurance_Plan_Doc_RAG_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install langchain --upgrade --quiet
!pip install langchain_anthropic
!pip install langchain-community
!pip install chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.6/990.6 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m379.9/379.9 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.1/140.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.1/141.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_anthropic
  Downloading langchain_anthropic-0.1.22-py3-none-any.whl.metadata (2.2 kB)
Collecting anthropic<1,>=0.28.0 (from langchain_anthropic)
  Downloading anthropic-0.32.0-py3-none-any.whl.metadata (18 kB)
Collecting httpx<1,>=0.23.0 (from anthropic<1,>=0.28.0->langchain_anthropic)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jiter<1,>

In [2]:
!pip install --upgrade pydantic grpcio

Collecting pydantic
  Using cached pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
Collecting grpcio
  Downloading grpcio-1.65.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Using cached pydantic-2.8.2-py3-none-any.whl (423 kB)
Downloading grpcio-1.65.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: grpcio, pydantic
  Attempting uninstall: grpcio
    Found existing installation: grpcio 1.57.0
    Uninstalling grpcio-1.57.0:
      Successfully uninstalled grpcio-1.57.0
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.10.17
    Uninstalling pydantic-1.10.17:
      Successfully uninstalled pydantic-1.10.17
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the foll

In [4]:
!pip uninstall jina -y

Found existing installation: jina 3.27.2
Uninstalling jina-3.27.2:
  Successfully uninstalled jina-3.27.2


In [4]:
!pip install opentelemetry-instrumentation-fastapi>=0.41b0

In [6]:
# Insurance Document RAG System

## 1. Import Statements: Summoning the Guardians

import os
from langchain.embeddings import JinaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_anthropic import ChatAnthropic
from google.colab import drive


## 2. API Keys: Unveiling the Secrets

from google.colab import userdata
# Retrieve API keys from user data (assuming you've stored them previously)
jina_api_key = userdata.get('JINA_AUTH_TOKEN') # Retrieve the key here
anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')

# Use Colab's secrets management for enhanced security:
# If the keys are not found in userdata, try getting them from environment variables
if jina_api_key is None:
    jina_api_key = os.environ.get('JINA_AUTH_TOKEN')
if anthropic_api_key is None:
    anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY')

# Check if the keys were retrieved successfully
if not jina_api_key:
    raise ValueError("JINA_AUTH_TOKEN not found in user data. Please store it using userdata.set().")

if not anthropic_api_key:
    raise ValueError("ANTHROPIC_API_KEY not found in user data. Please store it using userdata.set().")


## 3. Document Loading: Bridging Realms

# Mount your Google Drive
drive.mount('/content/drive')

# Define the path to your insurance document (replace with your actual path)
document_path = '/content/drive/MyDrive/RAG Documents/unitedHealthcareChoicePlusPlanDoc.md'

# Open and read the document
with open(document_path, 'r') as f:
    insurance_text = f.read()


## 4. Text Splitting: Fragmenting for Comprehension

# Create the text splitter (adjust chunk_size and overlap as needed)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=8000,
    chunk_overlap=200
)

# Split the document into chunks
text_chunks = text_splitter.split_text(insurance_text)


## 5. Embeddings Generation: Weaving Meaning into Data

# Instantiate the Jina embedding model
embeddings_model = JinaEmbeddings(jina_api_key=jina_api_key)

# Generate embeddings for the text chunks
document_embeddings = embeddings_model.embed_documents(text_chunks)


## 6. ChromaDB Creation: Crafting the Celestial Map

# Create the ChromaDB vector database
embedding_db = Chroma.from_texts(
    texts=text_chunks,
    embedding=embeddings_model
)

# Verification:
print(f"Documents in database: {embedding_db._collection.count()}")


## 7. QA Chain Setup: Summoning the Question-Answering Guide

# Instantiate the Anthropic language model
llm = ChatAnthropic(
    anthropic_api_key=anthropic_api_key,
    model_name="claude-3-5-sonnet-20240620"  # Replace with the desired Anthropic model
)

# Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=embedding_db.as_retriever()
)

# Define the user-facing function for asking questions
def ask_the_document(query):
    """
    Queries the document using the RAG pipeline.

    Args:
        query (str): Your question about the insurance document.

    Returns:
        str: The answer extracted from the document.
    """
    # Use qa_chain.invoke instead of qa_chain(...) to avoid deprecation warning
    result = qa_chain.invoke({"query": query})
    return result["result"]


## 8. Example Query: Engaging with Knowledge

query = "What are the limitations on out-of-network mental health treatment coverage?"
answer = ask_the_document(query)
print(f"Query: {query}\nAnswer: {answer}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Documents in database: 130
Query: What are the limitations on out-of-network mental health treatment coverage?
Answer: Based on the policy information provided, there are a few key points regarding out-of-network mental health treatment coverage:

1. The policy does offer Out-of-Network Benefits, but the Schedule of Benefits will specify how these apply and what the limitations are. 

2. For out-of-network providers, the "Recognized Amount" is used to determine cost-sharing (copayments, coinsurance, deductibles). This is based on:
   - An All Payer Model Agreement if adopted
   - State law
   - The lesser of the qualifying payment amount or the amount billed by the provider

3. Mental Health Care Services are covered for the diagnosis and treatment of mental health conditions listed in current diagnostic manuals. However, just because a condition is listed do

In [20]:
persist_directory = '/content/drive/My Drive/my_chroma_db'  # Adjust the path as needed
embedding_db = Chroma.from_texts(
    texts=text_chunks,
    embedding=embeddings_model,
    persist_directory=persist_directory
)

print(f"Chroma database created at: {persist_directory}")

RuntimeError: Your API key has run out of its token-quota. Please top up your key or provide another one with sufficient balance.

In [8]:
# Get all document IDs from the Chroma collection
all_ids = embedding_db._collection.get()['ids']

# Check for duplicates
if len(all_ids) != len(set(all_ids)):
    print("Potential duplicate entries found in the Chroma database.")

    # Option 1: Remove duplicates (if you're sure you want to discard them)
    unique_ids = list(set(all_ids))
    embedding_db._collection.delete(ids=[id for id in all_ids if id not in unique_ids])
    print("Duplicate entries removed.")

    # Option 2: Investigate further (to understand the nature of duplicates)
    from collections import Counter
    id_counts = Counter(all_ids)
    duplicates = [id for id, count in id_counts.items() if count > 1]
    print("Duplicate IDs:", duplicates)
    # You can then retrieve the corresponding documents using embedding_db._collection.get(ids=duplicates)
    # to examine them and decide how to handle them.

else:
    print("No duplicate entries found in the Chroma database.")

No duplicate entries found in the Chroma database.


In [12]:
all_ids = embedding_db._collection.get()['ids']
for i in range(5):  # Print the first 5 documents
    doc = embedding_db._collection.get(ids=[all_ids[i]])['documents'][0]
    print(f"ID: {all_ids[i]}, Document: {doc[:100]}...")  # Print a snippet of the document

ID: 01babc45-face-48b5-985b-00ad8acc5bec, Document: Disability or Disabled - a Subscriber's inability to perform all of the substantial and material dut...
ID: 01e2ff26-0909-492d-a0f2-cdf6e38472d7, Document: We may collect, use, and disclose your health information for the following purposes under limited c...
ID: 032b6268-154e-4d07-94ac-93400c6a0360, Document: • Ovulation induction (or controlled ovarian stimulation).

• Insemination procedures (artificial in...
ID: 033fd99e-128d-4ea3-9271-6414bae34622, Document: Q. Transplants.........................................................................................
ID: 03e83b7b-9d82-4564-ac98-2cee565e0c1b, Document: Title:

URL Source: https://cvws.icloud-content.com/B/AX6_5_Efi4Nr90m_c9u_Z-ij1vtyAYkGZAGRxIFlzy5CwZ...


In [13]:
all_ids = embedding_db._collection.get()['ids']
for i in range(5):  # Print the first 5 documents
    try:
        doc = embedding_db._collection.get(ids=[all_ids[i]])['documents'][0]
        print(f"ID: {all_ids[i]}, Document: {doc[:100]}...")  # Print a snippet of the document
    except Exception as e:
        print(f"Error processing document with ID {all_ids[i]}: {e}")

ID: 01babc45-face-48b5-985b-00ad8acc5bec, Document: Disability or Disabled - a Subscriber's inability to perform all of the substantial and material dut...
ID: 01e2ff26-0909-492d-a0f2-cdf6e38472d7, Document: We may collect, use, and disclose your health information for the following purposes under limited c...
ID: 032b6268-154e-4d07-94ac-93400c6a0360, Document: • Ovulation induction (or controlled ovarian stimulation).

• Insemination procedures (artificial in...
ID: 033fd99e-128d-4ea3-9271-6414bae34622, Document: Q. Transplants.........................................................................................
ID: 03e83b7b-9d82-4564-ac98-2cee565e0c1b, Document: Title:

URL Source: https://cvws.icloud-content.com/B/AX6_5_Efi4Nr90m_c9u_Z-ij1vtyAYkGZAGRxIFlzy5CwZ...


In [14]:
all_ids = embedding_db._collection.get()['ids']
for i in range(5):  # Try with 5 documents again
    try:
        doc = embedding_db._collection.get(ids=[all_ids[i]])['documents'][0]
        print(f"ID: {all_ids[i]}, Document Length: {len(doc)}")
    except Exception as e:
        print(f"Error processing document with ID {all_ids[i]}: {e}")

ID: 01babc45-face-48b5-985b-00ad8acc5bec, Document Length: 7914
ID: 01e2ff26-0909-492d-a0f2-cdf6e38472d7, Document Length: 7895
ID: 032b6268-154e-4d07-94ac-93400c6a0360, Document Length: 7936
ID: 033fd99e-128d-4ea3-9271-6414bae34622, Document Length: 7931
ID: 03e83b7b-9d82-4564-ac98-2cee565e0c1b, Document Length: 7980


In [15]:
import hashlib

def hash_document(text):
    """Generates a hash of the document text."""
    return hashlib.md5(text.encode()).hexdigest()

all_ids = embedding_db._collection.get()['ids']
document_hashes = {}

for id in all_ids:
    try:
        doc = embedding_db._collection.get(ids=[id])['documents'][0]
        doc_hash = hash_document(doc)
        if doc_hash in document_hashes:
            print(f"Duplicate found! ID: {id}, matches ID: {document_hashes[doc_hash]}")
        else:
            document_hashes[doc_hash] = id
    except Exception as e:
        print(f"Error processing document with ID {id}: {e}")

if not any(count > 1 for count in document_hashes.values()):
    print("No duplicate documents found.")

Duplicate found! ID: 30fafc6a-e5e4-4bce-93bb-def124c64a9b, matches ID: 2da3c390-c21c-418d-994b-ff87e599186a
Duplicate found! ID: 31c8aff6-0d09-4f87-bd0f-2c840b417db5, matches ID: 032b6268-154e-4d07-94ac-93400c6a0360
Duplicate found! ID: 3cce407e-66bf-43f2-95ee-23cd93f17f8c, matches ID: 197e4c01-4c2a-426a-8c78-e27870e81477
Duplicate found! ID: 3f847c1d-1f1b-4998-8b33-585c3f98542c, matches ID: 16fbc722-d8a7-4db7-bf14-26b6848e484f
Duplicate found! ID: 467b5273-39e4-4c71-ab59-a0ffebfdff56, matches ID: 370ddbde-538f-4e7c-9194-de6f885ec998
Duplicate found! ID: 5025f513-e38e-42c3-a94d-3f75e0562929, matches ID: 09256273-3d3a-4636-8a8d-c29222dd7a91
Duplicate found! ID: 5f0ffdeb-7f6d-453b-87ab-477650a223c7, matches ID: 53b72cd8-ba66-46cd-ba9b-4eadb87f2836
Duplicate found! ID: 61dc29a9-55d9-4904-a002-d0b8c0a3fd63, matches ID: 09f2aa0a-e1b2-47a7-bbc0-d55a9641d920
Duplicate found! ID: 631dd8a5-1684-4696-9d96-67878abf5388, matches ID: 53dd349f-2c54-43af-bf79-4395518d7dec
Duplicate found! ID: 67cd782

TypeError: '>' not supported between instances of 'str' and 'int'

In [18]:
import hashlib

def hash_document(text):
    """Generates a hash of the document text."""
    return hashlib.md5(text.encode()).hexdigest()

all_ids = embedding_db._collection.get()['ids']
document_hashes = {}

for id in all_ids:
    try:
        doc = embedding_db._collection.get(ids=[id])['documents'][0]
        doc_hash = hash_document(doc)
        if doc_hash in document_hashes:
            print(f"Duplicate found! ID: {id}, matches ID: {document_hashes[doc_hash]}")
            print("-" * 30)
            print(f"Document with ID {id}:")
            print(doc)  # Print the full content of the duplicate document
            print("-" * 30)
            print(f"Document with ID {document_hashes[doc_hash]}:")
            matching_doc = embedding_db._collection.get(ids=[document_hashes[doc_hash]])['documents'][0]
            print(matching_doc)  # Print the full content of the matching document
            print("=" * 50)
        else:
            document_hashes[doc_hash] = id
    except Exception as e:
        print(f"Error processing document with ID {id}: {e}")

Duplicate found! ID: 30fafc6a-e5e4-4bce-93bb-def124c64a9b, matches ID: 2da3c390-c21c-418d-994b-ff87e599186a
------------------------------
Document with ID 30fafc6a-e5e4-4bce-93bb-def124c64a9b:
Benefits are limited to a single purchase of each type of prosthetic device every three years. Repair and/or replacement of a prosthetic device would apply to this limit in the same manner as a purchase.

Once this limit is reached, Benefits continue to be available for items required by the Women's Health and Cancer Rights Act of 1998 and for prosthetic arms, legs, feet and hands.

Network

20% Yes Yes

Out-of-Network

50% except that the Benefit for prosthetic arms, legs, feet and hands is 20%.

Yes Yes

29. Reconstructive Procedures

Prior Authorization Requirement

For Out-of-Network Benefits, you must obtain prior authorization five business days before a scheduled reconstructive procedure is performed or, for non-scheduled procedures, within one business day or as soon as is reasonably pos

In [19]:
import os
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")
chroma_directory = os.path.join(current_directory, ".chroma")
if os.path.exists(chroma_directory):
    print(f"Chroma database found at: {chroma_directory}")
else:
    print("Chroma database not found in the default location.")

Current working directory: /content
Chroma database not found in the default location.


In [21]:
from google.colab import userdata
# Retrieve API keys from user data (assuming you've stored them previously)
jina_api_key = userdata.get('JINA_AUTH_TOKEN') # Retrieve the key here
anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')

# Use Colab's secrets management for enhanced security:
# If the keys are not found in userdata, try getting them from environment variables
if jina_api_key is None:
    jina_api_key = os.environ.get('JINA_AUTH_TOKEN')
if anthropic_api_key is None:
    anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY')

# Check if the keys were retrieved successfully
if not jina_api_key:
    raise ValueError("JINA_AUTH_TOKEN not found in user data. Please store it using userdata.set().")

if not anthropic_api_key:
    raise ValueError("ANTHROPIC_API_KEY not found in user data. Please store it using userdata.set().")

In [28]:
persist_directory = '/content/drive/My Drive/my_chroma_db'  # Adjust the path as needed
embedding_db = Chroma.from_texts(
    texts=text_chunks,
    embedding=embeddings_model,
    persist_directory=persist_directory
)

print(f"Chroma database created at: {persist_directory}")

RuntimeError: Your API key has run out of its token-quota. Please top up your key or provide another one with sufficient balance.

In [29]:
from google.colab import userdata
userdata.get('JINA_AUTH_TOKEN')

'jina_72f9f8ee3f13448b8364bb33d4eff2c3T94Eo9ZJg__yo5hOm61FiuABOmNt'

In [30]:
import os
print(os.environ.get('JINA_AUTH_TOKEN'))

jina_72f9f8ee3f13448b8364bb33d4eff2c3T94Eo9ZJg__yo5hOm61FiuABOmNt


In [37]:
from langchain.embeddings import JinaEmbeddings

# Replace 'YOUR_NEW_API_KEY' with your actual key
embeddings_model = JinaEmbeddings(jina_auth_token=jina_api_key)

persist_directory = '/content/drive/My Drive/my_chroma_db'
embedding_db = Chroma.from_texts(
    texts=text_chunks,
    embedding=embeddings_model, # Pass the updated embeddings model
    persist_directory=persist_directory
)

print(f"Chroma database created at: {persist_directory}")

Chroma database created at: /content/drive/My Drive/my_chroma_db


In [26]:
import os
os.environ['JINA_AUTH_TOKEN'] = 'jina_72f9f8ee3f13448b8364bb33d4eff2c3T94Eo9ZJg__yo5hOm61FiuABOmNt'  # Replace with your key
print(os.environ.get('JINA_AUTH_TOKEN'))  # Verify it's set

jina_72f9f8ee3f13448b8364bb33d4eff2c3T94Eo9ZJg__yo5hOm61FiuABOmNt


In [27]:
!export JINA_AUTH_TOKEN='YOUR_ACTUAL_JINA_API_KEY'