This notebook fetches the content of a web page, indexes it in a vector store, and checks whether the retrieved context lets our LLM answer questions accurately, falling back to an online search when the indexed context is not sufficient.

In [2]:
# Importing the Optional type from the typing module for type annotations.  
from typing import Optional  

# Importing the requests library to handle HTTP requests.  
import requests  

# Importing the re module for regular expression operations.  
import re  

# Importing RecursiveCharacterTextSplitter from langchain_text_splitters and aliasing it as TextSplitter.  
# This is used for splitting text into manageable chunks recursively based on character limits.  
from langchain_text_splitters import RecursiveCharacterTextSplitter as TextSplitter  

# Importing the completion function from litellm.  
# This is used for handling language model requests.
from litellm import completion  

# Importing load_dotenv from the dotenv package to load environment variables from a .env file.  
from dotenv import load_dotenv  

# Importing QdrantClient from qdrant_client to interact with the Qdrant vector database.  
from qdrant_client import QdrantClient  

# Importing VectorParams and Distance models from qdrant_client.models.  
# These are used to define vector storage parameters and distance metrics.  
from qdrant_client.models import VectorParams, Distance  

# Importing SentenceTransformer from sentence_transformers to generate sentence embeddings.  
from sentence_transformers import SentenceTransformer  

# Importing Markdown and display from IPython.display to render Markdown in Jupyter notebooks.  
from IPython.display import Markdown, display  

# Importing DDGS from duckduckgo_search to perform web searches using DuckDuckGo's search engine.  
from duckduckgo_search import DDGS  

# Loading environment variables from the .env file into the environment.  
# This allows the use of API keys and other sensitive information without hardcoding them.  
load_dotenv()  

True

In [2]:
def fetch_url(url_body: str, timeout: float = 30.0) -> Optional[str]:
    """
    Fetches the content of a page through the r.jina.ai prefix URL.

    Args:
        url_body (str): The suffix of the URL to fetch, appended to the base URL.
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 30.0.

    Returns:
        Optional[str]: The UTF-8 decoded body of the response if the request
        succeeds with status 200; otherwise, None (an error line is printed).
    """

    url_prefix: str = "https://r.jina.ai/"
    url_full: str = url_prefix + url_body

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url_full, timeout=timeout)
        if response.status_code == 200:
            return response.content.decode("utf-8")

        print(f"Error {response.status_code}: {response.text}")
        return None
    except (requests.RequestException, UnicodeDecodeError) as exc:
        # Only network/HTTP and decoding failures are treated as "fetch failed";
        # anything else (e.g. KeyboardInterrupt) is left to propagate.
        print(f"Error: Fetching {url_body} failed: {exc}")
        return None
    
def clean_text(text):
    """
    Normalizes whitespace in a piece of text.

    Line breaks and carriage returns become spaces, every run of whitespace
    is collapsed to a single space, and the result is trimmed at both ends.

    Args:
        text (str): The raw text to be cleaned.

    Returns:
        str: The cleaned and stripped text.
    """

    # Turn line breaks and carriage returns into plain spaces first.
    without_breaks = text.replace("\n", " ").replace("\r", " ")
    # Collapse any run of whitespace into one space.
    collapsed = re.sub(r"\s+", " ", without_breaks)
    # Drop leading and trailing whitespace.
    return collapsed.strip()
    
# Loaded SentenceTransformer models, keyed by model name, so repeated calls
# (e.g. one per search query) do not reload the model every time.
_EMBEDDING_MODELS = {}


def get_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    """
    Generates embeddings for the given texts using a SentenceTransformer model.

    The model is loaded on first use and then reused for later calls with the
    same model name.

    Args:
        texts (list | str): The strings to embed. The notebook passes a list of
            chunks when indexing and a single query string when searching.
        model_name (str, optional): The name of the pre-trained SentenceTransformer model to use. Defaults to "all-MiniLM-L6-v2".

    Returns:
        The value returned by the model's `encode` call for `texts`.
    """

    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model

    return model.encode(texts)

def search(text: str, top_k: int, client, collection_name):
    """
    Looks up the vectors in a Qdrant collection that are closest to a query text.

    Args:
        text (str): The query text to search for.
        top_k (int): The number of top results to retrieve.
        client (QdrantClient): An instance of QdrantClient to interact with the Qdrant service.
        collection_name (str): The name of the Qdrant collection to search within.

    Returns:
        The result of `client.search` for the embedded query, limited to top_k hits.
    """

    # Embed the query with the same helper used for the indexed chunks.
    vector = get_embeddings(text)

    search_kwargs = {
        "collection_name": collection_name,
        "query_vector": vector,
        "query_filter": None,
        "limit": top_k,
    }
    return client.search(**search_kwargs)

def format_docs(docs):
    """
    Joins the 'content' payload of each document into one string.

    Args:
        docs (list): Document objects exposing a `payload` mapping with a 'content' key.

    Returns:
        str: The contents of all documents, separated by blank lines.
    """

    contents = [doc.payload["content"] for doc in docs]
    separator = "\n\n"
    return separator.join(contents)
    
def format_search_results(results):
    """
    Joins the 'body' field of each search result into one string.

    Args:
        results (list): Search results, each a mapping with a 'body' key.

    Returns:
        str: The bodies of all results, separated by blank lines.
    """

    bodies = []
    for result in results:
        bodies.append(result["body"])

    return "\n\n".join(bodies)

def answer(question, client, collection_name):
    """  
    Answers a question using context retrieved from Qdrant, falling back to an online search.

    The flow is:
      1. Retrieve the top 3 matching chunks from the collection.
      2. Ask the model whether that context can answer the question (expects "1" or "0").
      3. If not, replace the context with DuckDuckGo search result bodies.
      4. Ask the model for the final answer and render it as Markdown.

    Progress messages are printed along the way; nothing is returned.

    Args:  
        question (str): The user's question to be answered.  
        client (QdrantClient): An instance of QdrantClient to interact with the Qdrant service.  
        collection_name (str): The name of the Qdrant collection to search within.  
    """  

    results = search(question, top_k=3, client=client, collection_name=collection_name)
    context = format_docs(results)

    system_prompt_1 = """
    Your task is to determine if a specific question can be answered using the provided context. 
    If so, return 1; otherwise, return 0. 
    Do not return anything other than 1 or 0. 

    Context:  {context}
    """

    system_prompt_2 = """
    You are an expert for answering questions. Answer the question according only to the given context.
    If question cannot be answered using the context, simply say I don't know. Do not make stuff up.
    Your answer MUST be informative, concise, and action driven. Your response must be in Markdown.

    Context: {context}
    """

    user_prompt = """
    Question: {question}

    Answer: 
    """

    def _ask(system_prompt, prompt_context):
        # Single place for the model call so both stages use identical settings.
        return completion(
            model="gpt-4o-mini",
            messages=[
                {"content": system_prompt.format(context=prompt_context), "role": "system"},
                {"content": user_prompt.format(question=question), "role": "user"},
            ],
            max_tokens=500,
        )

    # The model is asked for a bare "1"/"0", but replies can carry surrounding
    # whitespace or a trailing newline (or be empty); normalize before comparing
    # so a valid "1" is not mistaken for "context is not relevant".
    verdict = (_ask(system_prompt_1, context).choices[0].message.content or "").strip()
    has_answer = verdict == "1"

    print(f"Question: {question}")
    if has_answer:
        print("Context can answer the question")
    else:
        print("Context is NOT relevant. Searching online...")
        results = DDGS().text(question, max_results=5)
        context = format_search_results(results)
        print("Found online sources. Generating the response...")

    # Both branches generate the answer the same way; only `context` differs.
    response = _ask(system_prompt_2, context)
    print("Answer:")
    display(Markdown(response.choices[0].message.content))

In [3]:
url_body: str = "https://em360tech.com/tech-article/meta-gdpr-fine"
content: Optional[str] = fetch_url(url_body)

# fetch_url returns None on failure; stop here with a clear message instead of
# letting the text splitter fail on a None input further down.
if content is None:
    raise RuntimeError(f"Could not fetch {url_body}; nothing to index.")

texts = [content]
metadatas = [{"url": url_body}]
# Ensure that the lengths of texts and metadatas lists are equal.
# This is crucial to maintain one-to-one correspondence between texts and their metadata.
assert len(metadatas) == len(texts)

# Initialize a text splitter using the RecursiveCharacterTextSplitter from LangChain.
# - model_name: Specifies the model to be used for tokenization (here, "gpt-4")
# - chunk_size: The maximum size of each text chunk (150 tokens)
# - chunk_overlap: The number of overlapping tokens between consecutive chunks (0 here, meaning no overlap)
text_splitter = TextSplitter.from_tiktoken_encoder(model_name="gpt-4", chunk_size=150, chunk_overlap=0)
text_chunks = text_splitter.split_text(content)
print(f"Total number of chunks: {len(text_chunks)}")

Total number of chunks: 54


In [4]:
# Embed every text chunk in one call, then check that exactly one embedding
# came back per chunk so vectors and chunks can be paired by position.
embeddings = get_embeddings(text_chunks)
assert len(embeddings) == len(text_chunks)



In [5]:
client = QdrantClient("http://localhost:6333")
collection_name = "agent_rag_index"
# The collection's vector size is fixed at creation time, so it must match the
# dimensionality of the embeddings being uploaded.
VECTOR_SIZE = 384
assert all(len(vector) == VECTOR_SIZE for vector in embeddings), "embedding size does not match VECTOR_SIZE"

# Drop any existing collection of the same name and recreate it, so re-running
# this cell starts from an empty index.
client.delete_collection(collection_name=collection_name)
client.create_collection(collection_name=collection_name, vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE))

# One id and one payload per chunk, in the same order as `embeddings`.
# (Avoids a loop variable named `id`, which would shadow the builtin for the
# rest of the notebook session.)
ids = list(range(len(text_chunks)))
payload = [{"url": url_body, "content": chunk} for chunk in text_chunks]

client.upload_collection(
    collection_name=collection_name,
    vectors=embeddings,
    payload=payload,
    ids=ids,
    batch_size=256
)

# Last expression of the cell: shows how many points the collection now holds.
client.count(collection_name=collection_name)

CountResult(count=54)

In [8]:
# Question passed to answer() below. Note that the indexed page's URL refers to a
# Meta GDPR fine while this question asks about Apple — presumably chosen to
# exercise the online-search fallback; confirm intent.
question = "Did Apple slam for GDPR violation?"

In [9]:
# Run the full flow for the question against the collection built above;
# answer() prints its progress and renders the final response as Markdown.
answer(question, client, collection_name)

Question: Did Apple slam for GDPR violation?
Context is NOT relevant. Searching online...
Found online sources. Generating the response...
Answer:


I don't know.