In [None]:
!pip install langchain langchain_fireworks langchain_community beautifulsoup4 google-search-results chromadb langchainhub sentence-transformers langchain-chroma gradio aiolimiter lxml faiss-cpu flashrank rank_bm25

In [3]:
import asyncio
import aiohttp
import os
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_fireworks import FireworksEmbeddings, ChatFireworks
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
import re
import io
import time
import sys
import gradio as gr
import asyncio
from typing import List, Tuple, Any
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import numpy as np
from functools import lru_cache
import faiss
import httpx
from urllib.parse import urlparse
from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.retrievers import ContextualCompressionRetriever
from flashrank import Ranker, RerankRequest
import math
from rank_bm25 import BM25Okapi

In [4]:
# Set up API clients
# The 'API_KEY' strings are placeholders: put real keys here before running, and
# keep real keys out of any shared or committed copy of this notebook.
# These assignments overwrite whatever value the environment already holds.
os.environ['FIREWORKS_API_KEY'] = 'API_KEY'
# NOTE(review): nothing in the cells visible here uses GoogleSerperAPIWrapper or
# reads this variable - confirm whether the Serper key is still needed.
os.environ["SERPER_API_KEY"] = 'API_KEY'


# Download NLTK data for sentence tokenization
# NOTE(review): the chunkers visible in this notebook split on '. ' and never call
# sent_tokenize - confirm whether this download is still required.
nltk.download('punkt', quiet=True)

True

In [6]:
# Initialize components
# These module-level objects are shared by the functions defined below in this cell.
# Embedding model handed to FAISS.from_documents in process_query.
embeddings = FireworksEmbeddings(model="nomic-ai/nomic-embed-text-v1.5")
# Chat model that produces the final answer in process_query's prompt | llm chain.
llm = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-8b-instruct", temperature=0)
# FlashRank ranker built with its default arguments; used when rerank_method == "Nano".
ranker_nano = Ranker()

async def scrape_webpage(url: str) -> str:
    """Fetch ``url`` and return the page's text as one space-joined string.

    Redirects are followed explicitly: httpx does not follow them by default,
    and ``raise_for_status()`` rejects an unfollowed 3xx response, so without
    ``follow_redirects=True`` a redirecting URL (for example http -> https)
    would end in the error path below.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text fragments of the parsed HTML joined by single
        spaces, or ``""`` if the request or parsing failed for any reason
        (the error is printed, not raised).
    """
    # 10 s overall budget per phase, but give up on connecting after 3 s.
    timeout = httpx.Timeout(10.0, connect=3.0)
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        try:
            response = await client.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            return ' '.join(soup.stripped_strings)
        except Exception as e:
            # Best-effort scraper: report the problem and let the caller decide
            # what an empty result means.
            print(f"Error scraping {url}: {e}")
            return ""

def create_sentence_windows(text: str, window_size: int = 3) -> List[str]:
    """Build one overlapping window of sentences around every sentence in ``text``.

    Sentences are obtained by splitting on the literal ``'. '``. The window for
    sentence ``i`` is sentences ``i - window_size`` through ``i + window_size``
    (clipped to the text), re-joined with ``'. '``.

    Args:
        text: Text to split.
        window_size: Number of neighbouring sentences kept on each side.

    Returns:
        One window string per sentence, in sentence order. An empty ``text``
        yields a single empty window.
    """
    parts = text.split('. ')
    total = len(parts)
    return [
        '. '.join(parts[max(0, center - window_size):min(total, center + window_size + 1)])
        for center in range(total)
    ]

def batch_rerank(ranker, query, documents, batch_size=512):
    """Rerank ``documents`` against ``query`` in slices of ``batch_size``.

    Each slice is wrapped in a ``RerankRequest`` and passed to
    ``ranker.rerank``; the per-slice results are concatenated in slice order.
    Passage ids are the documents' positions in the full ``documents`` list.
    No ordering is applied across slices here.

    Args:
        ranker: Object exposing ``rerank(request)``.
        query: Query text the passages are scored against.
        documents: Sequence of objects with a ``page_content`` attribute.
        batch_size: Maximum number of passages per rerank call.

    Returns:
        The concatenation of whatever ``ranker.rerank`` returned for each slice.
    """
    reranked = []
    for offset in range(0, len(documents), batch_size):
        passages = [
            {"id": offset + position, "text": doc.page_content}
            for position, doc in enumerate(documents[offset:offset + batch_size])
        ]
        reranked += ranker.rerank(RerankRequest(query=query, passages=passages))
    return reranked

def bm25_rerank(query, documents):
    """Order ``documents`` by BM25 relevance to ``query``.

    Query and documents are tokenised the same way: lower-cased and split into
    runs of word characters. This lets "Eyes?" in a question match "eyes." in
    a document, which plain whitespace splitting does not.

    Args:
        query: Query text.
        documents: Sequence of objects with a ``page_content`` attribute.

    Returns:
        A list of ``{"id", "text", "score"}`` dicts sorted by descending score.
        ``id`` is the document's index in ``documents`` and ``score`` is a
        plain ``float``. An empty ``documents`` gives ``[]``.
    """
    if not documents:
        # Nothing to rank; avoid building a BM25 model over an empty corpus.
        return []

    def tokenize(text):
        return re.findall(r"\w+", text.lower())

    bm25 = BM25Okapi([tokenize(doc.page_content) for doc in documents])
    scores = bm25.get_scores(tokenize(query))
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [
        {"id": i, "text": documents[i].page_content, "score": float(scores[i])}
        for i in ranked_indices
    ]

async def process_query(question: str, url: str, num_retrieved: int, num_used: int, rerank_method: str):
    """Answer ``question`` from the text of ``url``: scrape, index, retrieve, rerank, ask the LLM.

    The page is cut into sentence windows, the windows are embedded into an
    in-memory FAISS index, the ``num_retrieved`` nearest windows are fetched,
    optionally reranked, and the best ``num_used`` are given to the LLM as
    context.

    Args:
        question: The user's question; also the retrieval and rerank query.
        url: Page to scrape.
        num_retrieved: How many windows to fetch from the vector store.
        num_used: How many of the (re)ranked windows go into the prompt.
        rerank_method: ``"Nano"`` reranks with ``ranker_nano``, ``"BM25"``
            with ``bm25_rerank``; any other value keeps the vector-store order.

    Returns:
        ``(answer, top_docs, retrieved_docs)``: the LLM's answer string, the
        documents placed in the prompt, and all retrieved documents.

    Raises:
        ValueError: If no text could be scraped from ``url``.
        Exception: Errors from embedding or similarity search are printed and
            re-raised unchanged.
    """
    # Coerce to int so a float value (e.g. 5.0 handed over by a UI widget)
    # cannot break the slice and the FAISS ``k`` argument below.
    num_retrieved = int(num_retrieved)
    num_used = int(num_used)

    print(f"Scraping URL: {url}")
    scraped_text = await scrape_webpage(url)
    print(f"Scraped text length: {len(scraped_text)} characters")

    # scrape_webpage reports failures as "". Stop here with a clear message
    # instead of embedding an empty window and querying the LLM without context.
    if not scraped_text.strip():
        raise ValueError(f"No text could be scraped from {url}")

    sentence_windows = create_sentence_windows(scraped_text)
    print(f"Number of sentence windows: {len(sentence_windows)}")

    index_documents = [Document(page_content=window) for window in sentence_windows]

    try:
        print("Creating embeddings...")
        vectorstore = FAISS.from_documents(index_documents, embeddings)
        print("Embeddings created successfully.")

        print(f"Performing similarity search for top {num_retrieved} documents...")
        # Never ask for more neighbours than there are indexed windows.
        retrieved_docs = vectorstore.similarity_search(question, k=min(num_retrieved, len(index_documents)))
        print(f"Retrieved {len(retrieved_docs)} documents.")
    except Exception as e:
        print(f"Error during embedding or similarity search: {e}")
        raise

    if rerank_method == "Nano":
        print("Using Nano reranking")
        reranked_results = batch_rerank(ranker_nano, question, retrieved_docs)
    elif rerank_method == "BM25":
        print("Using BM25 reranking")
        reranked_results = bm25_rerank(question, retrieved_docs)
    else:
        print("No reranking")
        # Equal scores plus a stable sort below keep the retrieval order.
        reranked_results = [{"id": i, "text": doc.page_content, "score": 1.0} for i, doc in enumerate(retrieved_docs)]

    reranked_results.sort(key=lambda x: x["score"], reverse=True)
    top_docs = [Document(page_content=result["text"]) for result in reranked_results[:num_used]]

    context = "\n\n".join([doc.page_content for doc in top_docs])

    prompt_template = """
    Use the following context to answer the question. First do a reasoning step and analize the context. If you cannot answer based on the context, say "I don't have enough information to answer that question."

    Context:
    {context}

    Question: {question}

    Answer:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    rag_chain = prompt | llm | StrOutputParser()
    answer = rag_chain.invoke({"context": context, "question": question})

    print("Final Answer:\n", answer)

    return answer, top_docs, retrieved_docs

# Gradio interface
def gradio_interface(question: str, url: str, num_retrieved: int, num_used: int, rerank_method: str):
    """Synchronous UI callback: run ``process_query`` and format its result as text.

    The coroutine is driven to completion with ``asyncio.run``. The report
    holds the answer between two rules, retrieval statistics, the reranking
    method, and every chunk that was placed in the prompt.

    Returns:
        The formatted report, or ``"An error occurred: <message>"`` if anything
        in the pipeline or the formatting raised.
    """
    try:
        pending = process_query(question, url, num_retrieved, num_used, rerank_method)
        answer, top_docs, retrieved_docs = asyncio.run(pending)

        sections = [
            f"{'-'*100}\nAnswer: {answer}\n{'-'*100}\n\n",
            f"Retrieved documents: {len(retrieved_docs)}\n",
            f"Documents used for question answering: {len(top_docs)}\n",
            f"Reranking method: {rerank_method}\n\n",
            "Chunks used for question answering:\n",
        ]
        # One trailing entry per chunk, numbered from 1.
        sections.extend(
            f"\nChunk {i}:\n{doc.page_content}\n{'-'*50}"
            for i, doc in enumerate(top_docs, 1)
        )
        return "".join(sections)
    except Exception as e:
        # Surface the failure in the UI instead of letting the callback raise.
        return f"An error occurred: {str(e)}"

# Create Gradio interface with updated inputs
# The input components are listed in the same order as gradio_interface's
# parameters: question, url, num_retrieved, num_used, rerank_method.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Enter your question", value="How can I take care of my eyes?"),
        gr.Textbox(label="Enter the URL to scrape", value="https://medlineplus.gov/eyecare.html"),
        # Both sliders move in whole steps between 1 and 100.
        gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Number of retrieved documents"),
        gr.Slider(minimum=1, maximum=100, value=5, step=1, label="Number of documents to use"),
        # "Nano" and "BM25" are matched by name in process_query; any other
        # value (here "No Reranking") takes its no-reranking branch.
        gr.Radio(["No Reranking", "Nano", "BM25"], label="Reranking Method", value="No Reranking")
    ],
    outputs="text",
    title="RAG Query Processing with Options",
    description="Enter a question and a URL. Adjust the number of retrieved documents and the number to use for answering. Choose a reranking method. The system will scrape the URL, process its content, and answer your question based on the scraped information. All used chunks will be displayed."
)

# In the recorded run of this cell the launch printed a public *.gradio.live URL
# (share=True) and kept the cell running to show errors and logs (debug=True).
if __name__ == "__main__":
    iface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://7bfce9ac0bdade813a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Scraping URL: https://medlineplus.gov/eyecare.html
Scraped text length: 9507 characters
Number of sentence windows: 69
Creating embeddings...
Embeddings created successfully.
Performing similarity search for top 20 documents...
Retrieved 20 documents.
No reranking
Final Answer:
 Based on the context, here's a reasoning step and analysis:

The context provides several tips for taking care of one's eyes, including:

1. Eating a healthy, balanced diet that includes plenty of fruits and vegetables, especially deep yellow and green leafy vegetables.
2. Eating fish high in omega-3 fatty acids, such as salmon, tuna, and halibut.
3. Maintaining a healthy weight.
4. Getting regular exercise.
5. Getting regular eye check-ups as recommended by a healthcare provider, or if you have any new vision problems.

Considering these tips, the answer to the question "How can I take care of my eyes?" is:

**To take care of your eyes, eat a healthy, balanced diet, maintain a healthy weight, get regular exerc

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize components
# These module-level objects are shared by the functions defined below in this cell.
# Embedding model handed to FAISS.from_documents in process_query.
embeddings = FireworksEmbeddings(model="nomic-ai/nomic-embed-text-v1.5")
# Chat model that produces the final answer in process_query's prompt | llm chain.
llm = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-8b-instruct", temperature=0)
# FlashRank ranker built with its default arguments; used when rerank_method == "Nano".
ranker_nano = Ranker()

# Load BERT model for semantic chunking
# `tokenizer` is what semantic_chunking uses to count tokens per sentence.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# NOTE(review): `model` is not referenced by any code visible in this cell (only
# `tokenizer` is) - confirm whether loading the model weights is still needed.
model = AutoModel.from_pretrained("bert-base-uncased")

async def scrape_webpage(url: str) -> str:
    """Fetch ``url`` and return the page's text as one space-joined string.

    Redirects are followed explicitly: httpx does not follow them by default,
    and ``raise_for_status()`` rejects an unfollowed 3xx response, so without
    ``follow_redirects=True`` a redirecting URL (for example http -> https)
    would end in the error path below.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text fragments of the parsed HTML joined by single
        spaces, or ``""`` if the request or parsing failed for any reason
        (the error is printed, not raised).
    """
    # 10 s overall budget per phase, but give up on connecting after 3 s.
    timeout = httpx.Timeout(10.0, connect=3.0)
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        try:
            response = await client.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            return ' '.join(soup.stripped_strings)
        except Exception as e:
            # Best-effort scraper: report the problem and let the caller decide
            # what an empty result means.
            print(f"Error scraping {url}: {e}")
            return ""

# Prose chunking methods
def create_sentence_windows(text: str, window_size: int = 3) -> List[str]:
    """Return, for every sentence in ``text``, that sentence with its neighbours.

    ``text`` is split on the literal ``'. '``. For the sentence at position
    ``p`` the window covers positions ``p - window_size`` to ``p + window_size``
    inclusive, clipped to the available sentences, and is re-joined with
    ``'. '``.

    Args:
        text: Text to split.
        window_size: Number of neighbouring sentences kept on each side.

    Returns:
        One window string per sentence, in sentence order. An empty ``text``
        yields a single empty window.
    """
    pieces = text.split('. ')
    count = len(pieces)
    result: List[str] = []
    for position in range(count):
        lower = max(0, position - window_size)
        upper = min(count, position + window_size + 1)
        result.append('. '.join(pieces[lower:upper]))
    return result

def semantic_chunking(text: str, max_chunk_size: int = 512) -> List[str]:
    """Pack consecutive sentences into chunks of roughly ``max_chunk_size`` tokens.

    Sentences come from splitting on the literal ``'. '``; their length is
    measured with the module-level ``tokenizer`` (special tokens excluded).
    A new chunk is started when adding the next sentence would exceed the
    budget. Despite the name, grouping depends only on token counts.

    A single sentence longer than ``max_chunk_size`` still becomes its own
    (oversized) chunk, and the ``'. '`` joiners are not counted.

    Args:
        text: Text to chunk.
        max_chunk_size: Token budget per chunk.

    Returns:
        The chunks in document order, each ending in sentence punctuation.
        Blank ``text`` gives ``[]``.
    """
    if not text.strip():
        # Previously produced a single "." chunk for empty input.
        return []

    def close_chunk(sentences: List[str]) -> str:
        joined = '. '.join(sentences)
        # The split removed the period of every sentence but the last one in
        # the text; only restore it when it is actually missing so the final
        # chunk does not end in "..".
        return joined if joined.endswith(('.', '!', '?')) else joined + '.'

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for sentence in text.split('. '):
        token_count = len(tokenizer.encode(sentence, add_special_tokens=False))
        if current_chunk and current_length + token_count > max_chunk_size:
            chunks.append(close_chunk(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += token_count

    if current_chunk:
        chunks.append(close_chunk(current_chunk))

    return chunks

# Code chunking methods
def code_block_chunking(text: str, max_lines: int = 50) -> List[str]:
    """Split ``text`` into consecutive chunks of at most ``max_lines`` lines.

    Args:
        text: Text to split on ``'\\n'``.
        max_lines: Maximum number of lines per chunk; must be at least 1.

    Returns:
        The chunks in order, each with its lines re-joined by ``'\\n'``. Every
        chunk except possibly the last has exactly ``max_lines`` lines.

    Raises:
        ValueError: If ``max_lines`` is smaller than 1. Such values used to
            produce a spurious empty first chunk followed by one-line chunks.
    """
    if max_lines < 1:
        raise ValueError(f"max_lines must be at least 1, got {max_lines}")

    lines = text.split('\n')
    return [
        '\n'.join(lines[start:start + max_lines])
        for start in range(0, len(lines), max_lines)
    ]

def function_based_chunking(text: str) -> List[str]:
    """Extract Python function definitions from ``text``, one chunk per function.

    A chunk starts at ``def`` (or ``async def``) and runs up to, but not
    including, the newline that precedes the next line beginning with a
    non-whitespace character, or to the end of the text. Signatures with a
    return annotation (``def f(x) -> int:``) are recognised.

    Limitations of this regex-based approach: it only understands Python
    syntax, parameter lists containing ``)`` (e.g. a call as a default value)
    are not matched, and because a chunk only ends at a non-indented line,
    indented methods that follow one another end up in the same chunk.

    Args:
        text: Source text to scan.

    Returns:
        The matched function texts in order of appearance; ``[]`` if none.
    """
    function_pattern = re.compile(
        r'((?:\basync\s+)?\bdef\s+\w+\s*\([^)]*\)'  # [async] def name(params)
        r'\s*(?:->[^:\n]+)?:'                       # optional "-> annotation", then ":"
        r'.*?(?=\n\S|\Z))',                         # body up to the next top-level line
        re.DOTALL,
    )
    return function_pattern.findall(text)

# Code and prose chunking methods
def code_and_text_separation(text: str) -> List[str]:
    """Split ``text`` into alternating prose and code chunks.

    Two kinds of code block are recognised:

    * fenced blocks: from a line starting with ``` (after stripping) through
      the next such line, both fence lines included in the code chunk;
    * indented blocks: a run of lines starting with four spaces, which ends at
      the first non-blank line that is not indented that way.

    Everything else is prose. The previous single-flag implementation put the
    closing fence into the following prose chunk and never ended an indented
    block.

    Lines inside a fenced block are never examined for indentation.

    Args:
        text: Text to split on ``'\\n'``.

    Returns:
        The chunks in document order, each with its lines joined by ``'\\n'``.
    """
    chunks: List[str] = []
    current: List[str] = []
    mode = "text"  # one of "text", "fenced", "indented"

    def flush() -> None:
        if current:
            chunks.append('\n'.join(current))
            current.clear()

    for line in text.split('\n'):
        is_fence = line.strip().startswith('```')

        if mode == "fenced":
            current.append(line)
            if is_fence:
                # Closing fence belongs to the code chunk.
                flush()
                mode = "text"
            continue

        if mode == "indented":
            if line.startswith('    ') or not line.strip():
                # Still inside the indented block (blank lines do not end it).
                current.append(line)
                continue
            flush()
            mode = "text"
            # Fall through: this line starts whatever comes after the block.

        if is_fence:
            flush()
            mode = "fenced"
        elif line.startswith('    ') and line.strip():
            flush()
            mode = "indented"
        current.append(line)

    flush()
    return chunks

def preserve_context(text: str, context_lines: int = 2) -> List[str]:
    """Return each fenced code block together with its surrounding lines.

    A block runs from a line starting with ``` (after stripping) to the next
    such line. For every block that has at least one line between its fences,
    the chunk spans from ``context_lines`` lines before the opening fence to
    ``context_lines`` lines after the closing fence, clipped to the text.

    The window used to be measured from the closing fence only, which cut off
    the beginning of any block longer than ``context_lines`` and all of the
    leading context.

    Blocks with nothing between the fences and a block left unclosed at the
    end of the text produce no chunk.

    Args:
        text: Text to scan, split on ``'\\n'``.
        context_lines: Number of lines of context kept on each side.

    Returns:
        One chunk per qualifying code block, in document order.
    """
    lines = text.split('\n')
    chunks: List[str] = []
    block_start = None   # index of the opening fence while inside a block
    has_body = False     # True once a line has been seen inside the block

    for i, line in enumerate(lines):
        if not line.strip().startswith('```'):
            if block_start is not None:
                has_body = True
            continue

        if block_start is None:
            # Opening fence.
            block_start = i
            has_body = False
            continue

        # Closing fence.
        if has_body:
            context_start = max(0, block_start - context_lines)
            context_end = min(len(lines), i + context_lines + 1)
            chunks.append('\n'.join(lines[context_start:context_end]))
        block_start = None

    return chunks

def batch_rerank(ranker, query, documents, batch_size=512):
    """Score ``documents`` against ``query`` with ``ranker``, ``batch_size`` at a time.

    For each consecutive slice of ``documents`` a ``RerankRequest`` is built
    whose passages carry the document's index in the full list as ``id`` and
    its ``page_content`` as ``text``. The outputs of ``ranker.rerank`` are
    appended slice after slice; no ordering across slices is applied here.

    Args:
        ranker: Object exposing ``rerank(request)``.
        query: Query text the passages are scored against.
        documents: Sequence of objects with a ``page_content`` attribute.
        batch_size: Maximum number of passages per rerank call.

    Returns:
        The concatenation of whatever ``ranker.rerank`` returned for each slice.
    """
    combined = []
    for first in range(0, len(documents), batch_size):
        window = documents[first:first + batch_size]
        request = RerankRequest(
            query=query,
            passages=[{"id": first + k, "text": item.page_content} for k, item in enumerate(window)],
        )
        combined.extend(ranker.rerank(request))
    return combined

def bm25_rerank(query, documents):
    """Order ``documents`` by BM25 relevance to ``query``.

    Query and documents are tokenised the same way: lower-cased and split into
    runs of word characters. This lets "Eyes?" in a question match "eyes." in
    a document, which plain whitespace splitting does not.

    Args:
        query: Query text.
        documents: Sequence of objects with a ``page_content`` attribute.

    Returns:
        A list of ``{"id", "text", "score"}`` dicts sorted by descending score.
        ``id`` is the document's index in ``documents`` and ``score`` is a
        plain ``float``. An empty ``documents`` gives ``[]``.
    """
    if not documents:
        # Nothing to rank; avoid building a BM25 model over an empty corpus.
        return []

    def tokenize(text):
        return re.findall(r"\w+", text.lower())

    bm25 = BM25Okapi([tokenize(doc.page_content) for doc in documents])
    scores = bm25.get_scores(tokenize(query))
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [
        {"id": i, "text": documents[i].page_content, "score": float(scores[i])}
        for i in ranked_indices
    ]

async def process_query(question: str, url: str, num_retrieved: int, num_used: int, rerank_method: str, chunking_method: str):
    """Answer ``question`` from the text of ``url``: scrape, chunk, index, retrieve, rerank, ask the LLM.

    The page text is cut into chunks with the selected chunker, the chunks are
    embedded into an in-memory FAISS index, the ``num_retrieved`` nearest
    chunks are fetched, optionally reranked, and the best ``num_used`` are
    given to the LLM as context.

    Args:
        question: The user's question; also the retrieval and rerank query.
        url: Page to scrape.
        num_retrieved: How many chunks to fetch from the vector store.
        num_used: How many of the (re)ranked chunks go into the prompt.
        rerank_method: ``"Nano"`` reranks with ``ranker_nano``, ``"BM25"``
            with ``bm25_rerank``; any other value keeps the vector-store order.
        chunking_method: One of ``"Sentence Windows"``, ``"Semantic Chunking"``,
            ``"Code Block Chunking"``, ``"Function-Based Chunking"``,
            ``"Code & Text Separation"``, ``"Preserve Context"``.

    Returns:
        ``(answer, top_docs, retrieved_docs)``: the LLM's answer string, the
        documents placed in the prompt, and all retrieved documents.

    Raises:
        ValueError: If no text could be scraped, if ``chunking_method`` is
            unknown, or if the chunker produced no non-blank chunk.
        Exception: Errors from embedding or similarity search are printed and
            re-raised unchanged.
    """
    # Coerce to int so a float value (e.g. 5.0 handed over by a UI widget)
    # cannot break the slice and the FAISS ``k`` argument below.
    num_retrieved = int(num_retrieved)
    num_used = int(num_used)

    print(f"Scraping URL: {url}")
    scraped_text = await scrape_webpage(url)
    print(f"Scraped text length: {len(scraped_text)} characters")

    # scrape_webpage reports failures as "". Stop here with a clear message
    # instead of embedding empty chunks and querying the LLM without context.
    if not scraped_text.strip():
        raise ValueError(f"No text could be scraped from {url}")

    # Apply the selected chunking method
    chunkers = {
        "Sentence Windows": create_sentence_windows,
        "Semantic Chunking": semantic_chunking,
        "Code Block Chunking": code_block_chunking,
        "Function-Based Chunking": function_based_chunking,
        "Code & Text Separation": code_and_text_separation,
        "Preserve Context": preserve_context,
    }
    chunker = chunkers.get(chunking_method)
    if chunker is None:
        raise ValueError(f"Unknown chunking method: {chunking_method}")

    # Blank chunks carry no information and only waste embedding calls.
    chunks = [chunk for chunk in chunker(scraped_text) if chunk.strip()]

    print(f"Number of chunks: {len(chunks)}")

    # Some chunkers legitimately find nothing (e.g. no functions or fenced
    # code on a prose page); an empty index cannot be built or searched.
    if not chunks:
        raise ValueError(f"Chunking method '{chunking_method}' produced no chunks for {url}")

    index_documents = [Document(page_content=chunk) for chunk in chunks]

    try:
        print("Creating embeddings...")
        vectorstore = FAISS.from_documents(index_documents, embeddings)
        print("Embeddings created successfully.")

        print(f"Performing similarity search for top {num_retrieved} documents...")
        # Never ask for more neighbours than there are indexed chunks.
        retrieved_docs = vectorstore.similarity_search(question, k=min(num_retrieved, len(index_documents)))
        print(f"Retrieved {len(retrieved_docs)} documents.")
    except Exception as e:
        print(f"Error during embedding or similarity search: {e}")
        raise

    if rerank_method == "Nano":
        print("Using Nano reranking")
        reranked_results = batch_rerank(ranker_nano, question, retrieved_docs)
    elif rerank_method == "BM25":
        print("Using BM25 reranking")
        reranked_results = bm25_rerank(question, retrieved_docs)
    else:
        print("No reranking")
        # Equal scores plus a stable sort below keep the retrieval order.
        reranked_results = [{"id": i, "text": doc.page_content, "score": 1.0} for i, doc in enumerate(retrieved_docs)]

    reranked_results.sort(key=lambda x: x["score"], reverse=True)
    top_docs = [Document(page_content=result["text"]) for result in reranked_results[:num_used]]

    context = "\n\n".join([doc.page_content for doc in top_docs])

    prompt_template = """
    Use the following context to answer the question. If you cannot answer based on the context, say "I don't have enough information to answer that question."

    Context:
    {context}

    Question: {question}

    Answer:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    rag_chain = prompt | llm | StrOutputParser()
    answer = rag_chain.invoke({"context": context, "question": question})

    print("Final Answer:\n", answer)

    return answer, top_docs, retrieved_docs

# Gradio interface
import gradio as gr

def gradio_interface(question: str, url: str, num_retrieved: int, num_used: int, rerank_method: str, chunking_method: str):
    """Synchronous UI callback: run ``process_query`` and format its result as text.

    The coroutine is driven to completion with ``asyncio.run``. The report
    holds the answer between two rules, retrieval statistics, the reranking
    and chunking methods, and every chunk that was placed in the prompt.

    Returns:
        The formatted report, or ``"An error occurred: <message>"`` if anything
        in the pipeline or the formatting raised.
    """
    try:
        pending = process_query(question, url, num_retrieved, num_used, rerank_method, chunking_method)
        answer, top_docs, retrieved_docs = asyncio.run(pending)

        header = f"{'-'*100}\nAnswer: {answer}\n{'-'*100}\n\n"
        summary = (
            f"Retrieved documents: {len(retrieved_docs)}\n"
            f"Documents used for question answering: {len(top_docs)}\n"
            f"Reranking method: {rerank_method}\n"
            f"Chunking method: {chunking_method}\n\n"
        )
        # One entry per chunk that went into the prompt, numbered from 1.
        chunk_listing = "".join(
            f"\nChunk {i}:\n{doc.page_content}\n{'-'*50}"
            for i, doc in enumerate(top_docs, 1)
        )
        return header + summary + "Chunks used for question answering:\n" + chunk_listing
    except Exception as e:
        # Surface the failure in the UI instead of letting the callback raise.
        return f"An error occurred: {str(e)}"

# Create Gradio interface with updated inputs
# The input components are listed in the same order as gradio_interface's
# parameters: question, url, num_retrieved, num_used, rerank_method, chunking_method.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Enter your question", value="How can I take care of my eyes?"),
        gr.Textbox(label="Enter the URL to scrape", value="https://medlineplus.gov/eyecare.html"),
        # Both sliders move in whole steps between 1 and 100.
        gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Number of retrieved documents"),
        gr.Slider(minimum=1, maximum=100, value=5, step=1, label="Number of documents to use"),
        # "Nano" and "BM25" are matched by name in process_query; any other
        # value (here "No Reranking") takes its no-reranking branch.
        gr.Radio(["No Reranking", "Nano", "BM25"], label="Reranking Method", value="No Reranking"),
        # These labels must match the chunking_method names that process_query
        # checks; an unrecognised name makes it raise ValueError.
        gr.Radio([
            "Sentence Windows", "Semantic Chunking",  # Prose options
            "Code Block Chunking", "Function-Based Chunking",  # Code options
            "Code & Text Separation", "Preserve Context"  # Code and prose options
        ], label="Chunking Method", value="Sentence Windows")
    ],
    outputs="text",
    title="RAG Query Processing with Advanced Options",
    description="Enter a question and a URL. Adjust the number of retrieved documents, the number to use for answering, choose a reranking method, and select a chunking strategy. The system will scrape the URL, process its content, and answer your question based on the scraped information. All used chunks will be displayed."
)

# In the recorded run of this cell the launch printed a public *.gradio.live URL
# (share=True) and kept the cell running to show errors and logs (debug=True).
if __name__ == "__main__":
    iface.launch(share=True, debug=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://99cff9edacc38c4850.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Scraping URL: https://medlineplus.gov/eyecare.html
Scraped text length: 9507 characters
Number of chunks: 69
Creating embeddings...
Embeddings created successfully.
Performing similarity search for top 20 documents...
Retrieved 20 documents.
No reranking
Final Answer:
 According to the context, you can take care of your eyes by:

* Eating a healthy, balanced diet that includes plenty of fruits and vegetables, especially deep yellow and green leafy vegetables, and fish high in omega-3 fatty acids such as salmon, tuna, and halibut.
* Maintaining a healthy weight.
* Getting regular exercise.
* Getting your eyes checked as often as your health care provider recommends it, or if you have any new vision problems.

By following these tips, you can help keep your eyes healthy and make sure you are seeing your best.
Scraping URL: https://medlineplus.gov/eyecare.html
Scraped text length: 9507 characters
Number of chunks: 5
Creating embeddings...
Embeddings created successfully.
Performing simila