In [None]:
!pip install langchain langchain_fireworks langchain_community beautifulsoup4 google-search-results chromadb langchainhub sentence-transformers langchain-chroma gradio aiolimiter lxml faiss-cpu flashrank rank_bm25

Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_fireworks
  Downloading langchain_fireworks-0.1.7-py3-none-any.whl.metadata (4.0 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting google-search-results
  Downloading google_search_results-2.4.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.20-py3-none-any.whl.metadata (659 bytes)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting gradio
  Downloading gradio-4.41.0-py3-none-any.whl.metadata (15 kB)
Collecting aiolimiter
  Downloading aiolimiter-1.1.0-py3-none-any.whl.metadata (4.5 kB)


In [None]:
import asyncio
import aiohttp
import os
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_fireworks import FireworksEmbeddings, ChatFireworks
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
import re
import io
import time
import sys
import gradio as gr
from typing import List, Tuple, Any
from langchain_community.vectorstores import FAISS
import numpy as np
from functools import lru_cache
import faiss
import httpx
from urllib.parse import urlparse
from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.retrievers import ContextualCompressionRetriever
from flashrank import Ranker, RerankRequest
import math
from rank_bm25 import BM25Okapi
from sklearn.cluster import AgglomerativeClustering
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tree_sitter_languages import get_language, get_parser
from tree_sitter import Language, Parser
import ast

In [None]:
# Set up API clients
# Keys that are already present in the environment are kept as they are; the
# 'API_KEY' placeholders are only filled in when a variable is missing, so a
# real key exported before starting the kernel is never overwritten.
os.environ.setdefault('FIREWORKS_API_KEY', 'API_KEY')
os.environ.setdefault("SERPER_API_KEY", 'API_KEY')


# Download NLTK data for sentence tokenization
nltk.download('punkt', quiet=True)

In [None]:


# Initialize components
# Module-level clients shared by the functions defined further down in this cell.

# Embedding client for the "nomic-ai/nomic-embed-text-v1.5" model; process_query
# passes it to FAISS when building the per-request index.
embeddings = FireworksEmbeddings(model="nomic-ai/nomic-embed-text-v1.5")
# Chat model that produces the final answer; temperature=0 is presumably chosen
# to keep answers as repeatable as possible.
llm = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-8b-instruct", temperature=0)
# FlashRank reranker constructed with its default arguments (no model name is
# given here); used for the "Nano Bi-Encoder" reranking option.
ranker_nano = Ranker()

async def scrape_webpage(url: str) -> str:
    """Download ``url`` and return its text content as a single string.

    The response body is parsed with BeautifulSoup's ``html.parser`` and all
    text fragments are stripped and joined with single spaces.

    Args:
        url: Address of the page to fetch.

    Returns:
        The space-joined text of the page, or an empty string when the request
        or the parsing fails (the error is printed, not raised).
    """
    timeout = httpx.Timeout(10.0, connect=3.0)
    # httpx does not follow redirects unless asked to, and raise_for_status()
    # treats a 3xx answer as an error, so without follow_redirects=True every
    # redirecting URL (http -> https, moved pages, ...) would come back empty.
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        try:
            response = await client.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            return ' '.join(soup.stripped_strings)
        except Exception as e:
            # Deliberately best-effort: report the problem and let the caller
            # decide what to do with an empty result.
            print(f"Error scraping {url}: {e}")
            return ""

def create_sentence_windows(text: str, window_size: int = 3) -> List[str]:
    """Build one overlapping window of neighbouring sentences per sentence.

    The text is cut on the literal delimiter ``'. '``. For every resulting
    piece, the window consists of that piece plus up to ``window_size`` pieces
    on each side, glued back together with the same delimiter.

    Args:
        text: Text to split.
        window_size: Number of neighbouring pieces to include on each side.

    Returns:
        A list with exactly one window per piece, in document order. An empty
        ``text`` yields a list holding a single empty string.
    """
    delimiter = '. '
    pieces = text.split(delimiter)
    total = len(pieces)
    return [
        delimiter.join(
            pieces[max(0, center - window_size):min(total, center + window_size + 1)]
        )
        for center in range(total)
    ]

def semantic_chunking(text: str, max_chunk_size: int = 1024) -> List[str]:
    """Split ``text`` into pieces and merge neighbours that share a cluster.

    The text is first cut by ``RecursiveCharacterTextSplitter`` on sentence
    terminators (``". "``, ``"? "``, ``"! "``) with ``chunk_size`` set to
    ``max_chunk_size``. NOTE(review): the splitter may return pieces that
    hold several sentences each, not strictly one sentence per piece.
    Each piece is embedded, the embeddings are clustered with
    ``AgglomerativeClustering`` (``distance_threshold=0.5``), and runs of
    consecutive pieces that received the same cluster label are joined with a
    single space.

    Because merged runs are not re-checked against ``max_chunk_size``, a
    returned chunk can be longer than that limit.

    Args:
        text: Text to chunk.
        max_chunk_size: Upper size handed to the initial splitter.

    Returns:
        The merged chunks in document order. When the splitter yields fewer
        than two pieces they are returned unchanged (possibly an empty list),
        without calling the embedding service.
    """
    # Step 1: Split the document into sentence-delimited pieces
    splitter = RecursiveCharacterTextSplitter(
        separators=[". ", "? ", "! "],
        chunk_size=max_chunk_size,
        chunk_overlap=0,
    )
    sentences = splitter.split_text(text)

    # Clustering needs at least two samples, and with zero or one piece there
    # is nothing to group anyway.
    if len(sentences) < 2:
        return sentences

    # Step 2: Create embeddings for each piece (local name avoids shadowing
    # the module-level embedding client)
    embedder = FireworksEmbeddings(model="nomic-ai/nomic-embed-text-v1.5")
    sentence_embeddings = embedder.embed_documents(sentences)

    # Step 3: Cluster pieces based on similarity
    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5)
    labels = clustering.fit(sentence_embeddings).labels_

    # Step 4: Start a new chunk every time the cluster label changes between
    # two neighbouring pieces
    chunks = []
    current_chunk = [sentences[0]]
    for previous_label, label, sentence in zip(labels, labels[1:], sentences[1:]):
        if label != previous_label:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
        current_chunk.append(sentence)
    chunks.append(" ".join(current_chunk))

    return chunks


def ast_based_chunking(code: str, max_chunk_size: int = 1024) -> List[str]:
    """Chunk Python source along function, class and import boundaries.

    The source is parsed with ``ast`` and scanned in source order. Every
    outermost function (sync or async), class, ``import`` and ``from ... import``
    statement is collected; the scan does not descend into a collected node,
    so methods and nested functions appear only as part of their parent and
    are never emitted a second time. Other statements are not collected.

    Collected pieces are packed into chunks, each followed by a blank line. A
    new chunk is started when adding the next piece would exceed
    ``max_chunk_size``. A definition that is larger than ``max_chunk_size`` is
    broken into the statements of its body; any statement (or import) that is
    still too large is split line by line.

    Args:
        code: Python source text.
        max_chunk_size: Target maximum number of characters per chunk.

    Returns:
        The chunks in source order; an empty list when nothing was collected.

    Raises:
        SyntaxError: If ``code`` is not valid Python (raised by ``ast.parse``).
    """
    tree = ast.parse(code)

    chunks: List[str] = []
    current_chunk = ""

    def flush_current():
        # Move the pending chunk (if any) to the result list.
        nonlocal current_chunk
        if current_chunk:
            chunks.append(current_chunk)
            current_chunk = ""

    def add_to_chunk(node_code):
        # Append a piece to the pending chunk, starting a new one when full.
        nonlocal current_chunk
        if len(current_chunk) + len(node_code) > max_chunk_size:
            flush_current()
        current_chunk += node_code + "\n\n"

    def add_by_lines(node_code):
        # Split an oversized piece line by line. The pending chunk is flushed
        # first so that the emitted chunks stay in source order.
        flush_current()
        piece = ""
        for line in node_code.splitlines(keepends=True):
            if piece and len(piece) + len(line) > max_chunk_size:
                chunks.append(piece)
                piece = ""
            piece += line
        if piece:
            chunks.append(piece)

    definition_types = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    wanted_types = definition_types + (ast.Import, ast.ImportFrom)

    def emit(node):
        node_code = ast.get_source_segment(code, node)
        if not node_code:
            return
        if len(node_code) <= max_chunk_size:
            add_to_chunk(node_code)
        elif isinstance(node, definition_types):
            # Too large as a whole: fall back to the statements of its body.
            for sub_node in node.body:
                sub_node_code = ast.get_source_segment(code, sub_node)
                if not sub_node_code:
                    continue
                if len(sub_node_code) > max_chunk_size:
                    add_by_lines(sub_node_code)
                else:
                    add_to_chunk(sub_node_code)
        else:
            # A large import statement: split by lines.
            add_by_lines(node_code)

    # Iterative depth-first scan in source order (children are pushed in
    # reverse so the first child is popped first). Wanted nodes are emitted
    # and not explored further; everything else (try/if blocks, ...) is
    # explored so that conditional imports and definitions are still found.
    pending = [tree]
    while pending:
        node = pending.pop()
        if isinstance(node, wanted_types):
            emit(node)
            continue
        pending.extend(reversed(list(ast.iter_child_nodes(node))))

    # Add any remaining code to the last chunk
    flush_current()

    return chunks


def batch_rerank(ranker, query, documents, batch_size=512):
    """Rerank ``documents`` against ``query`` in slices of ``batch_size``.

    Each slice is turned into a list of ``{"id", "text"}`` passages, where
    ``id`` is the document's position in the full ``documents`` list, wrapped
    in a ``RerankRequest`` and handed to ``ranker.rerank``.

    Args:
        ranker: Object exposing ``rerank(request)``.
        query: Query text the passages are ranked against.
        documents: Sequence of objects with a ``page_content`` attribute.
        batch_size: Maximum number of documents sent per rerank call.

    Returns:
        The results of all rerank calls concatenated batch after batch; the
        combined list is not re-sorted across batches.
    """
    collected = []
    for offset in range(0, len(documents), batch_size):
        window = documents[offset:offset + batch_size]
        passages = []
        for position, doc in enumerate(window):
            # Ids stay unique across batches because they are global indices.
            passages.append({"id": offset + position, "text": doc.page_content})
        request = RerankRequest(query=query, passages=passages)
        collected += ranker.rerank(request)
    return collected

def bm25_rerank(query, documents):
    """Order ``documents`` by their BM25 score for ``query``.

    Query and documents are lower-cased and split on whitespace before
    scoring, so matching is case-insensitive. The returned ``text`` is the
    original, unmodified ``page_content``.

    Args:
        query: Query text.
        documents: Sequence of objects with a ``page_content`` attribute.

    Returns:
        A list of ``{"id", "text", "score"}`` dicts sorted by descending
        score, where ``id`` is the document's index in ``documents``. An empty
        ``documents`` sequence yields an empty list.
    """
    # Nothing to score; also avoids building a BM25 model on an empty corpus.
    if not documents:
        return []

    tokenized_corpus = [doc.page_content.lower().split() for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores(query.lower().split())
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [{"id": i, "text": documents[i].page_content, "score": scores[i]} for i in ranked_indices]

async def process_query(question: str, url: str, num_retrieved: int, num_used: int, rerank_method: str, chunking_method: str):
    """Answer ``question`` from the content of ``url`` with a small RAG pipeline.

    Steps: scrape the page, chunk the text with the selected method, embed the
    chunks into a FAISS index, retrieve the ``num_retrieved`` most similar
    chunks, optionally rerank them, and ask the LLM to answer from the top
    ``num_used`` chunks. Progress is printed along the way.

    Args:
        question: The user's question.
        url: Page to scrape.
        num_retrieved: Number of chunks fetched by similarity search.
        num_used: Number of (reranked) chunks placed in the prompt.
        rerank_method: "Nano Bi-Encoder", "BM25", or anything else for no
            reranking.
        chunking_method: "Sentence Windows", "Semantic Chunking" or
            "AST-Based Chunking".

    Returns:
        A tuple ``(answer, top_docs, retrieved_docs)``: the answer text, the
        documents used in the prompt, and all retrieved documents.

    Raises:
        ValueError: If ``chunking_method`` is unknown, if no text could be
            scraped, or if chunking produced no usable chunk.
        Exception: Errors from embedding or similarity search are printed and
            re-raised unchanged.
    """
    # UI sliders may deliver numeric values that are not ints; ``k`` and the
    # slice bound below need integers.
    num_retrieved = int(num_retrieved)
    num_used = int(num_used)

    print(f"Scraping URL: {url}")
    scraped_text = await scrape_webpage(url)
    print(f"Scraped text length: {len(scraped_text)} characters")

    # scrape_webpage signals failure with an empty string; stop here with a
    # clear message instead of embedding empty content.
    if not scraped_text.strip():
        raise ValueError(f"No text could be scraped from {url}")

    # Apply the selected chunking method
    if chunking_method == "Sentence Windows":
        chunks = create_sentence_windows(scraped_text)
    elif chunking_method == "Semantic Chunking":
        chunks = semantic_chunking(scraped_text)
    elif chunking_method == "AST-Based Chunking":
        chunks = ast_based_chunking(scraped_text)
    else:
        raise ValueError(f"Unknown chunking method: {chunking_method}")

    # Blank chunks carry no information and only cost an embedding call.
    chunks = [chunk for chunk in chunks if chunk and chunk.strip()]
    print(f"Number of chunks: {len(chunks)}")
    if not chunks:
        raise ValueError(f"Chunking method '{chunking_method}' produced no chunks for {url}")

    index_documents = [Document(page_content=chunk) for chunk in chunks]

    try:
        print("Creating embeddings...")
        vectorstore = FAISS.from_documents(index_documents, embeddings)
        print("Embeddings created successfully.")

        print(f"Performing similarity search for top {num_retrieved} documents...")
        # Never ask for more neighbours than there are indexed chunks.
        retrieved_docs = vectorstore.similarity_search(question, k=min(num_retrieved, len(index_documents)))
        print(f"Retrieved {len(retrieved_docs)} documents.")
    except Exception as e:
        print(f"Error during embedding or similarity search: {e}")
        raise

    if rerank_method == "Nano Bi-Encoder":
        print("Using Nano Bi-Encoder reranking")
        reranked_results = batch_rerank(ranker_nano, question, retrieved_docs)
    elif rerank_method == "BM25":
        print("Using BM25 reranking")
        reranked_results = bm25_rerank(question, retrieved_docs)
    else:
        print("No reranking")
        # Equal scores plus a stable sort keep the retrieval order.
        reranked_results = [{"id": i, "text": doc.page_content, "score": 1.0} for i, doc in enumerate(retrieved_docs)]

    reranked_results.sort(key=lambda x: x["score"], reverse=True)
    top_docs = [Document(page_content=result["text"]) for result in reranked_results[:num_used]]

    context = "\n\n".join([doc.page_content for doc in top_docs])

    prompt_template = """
    Use the following context to answer the question. First do a reasoning step and analyze the context. If you cannot answer based on the context, say "I don't have enough information to answer that question."

    Context:
    {context}

    Question: {question}

    Answer:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    rag_chain = prompt | llm | StrOutputParser()
    answer = rag_chain.invoke({"context": context, "question": question})

    print("Final Answer:\n", answer)

    return answer, top_docs, retrieved_docs

# Gradio interface
def gradio_interface(question: str, url: str, num_retrieved: int, num_used: int, rerank_method: str, chunking_method: str):
    """Run the RAG pipeline for one UI request and format the result as text.

    The asynchronous ``process_query`` is driven to completion with
    ``asyncio.run``. The report contains the answer between two rules, a short
    summary of the settings, and every chunk that was used for answering.

    Returns:
        The formatted report, or ``"An error occurred: <message>"`` when any
        exception is raised while processing or formatting.
    """
    try:
        pipeline = process_query(question, url, num_retrieved, num_used, rerank_method, chunking_method)
        answer, top_docs, retrieved_docs = asyncio.run(pipeline)

        sections = [
            f"{'-'*100}\nAnswer: {answer}\n{'-'*100}\n\n",
            f"Retrieved documents: {len(retrieved_docs)}\n",
            f"Documents used for question answering: {len(top_docs)}\n",
            f"Reranking method: {rerank_method}\n",
            f"Chunking method: {chunking_method}\n\n",
            "Chunks used for question answering:\n",
        ]
        # One numbered entry per chunk, counted from 1.
        sections.extend(
            f"\nChunk {i}:\n{doc.page_content}\n{'-'*50}"
            for i, doc in enumerate(top_docs, 1)
        )
        return "".join(sections)
    except Exception as e:
        # Shown in the output box instead of propagating to the UI framework.
        return f"An error occurred: {str(e)}"

# Create Gradio interface with updated inputs
# The six input components are listed in the same order as the parameters of
# gradio_interface (question, url, num_retrieved, num_used, rerank_method,
# chunking_method); each row in `examples` follows that order as well.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Enter your question", value="How can I take care of my eyes?"),
        gr.Textbox(label="Enter the URL to scrape", value="https://medlineplus.gov/eyecare.html"),
        gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Number of retrieved documents"),
        gr.Slider(minimum=1, maximum=100, value=5, step=1, label="Number of documents to use"),
        # The option strings below are compared literally inside process_query.
        gr.Radio(["No Reranking", "Nano Bi-Encoder", "BM25"], label="Reranking Method", value="No Reranking"),
        gr.Radio(["Sentence Windows", "Semantic Chunking", "AST-Based Chunking"], label="Chunking Method", value="Sentence Windows")
    ],
    # gradio_interface returns a single formatted string.
    outputs="text",
    title="RAG Query Processing with Options",
    description="Enter a question and a URL. Adjust the number of retrieved documents, the number to use for answering, choose a reranking method, and select a chunking strategy. The system will scrape the URL, process its content, and answer your question based on the scraped information. All used chunks will be displayed.",
    examples=[
        ["How can I take care of my eyes?", "https://medlineplus.gov/eyecare.html", 20, 5, "No Reranking", "Sentence Windows"],
        ["What are the losses in the code?", "https://raw.githubusercontent.com/matsilv/knowledge-injection-dnn/master/models.py", 20, 5, "No Reranking", "AST-Based Chunking"]
    ]
)

# NOTE(review): share=True presumably requests a publicly reachable Gradio link
# and debug=True keeps the cell attached to the running app — confirm against
# the Gradio version in use before sharing the notebook.
if __name__ == "__main__":
    iface.launch(share=True, debug=True)