# Semantic Search & RAG with LlamaIndex
## ABB #5 - Session 3

Code authored by: Shaw Talebi

### imports

In [None]:
from IPython.display import display, Markdown
from bs4 import BeautifulSoup

from llama_index.core import VectorStoreIndex, get_response_synthesizer, Settings
from llama_index.core.schema import TextNode
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from typing import Optional, Dict, Any



In [None]:
from dotenv import load_dotenv
import os

# import sk from .env file
load_dotenv()
#my_sk = os.getenv("OPENAI_API_KEY_PERSONAL")

In [None]:
from typing import List, Dict
import requests
import os
from bs4 import BeautifulSoup

def fetch_page_text(url: str) -> str:
    """Download ``url`` and return its visible text as one whitespace-normalized string.

    Failures are never raised to the caller: any exception raised while
    requesting or parsing the page is turned into a string of the form
    ``"Error fetching <url>: <exception>"`` and returned instead.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        parsed = BeautifulSoup(page.text, "html.parser")
        # Drop tags whose contents are not part of the readable page text
        for hidden_tag in parsed(["script", "style", "noscript"]):
            hidden_tag.decompose()
        raw_text = parsed.get_text(separator=" ", strip=True)
        # Collapse every run of whitespace into a single space
        return " ".join(raw_text.split())
    except Exception as exc:
        return f"Error fetching {url}: {exc}"

def serpapi_search_and_scrape(query: str, num_results: int = 2) -> List[Dict[str, str]]:
    """
    Run a Google search through SerpAPI and scrape the top organic results.

    Args:
        query: Search string sent to SerpAPI.
        num_results: Maximum number of organic results to fetch and scrape.

    Returns:
        A list of dicts ``[{"url": ..., "text": ...}, ...]``, one per organic
        result that carries a link, in the order SerpAPI returned them.
        ``text`` is whatever ``fetch_page_text`` returns for that link.

    Raises:
        ValueError: If the ``SERP_API_KEY`` environment variable is unset or empty.
        requests.RequestException: If the SerpAPI request fails, times out,
            or comes back with an HTTP error status.
    """
    api_key = os.environ.get("SERP_API_KEY")
    if not api_key:
        raise ValueError("SERP_API_KEY environment variable not set.")
    params = {
        "q": query,
        "api_key": api_key,
        "engine": "google",
        "num": num_results,
    }
    # Without a timeout requests waits indefinitely on a stalled connection;
    # use the same 10 s bound that fetch_page_text applies to page downloads.
    response = requests.get("https://serpapi.com/search", params=params, timeout=10)
    response.raise_for_status()
    # The API may return more entries than requested, so cap the list here too
    organic_results = response.json().get("organic_results", [])[:num_results]
    links = (result.get("link") for result in organic_results)
    # Results without a link are skipped
    return [{"url": link, "text": fetch_page_text(link)} for link in links if link]

def test_serpapi_search_and_scrape():
    """Smoke-test the search-and-scrape helper by printing a preview of each result."""
    separator = '-' * 60
    hits = serpapi_search_and_scrape("What is fine-tuning?")
    for position, hit in enumerate(hits, start=1):
        preview = hit['text'][:500]
        print(f"Result {position}: {hit['url']}\nText (first 500 chars):\n{preview}\n{separator}")

test_serpapi_search_and_scrape()

### 1) chunk articles

In [None]:
# List every entry in the articles directory. os.listdir returns names in
# arbitrary order, so sort them to keep the chunk order (and the position-based
# node ids built from chunk_list) reproducible between runs.
filename_list = ["articles/"+f for f in sorted(os.listdir('articles'))]

chunk_list = []
for filename in filename_list:
    # only process .html files
    if not filename.lower().endswith('.html'):
        continue

    # read and parse the html file
    with open(filename, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    # Use the <title> tag as the article title when there is one (looked up once)
    title_tag = soup.find('title')
    article_title = title_tag.get_text().strip() if title_tag else "Untitled"

    # Text that appears before the first header is filed under this default section
    current_section = "Main"

    # Walk headers and text containers in the order find_all returns them
    for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol']):
        if element.name in ('h1', 'h2', 'h3'):
            # a header starts a new section for the chunks that follow
            current_section = element.get_text().strip()
            continue

        text = element.get_text().strip()
        # Only keep content that's at least 30 characters long
        if len(text) >= 30:
            chunk_list.append({
                'article_title': article_title,
                'section': current_section,
                'text': text
            })

In [None]:
# wrap each chunk in a LlamaIndex TextNode; the chunk's position in chunk_list becomes its id
node_list = [
    TextNode(
        id_=str(position),
        text=chunk["text"],
        metadata={
            "article": chunk["article_title"],
            "section": chunk["section"],
        },
    )
    for position, chunk in enumerate(chunk_list)
]

print(len(node_list))

### 2) create index

In [None]:
# This uses the OpenAI API key, which is rate-limited when I run it, so it is commented out in favor of a free HuggingFace embedding model (see below)
#index = VectorStoreIndex(node_list)
#print(f"Embedding Model: {index._embed_model.model_name}")
#print(f"Index Size: {len(index.vector_store.data.embedding_dict)}")
#print(f"Embedding Size: {len(index.vector_store.data.embedding_dict["0"])}")

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# changing embedding model
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

In [None]:
# build the vector index over all nodes
index = VectorStoreIndex(node_list)

# Report the embedding model, the number of stored embeddings, and the length of
# the embedding stored under node id "0".
# The inner key uses single quotes: reusing the f-string's own quote character
# inside the braces is a SyntaxError on Python versions before 3.12.
print(f"Embedding Model: {index._embed_model.model_name}")
print(f"Index Size: {len(index.vector_store.data.embedding_dict)}")
print(f"Embedding Size: {len(index.vector_store.data.embedding_dict['0'])}")

### 3) semantic search

In [None]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

In [None]:
results = retriever.retrieve("What is the main topic of the article?")

In [None]:
results[0]

In [None]:
results = retriever.retrieve("When do I perform fine-tuning?")

In [None]:
results[0]

In [None]:
# format results in markdown: one numbered entry per retrieved result, built in a
# single join instead of repeated string concatenation.
# Metadata keys use single quotes because reusing the f-string's own quote
# character inside the braces is a SyntaxError on Python versions before 3.12.
results_markdown = "".join(
    f"{i}. **Article title:** {result.metadata['article']}  \n"
    f"   **Section:** {result.metadata['section']}  \n"
    f"   **Snippet:** {result.text} \n\n"
    f"   **Score:** {result.score} \n\n"
    for i, result in enumerate(results, start=1)
)

In [None]:
display(Markdown(results_markdown))

#### Change the default model to use ollama
* Ensure ``ollama serve`` works before using the local open source ollama model

* ``ollama list`` to see which local models are available. Then pick one from the list as shown in the code below

In [None]:
from llama_index.llms.ollama import Ollama

# Switch to a local LLM (Ollama, e.g., llama3) to avoid OpenAI rate limits
Settings.llm = Ollama(model="llama3.2:latest")

### 4) RAG

In [None]:
# This code demonstrates a Retrieval-Augmented Generation (RAG) query over the data that has been previously indexed (i.e., the Nodes from your document collection).
# 
# 1. `get_response_synthesizer()` is called with no arguments, so the library defaults decide how retrieved text is turned into an answer; it does not provide any new data to the model.
# 2. `index.as_query_engine(response_synthesizer=response_synthesizer)` creates a query engine whose retrieval step draws on the Nodes stored in `index`—the chunks of your ingested articles.
# 3. When you call `query_engine.query(...)`, the engine retrieves relevant Nodes from the index and passes their content to the LLM for answer synthesis.
# 4. NOTE: only the retrieved *context* is limited to the indexed Nodes. No web search is performed here, but the LLM still carries the knowledge from its own training, so nothing in this code guarantees the answer comes solely from your articles.
# 5. Presumably the default prompt asks the model to answer from the supplied context rather than prior knowledge (inspect it with `response_synthesizer.get_prompts()` to confirm); that is an instruction to the model, not a hard constraint.

# configure response synthesizer (library defaults)
response_synthesizer = get_response_synthesizer()

# create a query engine that retrieves its context from the indexed Nodes
query_engine = index.as_query_engine(response_synthesizer=response_synthesizer)

# query the index: the response is synthesized by the LLM from the retrieved Nodes
test_response = query_engine.query('When do I perform fine-tuning?') #("What is the main topic of the article?")
print(test_response)

In [None]:
response_synthesizer.get_prompts()

In [None]:
# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

In [25]:
def synthesize_final_answer(rag_answer: str, serp_snippets: List[str], query: str, llm) -> str:
    """Ask ``llm`` to merge a RAG answer and Google search text into one final answer.

    Args:
        rag_answer: Answer produced from the indexed documents.
        serp_snippets: Text passages obtained from Google search; each one
            becomes a ``- `` bullet in the prompt.
        query: The user's original question.
        llm: Object exposing ``complete(prompt)`` whose result has a ``.text``
            attribute.

    Returns:
        The ``.text`` of the LLM completion for the assembled prompt.
    """
    # One bullet per search passage; an empty list yields an empty bullet section
    snippet_bullets = "\n".join(f"- {s}" for s in serp_snippets)
    prompt = (
        f"Given the following question:\n{query}\n\n"
        "Here is an answer based on a private document collection, and some recent Google search results.\n"
        f"Document-based answer:\n{rag_answer}\n\n"
        "Google search snippets:\n"
        f"{snippet_bullets}\n\n"
        "Please synthesize a final, concise, and accurate answer to the question, using both sources. "
        "If the sources disagree, explain the difference."
    )
    # Use the LLM to synthesize the final answer
    return llm.complete(prompt).text

def augmented_rag_query(
    query: str,
    rag_query_engine,
    llm,
    serpapi_num_results: int = 3
) -> str:
    """Answer ``query`` from the index, add scraped Google results, and merge both with ``llm``.

    The query engine's response is converted to a string, the text of each
    scraped search result is collected, and both are handed to
    ``synthesize_final_answer`` together with the original question.
    """
    # Answer from the indexed documents
    document_answer = str(rag_query_engine.query(query))
    # Text of the pages found through Google search
    scraped_pages = serpapi_search_and_scrape(query, num_results=serpapi_num_results)
    page_texts = [page["text"] for page in scraped_pages]
    # Let the LLM combine both sources
    return synthesize_final_answer(document_answer, page_texts, query, llm)

# Example usage: run one augmented query and print either the answer or the error.
if __name__ == "__main__":
    test_query = "When do I perform fine-tuning?"
    try:
        # Uses the query engine and the LLM currently configured in Settings
        final_answer = augmented_rag_query(
            test_query,
            query_engine,
            Settings.llm,
            serpapi_num_results=3
        )
        print("Augmented RAG + Google Search Answer:\n", final_answer)
    except Exception as e:
        # Any failure is reported on stdout instead of being raised
        print("Error during augmented RAG query:", e)

# Smoke test: check that augmented_rag_query returns a non-empty string
def test_augmented_rag_query_integration():
    """Run one augmented query end to end and print whether it passed or failed."""
    question = "What is Retrieval-Augmented Generation?"
    try:
        result = augmented_rag_query(question, query_engine, Settings.llm, serpapi_num_results=2)
        # Must be a string, and must not be empty
        assert isinstance(result, str) and len(result) > 0
        print("Test passed: augmented_rag_query returns a non-empty string.")
    except Exception as err:
        # Failed assertions and runtime errors are both reported, not raised
        print("Test failed:", err)

test_augmented_rag_query_integration()


Error during augmented RAG query: timed out
Test failed: timed out


In [None]:
response = query_engine.query("When do I perform fine-tuning?")
print(response)

In [None]:
print(f"LLM: {Settings.llm.model}")

In [None]:
from llama_index.llms.openai import OpenAI

# changing the global LLM
Settings.llm = OpenAI("gpt-4o")

In [None]:
# simpler way to make query engine
query_engine = index.as_query_engine()
response = query_engine.query("When do I perform fine-tuning?")
print(response)

In [None]:
print(f"LLM: {Settings.llm.model}")