<a href="https://colab.research.google.com/github/me-ibad/NPL-Project-Task-3/blob/main/retriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install faiss-cpu sentence-transformers PyPDF2

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-tran

In [3]:
# retriever.py

import os
import pickle
from typing import List
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

class Retriever:
    """
    A document retriever using SentenceTransformers for embeddings and FAISS for vector search.

    Texts are split into word-based chunks; every chunk is embedded, L2-normalised
    and stored in a flat L2 FAISS index. ``self.documents[i]`` is the chunk text
    belonging to the vector with id ``i`` in ``self.index``.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', chunk_size=300, overlap=50):
        """
        Args:
            model_name: Model name handed to ``SentenceTransformer``.
            chunk_size: Maximum number of whitespace-separated words per chunk.
            overlap: Number of words shared by two consecutive chunks.
        """
        self.model = SentenceTransformer(model_name)
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.index = None      # FAISS index; created on the first add_documents() or by load()
        self.documents = []    # chunk texts, positionally aligned with the index ids
        self.embeddings = []   # normalised vectors of all indexed chunks (refreshed by add_documents)

    def _chunk_text(self, text: str) -> List[str]:
        """
        Split ``text`` into chunks of at most ``chunk_size`` words.

        Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
        chunk reaches the end of the text, so no trailing chunk is produced that
        would only repeat words already contained in the previous chunk.

        Returns:
            The list of chunks (empty when ``text`` contains no words).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
                in the range ``[0, chunk_size)``.
        """
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
        if not 0 <= self.overlap < self.chunk_size:
            # overlap >= chunk_size gives a zero/negative stride; a negative
            # overlap would silently skip words between chunks.
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={self.overlap}, chunk_size={self.chunk_size}"
            )

        words = text.split()
        step = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(words), step):
            end = start + self.chunk_size
            chunks.append(' '.join(words[start:end]))
            if end >= len(words):
                # This chunk already covers the tail of the text.
                break
        return chunks

    def add_documents(self, texts: List[str]):
        """
        Add and index documents. Accepts list of strings.

        May be called repeatedly: new chunks are appended to the existing index,
        so index ids stay aligned with ``self.documents``. Texts that produce no
        chunks (for example empty strings) are ignored.
        """
        new_chunks = [chunk for text in texts for chunk in self._chunk_text(text)]
        if not new_chunks:
            # Nothing to embed; leave the current state untouched.
            return

        embeddings = self.model.encode(new_chunks, convert_to_numpy=True, show_progress_bar=True)
        # With unit-length vectors, ranking by L2 distance matches ranking by cosine similarity.
        embeddings = normalize(embeddings)

        if self.index is None:
            self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)
        # Record the chunk texts only after the vectors were accepted by the index,
        # so a failure above cannot leave documents and index out of step.
        self.documents.extend(new_chunks)
        # Keep the cached matrix aligned with everything that is in the index.
        self.embeddings = self.index.reconstruct_n(0, self.index.ntotal)

    def query(self, query_text: str, top_k=3) -> List[str]:
        """
        Retrieve top_k relevant document chunks for the given query.

        Returns:
            Up to ``top_k`` chunks, best match first. Fewer are returned when the
            index holds fewer chunks; an empty list when it holds none or when
            ``top_k`` is not positive.

        Raises:
            RuntimeError: If no index exists yet (neither ``add_documents`` nor
                ``load`` has been called successfully).
        """
        if self.index is None:
            raise RuntimeError("No index available: call add_documents() or load() first.")

        # Never ask FAISS for more neighbours than it stores: missing results
        # are reported as id -1, which would otherwise index documents[-1].
        k = min(top_k, self.index.ntotal)
        if k <= 0:
            return []

        query_vec = self.model.encode([query_text], convert_to_numpy=True)
        query_vec = normalize(query_vec)
        _, ids = self.index.search(query_vec, k)
        return [self.documents[i] for i in ids[0] if i >= 0]

    def save(self, path: str):
        """
        Save index and document chunks.

        Writes ``{path}.index`` (FAISS index) and ``{path}.pkl`` (pickled chunk list).

        Raises:
            RuntimeError: If there is no index to save.
        """
        if self.index is None:
            raise RuntimeError("No index to save: call add_documents() or load() first.")
        faiss.write_index(self.index, f"{path}.index")
        with open(f"{path}.pkl", "wb") as f:
            pickle.dump(self.documents, f)

    def load(self, path: str):
        """
        Load index and documents from disk.

        Reads ``{path}.index`` and ``{path}.pkl`` as written by ``save``.

        Raises:
            ValueError: If the number of stored chunks does not match the number
                of vectors in the index.
        """
        index = faiss.read_index(f"{path}.index")
        # SECURITY: pickle.load can execute arbitrary code contained in the file.
        # Only load files that were produced by save() from a trusted source.
        with open(f"{path}.pkl", "rb") as f:
            documents = pickle.load(f)

        if len(documents) != index.ntotal:
            raise ValueError(
                f"Inconsistent retriever files for '{path}': "
                f"{len(documents)} chunks but {index.ntotal} indexed vectors."
            )

        self.index = index
        self.documents = documents
        # The previously cached matrix (if any) belongs to the replaced index.
        self.embeddings = []


In [4]:
import os
from PyPDF2 import PdfReader

def load_text_file(path):
    """Return the complete contents of the file at ``path``, decoded as UTF-8."""
    with open(path, encoding="utf-8") as source:
        contents = source.read()
    return contents

def load_pdf_file(path):
    """
    Return the text of the PDF at ``path``.

    The text of each page is taken from ``extract_text()``; a page for which
    that call returns a falsy value contributes an empty string. Page texts
    are joined with newlines.
    """
    page_texts = []
    for page in PdfReader(path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def load_documents_from_folder(folder_path):
    """
    Load every supported document located directly inside ``folder_path``.

    Files whose names end in ``.txt`` or ``.md`` are read with
    ``load_text_file`` and files ending in ``.pdf`` with ``load_pdf_file``.
    Extensions are matched case-insensitively. Sub-directories are not
    descended into, and entries that are not regular files are skipped.

    Args:
        folder_path: Directory to scan.

    Returns:
        A list with the text of each loaded document, ordered by file name.
    """
    docs = []
    # os.listdir returns entries in arbitrary order; sorting keeps the document
    # (and therefore chunk) order reproducible between runs.
    for fname in sorted(os.listdir(folder_path)):
        fpath = os.path.join(folder_path, fname)
        if not os.path.isfile(fpath):
            # e.g. a directory that happens to be named "notes.txt"
            continue
        lowered = fname.lower()
        if lowered.endswith((".txt", ".md")):
            docs.append(load_text_file(fpath))
        elif lowered.endswith(".pdf"):
            docs.append(load_pdf_file(fpath))
    return docs


In [6]:
# Chunking configuration for this demo run
CHUNK_SIZE = 21     # words per chunk
CHUNK_OVERLAP = 0   # words shared between consecutive chunks

# Initialize
retriever = Retriever(chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)

# Load your documents from the current folder
documents = load_documents_from_folder(".")
if not documents:
    # Without documents there is nothing to index or to query.
    raise FileNotFoundError("No .txt, .md or .pdf files found in the current folder.")

# Add documents to the retriever
retriever.add_documents(documents)

# Query the retriever
query = "What does python use?"
results = retriever.query(query)

# Show result
print("Query:", query)
print("Top result:", results[0] if results else "(no matching chunk)")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query: What does python use?
Top result: is a programming language that lets you work quickly and integrate systems more effectively. It is widely used in web development,


In [None]:
def test_retriever():
    """
    Smoke test for Retriever: index a single sentence and check that the
    best match for a related question contains the word "Python".
    """
    retriever_under_test = Retriever()
    corpus = ["Python is a popular programming language."]
    retriever_under_test.add_documents(corpus)

    hits = retriever_under_test.query("What is Python?")
    best_hit = hits[0]
    assert "Python" in best_hit
    print("✅ Test passed")


test_retriever()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Test passed
