# (RAG) system built from scratch using Sentence-Transformers, FAISS, Cross-Encoder reranking, and an open-source LLM

## Install Dependencies & Imports & Setup

In [None]:
!pip install pypdf
!pip install sentence-transformers
!pip install transformers
!pip install faiss-cpu
!pip install torch


In [14]:
import os
import json
import time
import faiss
import numpy as np
import pandas as pd

from pathlib import Path
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from transformers import pipeline

## Load & Read Any File Type

In [15]:

def load_documents(file_path: str):
    """
    Loads the content of .txt, .csv, .jsonl, or .pdf files and returns a list of text documents.
    Perfect for building a RAG corpus.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    ext = file_path.suffix.lower()

    # TXT
    if ext == ".txt":
        return [file_path.read_text(encoding="utf-8")]

    # CSV (first column = content)
    elif ext == ".csv":
        df = pd.read_csv(file_path)
        return df.iloc[:, 0].astype(str).tolist()

    # JSONL (each line = {"text": "..."} )
    elif ext == ".jsonl":
        docs = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                docs.append(json.loads(line).get("text", ""))
        return docs

    # PDF
    elif ext == ".pdf":
        reader = PdfReader(str(file_path))
        pages = [page.extract_text() for page in reader.pages]
        return [p for p in pages if p]   # remove empty pages

    else:
        raise ValueError(f"Unsupported file format: {ext}")


In [17]:
# Load our Corpus

file_path = "/content/Big_Data.pdf"  # source pdf that will be use as a RAG knowledge base
corpus_texts = load_documents(file_path)

print(f"Loaded {len(corpus_texts)} documents from {file_path}")

Loaded 43 documents from /content/Big_Data.pdf


## Embedding Model
We use `all-MiniLM-L6-v2`, a fast and lightweight embedding model ideal for retrieval

In [None]:
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [19]:
# Convert all documents into dense vectors and normalize them for FAISS retrieval (Embeddings).

print("\n Embedding corpus...")

corpus_embeddings = embed_model.encode(
    corpus_texts,
    batch_size=32,
    convert_to_numpy=True,
    show_progress_bar=True
)

# FAISS requires normalized vectors for cosine similarity
faiss.normalize_L2(corpus_embeddings)

dim = corpus_embeddings.shape[1]
print(f"Vector dimension: {dim}")



 Embedding corpus...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Vector dimension: 384


## Build FAISS Index


In [20]:
print("\n Building FAISS index...")

index = faiss.IndexFlatIP(dim)
index.add(corpus_embeddings)

print(f" FAISS index built with {index.ntotal} vectors.")


 Building FAISS index...
 FAISS index built with 43 vectors.


## Ask a Query → Retrieve Top-K


In [21]:
query = "What are the charactristcs of big data? "
print("\n Your query:", query)

# Embed query
q_vec = embed_model.encode(query, convert_to_numpy=True).reshape(1, -1)
faiss.normalize_L2(q_vec)

# FAISS retrieve
scores, idx = index.search(q_vec, k=5)

retrieved_docs = []
for rank, doc_i in enumerate(idx[0]):
    retrieved_docs.append({
        "rank": rank + 1,
        "text": corpus_texts[doc_i]
    })

print("\nTop Retrieved Docs:")
display(pd.DataFrame(retrieved_docs))


 Your query: What are the charactristcs of big data? 

Top Retrieved Docs:


Unnamed: 0,rank,text
0,1,TYPES OF BIG-DATA\n
1,2,THE CHARACTERISTICS OF BIG DATA\n
2,3,WHAT IS BIG DATA\n• As Gartner defines it – “B...
3,4,THE CHARACTERISTICS OF BIG DATA\nVariety\nAs D...
4,5,TYPES OF BIG-DATA\nBig Data is generally categ...


##  Rerank with Cross Encoder (Better Precision)


In [22]:
print("\n Reranking using CrossEncoder...")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

pairs = [[query, d["text"]] for d in retrieved_docs]
rerank_scores = reranker.predict(pairs)

# Attach rerank score
for i, score in enumerate(rerank_scores):
    retrieved_docs[i]["rerank_score"] = float(score)

# Sort by cross-encoder
reranked = sorted(retrieved_docs, key=lambda x: x["rerank_score"], reverse=True)

print("\n Top Reranked Docs:")
display(pd.DataFrame(reranked))



 Reranking using CrossEncoder...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]


 Top Reranked Docs:


Unnamed: 0,rank,text,rerank_score
0,3,WHAT IS BIG DATA\n• As Gartner defines it – “B...,1.143552
1,5,TYPES OF BIG-DATA\nBig Data is generally categ...,-1.640948
2,4,THE CHARACTERISTICS OF BIG DATA\nVariety\nAs D...,-1.946928
3,2,THE CHARACTERISTICS OF BIG DATA\n,-3.349317
4,1,TYPES OF BIG-DATA\n,-3.922735


## Build Final RAG Prompt
Assemble the top retrieved documents and generate a final instruction prompt for the LLM.


In [23]:
top_docs = reranked[:3]  # use top-3 for LLM

context = "\n\n".join(
    [f"[Doc {i+1}] {d['text']}" for i, d in enumerate(top_docs)]
)

prompt = f"""
Use ONLY the following evidence:

{context}

Question: {query}

Answer clearly, scientifically, and with accurate facts.
"""

print("\n Final Prompt Sent to LLM:\n")
print(prompt[:600], "...")


 Final Prompt Sent to LLM:


Use ONLY the following evidence:

[Doc 1] WHAT IS BIG DATA
• As Gartner defines it – “Big Data are high volume, high velocity, or high-variety information assets 
that require new forms of processing to enable enhanced decision making, insight discovery, and 
process optimization.”
• The term ‘big data’ is self-explanatory − a collection of huge data sets that normal computing 
techniques cannot process. 
• The term not only refers to the data, but also to the various frameworks, tools, and techniques 
involved. 
• Technological advancement and the advent of new channels of communication (lik ...


## Generate Final Answer

In [25]:
# Use Phi-3.5-mini-instruct (fast, small, reliable) to produce the final RAG answer

print("\n Generating answer...")

generator = pipeline(
    "text-generation",
    model="microsoft/Phi-3.5-mini-instruct",
    max_new_tokens=200,
    device=device # Explicitly set device to use GPU if available, otherwise CPU
)

answer = generator(prompt)[0]["generated_text"]

print("\n Final Answer:\n")
print(answer)


 Generating answer...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Device set to use cuda



 Final Answer:


Use ONLY the following evidence:

[Doc 1] WHAT IS BIG DATA
• As Gartner defines it – “Big Data are high volume, high velocity, or high-variety information assets 
that require new forms of processing to enable enhanced decision making, insight discovery, and 
process optimization.”
• The term ‘big data’ is self-explanatory − a collection of huge data sets that normal computing 
techniques cannot process. 
• The term not only refers to the data, but also to the various frameworks, tools, and techniques 
involved. 
• Technological advancement and the advent of new channels of communication (like social 
networking) and new, stronger devices have presented a challenge to industry players in the sense 
that they have to find other ways to handle the data.
• Big data is an all-inclusive term, representing the enormous volume of complex data sets that 
companies and governments generate in the present-day digital environment.
• Big data, typically measured in petabytes or t