In [1]:

##################### chunks = paragraphs ####################
import re
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import MinMaxScaler
import PyPDF2

def extract_paragraphs_by_page(pdf_path, paragraph_delimiter="\n\n"):
    """
    Reads each page of the PDF, splits into paragraphs,
    and returns a list of (paragraph, page_number).
    """
    paragraph_page_pairs = []

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            raw_text = reader.pages[page_num].extract_text()
            if not raw_text:
                continue
            # Replace newlines with spaces and split into paragraphs
            page_text = raw_text.replace('\n', ' ')
            paragraphs = page_text.split(paragraph_delimiter)

            for para in paragraphs:
                para = para.strip()
                if len(para.split()) > 10:  # Consider only substantial paragraphs
                    paragraph_page_pairs.append((para, page_num+1))

    return paragraph_page_pairs

def setup_retrieval(paragraph_page_pairs):
    """
    Creates BM25 and SentenceTransformer embeddings for the paragraphs.
    Returns BM25 model, SentenceTransformer model, embeddings, and paragraph-page mappings.
    """
    paragraphs_only = [pair[0] for pair in paragraph_page_pairs]
    pages_only = [pair[1] for pair in paragraph_page_pairs]

    tokenized = [p.lower().split(" ") for p in paragraphs_only]
    bm25_model = BM25Okapi(tokenized)

    st_model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = st_model.encode(paragraphs_only, convert_to_tensor=True)

    return bm25_model, st_model, embeddings, paragraphs_only, pages_only

def ensemble_retrieval(query, bm25_model, st_model, embeddings, paragraphs, pages, weights=[0.5, 0.5], top_k=3):
    """
    Returns the top_k most relevant paragraphs (with page number) based on 
    an ensemble of BM25 and embedding similarity.
    """
    tokenized_query = query.lower().split(" ")
    bm25_scores = bm25_model.get_scores(tokenized_query)

    query_embedding = st_model.encode(query, convert_to_tensor=True)
    embedding_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0].cpu().numpy()

    scaler1 = MinMaxScaler()
    bm25_norm = scaler1.fit_transform(bm25_scores.reshape(-1, 1)).flatten()

    scaler2 = MinMaxScaler()
    embed_norm = scaler2.fit_transform(embedding_scores.reshape(-1, 1)).flatten()

    final_scores = weights[0] * bm25_norm + weights[1] * embed_norm

    top_indices = np.argsort(final_scores)[::-1][:top_k]

    results = [(paragraphs[i], pages[i], final_scores[i]) for i in top_indices]
    return results


In [2]:
pdf_path = "Aka Book.pdf"  # Replace with your actual PDF file path
query1 = "What is the importance of foods with low glycaemic effects?"
query = "what is the optimal roasting time for cocoa?"

# 1. Extract (paragraph, page) pairs
paragraph_page_pairs = extract_paragraphs_by_page(pdf_path)
print(f"Extracted {len(paragraph_page_pairs)} paragraphs in total.")

# 2. Setup retrieval models
bm25_model, st_model, embeddings, paragraphs, pages = setup_retrieval(paragraph_page_pairs)

# 3. Run ensemble retrieval
top_k = 5
results = ensemble_retrieval(
    query, 
    bm25_model, 
    st_model, 
    embeddings, 
    paragraphs, 
    pages, 
    weights=[0.5, 0.5], 
    top_k=top_k
)

# 4. Print top-k results
for rank, (para, page_num, score) in enumerate(results, start=1):
    print(f"Rank: {rank} | Page: {page_num} | Score: {score:.4f}")
    print(para)
    print("-" * 80)


Extracted 795 paragraphs in total.
Rank: 1 | Page: 96 | Score: 0.9716
56   Chapter 3 Ziegleder and Oberparleiter (1996) have proposed a moisture treatment prior  to roasting. In this, steam is condensed on the nib, resulting in a water addition  of about 15%. This moisture aids the formation of more flavour precursors dur - ing the 10–15 min processing time at 40–60 °C (104–140 °F). After drying to 3% moisture at 98–110 °C (208–230 °F) and roasting, this gives a product with a  more intense flavour compared with normally roasted beans. Mohr et al. (1978) have demonstrated that a slow reduction in moisture con- tent to about 3% followed by a rapid heating to the final roast temperature is the  optimal way of roasting. The highest roasting temperature depends upon the required roast intensity and the equipment used. All types of roaster can be operated over a wide range of roasting conditions. Whole bean roasting was the original method and often used to produce  cocoa masses with delica