In [21]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m757.8/981.5 kB[0m [31m23.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=69ee1832e8e7e071024f161f1c3697150c56a50a82bb6bcb4b11ef16c54a5ce0
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


# Settings

In [19]:
# Download the collection
!wget -O documents.tar.xz https://github.com/tderick/Bravehearts1/raw/refs/heads/main/data/documents.tar.xz

# Unzip the collection
!tar -xJvf documents.tar.xz

--2024-12-16 08:50:32--  https://github.com/tderick/Bravehearts1/raw/refs/heads/main/data/documents.tar.xz
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tderick/Bravehearts1/refs/heads/main/data/documents.tar.xz [following]
--2024-12-16 08:50:32--  https://raw.githubusercontent.com/tderick/Bravehearts1/refs/heads/main/data/documents.tar.xz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6776516 (6.5M) [application/octet-stream]
Saving to: ‘documents.tar.xz’


2024-12-16 08:50:33 (99.7 MB/s) - ‘documents.tar.xz’ saved [6776516/6776516]

documents/
documents/atricolandosi.unipi.en.jsonl
document

# Utils classes

In [22]:
import json
import pickle
import re
import string
import time
from pathlib import Path
from typing import List

import nltk
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Download the stopwords
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)


class Preprocessor:

    @staticmethod
    def preprocess(text: str, lang: str = "english") -> List[str]:

        if lang == "all":
            tmp_lang = detect(text)
            lang = "english" if tmp_lang == "en" else "italian"

        if lang not in stopwords.fileids():
            raise ValueError(
                f"Language '{lang}' is not supported. The language \
                should be one of the following: {stopwords.fileids()}"
            )

        # Lowercase the text
        text = text.lower()

        # Replace ampersand with 'and'
        text = text.replace("&", " and ")

        # Normalize special characters (smart quotes, dashes, etc.)
        text = text.translate(str.maketrans("‘’´“”–-", "'''\"\"--"))

        # Remove unnecessary periods in acronyms
        text = re.sub(r"\.(?!(\S[^. ])|\d)", "", text)

        # Remove punctuation and replace with spaces
        text = text.translate(
            str.maketrans(string.punctuation, " " * len(string.punctuation))
        )

        # Tokenize using NLTK (language aware)
        tokens = word_tokenize(text, language=lang)

        # Remove stopwords for the given language
        stop_words = set(stopwords.words(lang))
        tokens = [word for word in tokens if word not in stop_words]

        # Stemming
        stemmer = SnowballStemmer(lang)

        # Stem the tokens
        tokens = [stemmer.stem(token) for token in tokens]

        return tokens

    @staticmethod
    def profile(f):
        def f_timer(*args, **kwargs):
            start = time.time()
            result = f(*args, **kwargs)
            end = time.time()
            elapsed_time = end - start

            if elapsed_time >= 60:  # If the time is more than a minute
                minutes = int(elapsed_time // 60)
                seconds = elapsed_time % 60
                print(f"{f.__name__}: {minutes} min {seconds:.3f} s")
            elif elapsed_time >= 1:  # If the time is more than a second
                print(f"{f.__name__}: {elapsed_time:.3f} s")
            else:  # If the time is less than a second
                print(f"{f.__name__}: {elapsed_time * 1000:.3f} ms")

            return result

        return f_timer


class InvertedIndexManager:

    @staticmethod
    def load_index(input_file: str):

        input_file_path = Path(input_file)
        if not input_file_path.exists():
            raise ValueError(
                f"Input file {input_file} does not exist.\
                     Make sure the path is correct."
            )

        if not input_file_path.is_file():
            raise ValueError(
                f"Input file {input_file} is not a file. \
                    Make sure to provide a file as input."
            )

        # Load the index from the pickle file
        with open(input_file_path, "rb") as f:
            lexicon, inv, doc_index, stats = pickle.load(f)

        return lexicon, inv, doc_index, stats

    @staticmethod
    def save_index(
        output_folder_path: Path,
        lexicon: dict,
        inv_d: dict,
        inv_f: dict,
        doc_index: list,
        stats: dict,
    ):

        # Save the results as pickle files
        with open(f"{output_folder_path}/index.pkl", "wb") as f:
            pickle.dump(
                (lexicon, {"docids": inv_d, "freqs": inv_f}, doc_index, stats),
                f,
            )

        # Save each part to a separate JSONL file
        with open(
            f"{output_folder_path}/lexicon.json", "w", encoding="utf-8"
        ) as lex_file:
            lex_file.write(json.dumps(lexicon))

        with open(
            f"{output_folder_path}/inverted_file.jsonl", "w", encoding="utf-8"
        ) as inv_file:
            inv_file.write(json.dumps({"docids": inv_d, "freqs": inv_f}))

        with open(
            f"{output_folder_path}/doc_index.jsonl", "w", encoding="utf-8"
        ) as doc_file:
            doc_file.write(json.dumps(doc_index, ensure_ascii=False))

        with open(
            f"{output_folder_path}/stats.json", "w", encoding="utf-8"
        ) as stats_file:
            json.dump(stats, stats_file, ensure_ascii=False, indent=4)


In [23]:
text = "This is a sample text for testing the preprocessor function."
tokens = Preprocessor.preprocess(text, lang="all")
print(tokens)

['sampl', 'text', 'test', 'preprocessor', 'function']


# Indexing

In [24]:
import json
from collections import Counter, defaultdict
from pathlib import Path
from typing import Literal

from tqdm.auto import tqdm


class Indexing:

    def __init__(
        self,
        input_folder: str,
        output_folder: str,
        lang: Literal["en", "it", "all"],  # noqa
    ) -> None:

        input_folder_path = Path(input_folder)
        if not input_folder_path.exists():
            raise ValueError(
                f"Input folder {input_folder} does not exist.\
                     Make sure the path is correct."
            )

        if not input_folder_path.is_dir():
            raise ValueError(
                f"Input folder {input_folder} is not a directory. \
                    Make sure to provide a directory as input."
            )

        # Get all files that end with .jsonl
        jsonl_files = list(input_folder_path.glob("*.jsonl"))

        if len(jsonl_files) == 0:
            raise ValueError(
                f"No .jsonl files found in the input folder {input_folder}. \
                    Make sure to provide a folder with .jsonl files."
            )

        # Filter files based on language
        if lang != "all":
            self.lang = "english" if lang == "en" else "italian"
            jsonl_files = [
                file for file in jsonl_files if f".{lang}" in file.suffixes[1:]
            ]
        else:
            self.lang = "all"

        self.input_folder = input_folder
        self.output_folder = output_folder
        self.input_files = jsonl_files

        # Initialize data structures
        # "term": [docid, doc_freq, col_freq] where doc_freq is the number of
        # documents in which the term appears and col_freq is the total number
        # of times the term appears in the collection
        self.lexicon = {}
        self.doc_index = {}  # Document index
        self.inv_d = defaultdict(list)  # TermID to list of DocIDs
        # TermID to list of term frequencies in each DocID
        self.inv_f = defaultdict(list)
        self.termid = 0  # TermID counter

        self.num_docs = 0  # Number of documents
        self.total_dl = 0  # Total document length
        self.total_toks = 0  # Total number of tokens

    @Preprocessor.profile
    def build_index(self):

        # Create the output folder if it does not exist
        output_folder_path = Path(self.output_folder)
        if not output_folder_path.exists():
            output_folder_path.mkdir(parents=True)

        for fileid, file in tqdm(
            enumerate(self.input_files),
            desc="Indexing Files",
            total=len(self.input_files),
        ):
            # Open and read the JSONL file
            with open(file, "r", encoding="utf-8") as file_content:
                for line in file_content:
                    doc = json.loads(line)  # Parse JSON line
                    # Assign a new docid incrementally
                    docid = len(self.doc_index)
                    # Tokenize and preprocess text
                    tokens = Preprocessor.preprocess(doc["text"], self.lang)
                    # Count term frequencies in the document
                    token_tf = Counter(tokens)

                    # Update lexicon, inverted file, and document index
                    for token, tf in token_tf.items():
                        # Add term to lexicon if not already present
                        if token not in self.lexicon:
                            # [termid, doc_freq, col_freq] i.e. termid is the term identifier, # noqa
                            #  doc_freq is the number of documents in which the term appears, # noqa
                            # and col_freq is the total number of times the term appears in the # noqa
                            # collection
                            self.lexicon[token] = [
                                self.termid,
                                0,
                                0,
                            ]
                            # Initialize posting lists
                            self.inv_d[self.termid], self.inv_f[self.termid] = (  # noqa
                                [],
                                [],
                            )  # noqa
                            self.termid += 1  # Increment termid

                        # Update posting lists and term frequency
                        token_id = self.lexicon[token][0]  # Get termid
                        # Add docid to posting list
                        self.inv_d[token_id].append(docid)
                        # Add term frequency in posting list
                        self.inv_f[token_id].append(tf)
                        # Increment document frequency i.e the number of
                        # documents in which the term appears
                        self.lexicon[token][1] += 1
                        # Increment collection frequency i.e the total
                        # number of times the term appears in the collection
                        self.lexicon[token][2] += tf

                    # Update document index
                    doclen = len(tokens)  # Document length
                    self.doc_index[docid] = {
                        "doclen": doclen,
                        "url": doc["url"],
                        "title": doc["title"],
                    }
                    self.total_dl += doclen
                    self.num_docs += 1

        # Properties file with collection statistics
        stats = {
            "num_docs": len(self.doc_index),
            "num_terms": len(self.lexicon),
            "total_tokens": self.total_dl,
        }

        InvertedIndexManager.save_index(
            output_folder_path=output_folder_path,
            lexicon=self.lexicon,
            inv_d=self.inv_d,
            inv_f=self.inv_f,
            doc_index=self.doc_index,
            stats=stats,
        )

        return self.lexicon, {"docids": self.inv_d, "freqs": self.inv_f}, self.doc_index, stats


In [41]:
# Replace argparse with direct assignments

input_folder = "/content/documents"  # Replace with your actual input folder path
output_folder = "/content/index"  # Replace with your actual output folder path
lang = "all"  # Set language as "en", "it", or "all"

# Instantiate and run the Indexing class
indexer = Indexing(input_folder, output_folder, lang)
lexicon, inv, doc_index, stats = indexer.build_index()

Indexing Files:   0%|          | 0/34 [00:00<?, ?it/s]

build_index: 15 min 56.524 s


In [26]:
print("Statistics")
print(stats)

Statistics
{'num_docs': 1160, 'num_terms': 20028, 'total_tokens': 397751}


# Querying

## Querying Interface

In [57]:
import pandas as pd
from ipywidgets import widgets, interact, interact_manual
from IPython.display import display, clear_output


def process_query(query, method, query_processor):
    """
    Function to process the query based on the selected method.

    Parameters:
        query (str): The user query.
        method (str): The processing method ("DAAT" or "TAAT").

    Returns:
        None
    """
    try:
        # Process the query based on the selected method
        if method == "DAAT":
            print("\nProcessing with DAAT...")
            results = query_processor.query_process_daat(query)
        elif method == "TAAT":
            print("\nProcessing with TAAT...")
            results = query_processor.query_process_taat(query)

        # Display results
        if not results:
            print("No results found for the query.")
        else:
            # Display results in a table format using pandas
            df = pd.DataFrame(results)
            display(df)
    except Exception as e:
        print(f"An error occurred: {e}")

def search_engine_ui(query_processor):
    """
    Interactive UI for the search engine.
    """
    # Dropdown to select the processing method
    method_dropdown = widgets.Dropdown(
        options=["DAAT", "TAAT"],
        value="DAAT",
        description="Method:"
    )

    # Text box for the query input
    query_input = widgets.Text(
        value='',
        placeholder='Enter your query',
        description='Query:',
        layout=widgets.Layout(width='50%')
    )

    # Button to process the query
    search_button = widgets.Button(
        description="Search",
        button_style='success',  # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click to search',
        icon='search'
    )

    # Output area to display results
    output_area = widgets.Output()

    # Event handler for the search button
    def on_search_clicked(b):
        with output_area:
            clear_output()  # Clear previous output
            query = query_input.value.strip()
            method = method_dropdown.value

            if not query:
                print("Please enter a query.")
                return

            # Call the processing function
            process_query(query, method, query_processor)

    # Link the button click event to the handler
    search_button.on_click(on_search_clicked)

    # Layout the UI elements
    ui = widgets.VBox([method_dropdown, query_input, search_button, output_area])

    # Display the UI
    display(ui)


## Generic Inverted Index and Generic Query Processor

### Generic Inverted Index

In [58]:
import heapq
import math

class GenericInvertedIndex:

    def __init__(self, lex, inv, doc, stats):
        self.lexicon = lex
        self.inv = inv
        self.doc = doc
        self.stats = stats

    def num_docs(self):
        return self.stats["num_docs"]

    def get_posting(self, termid):
       return GenericInvertedIndex.PostingListIterator(
            self.inv["docids"][termid], self.inv["freqs"][termid], self.doc
        )


    def get_termids(self, tokens):
        return [
            self.lexicon[token][0] for token in tokens if token in self.lexicon
        ]  # noqa

    def get_postings(self, termids):
        return [self.get_posting(termid) for termid in termids]


    class PostingListIterator:
        def __init__(self, docids, freqs, doc):
            """
            Initialize the PostingListIterator.

            Parameters:
                docids (list): List of document IDs where the term appears.
                freqs (list): List of term frequencies in the corresponding documents.
                doc (dict): Document index with metadata for each document.
                doc_freq (int): Document frequency of the term.
                total_docs (int): Total number of documents in the collection.
            """
            self.docids = docids
            self.freqs = freqs
            self.pos = 0
            self.doc = doc

        def docid(self):
            if self.is_end_list():
                return math.inf
            return self.docids[self.pos]

        def score(self):
            if self.is_end_list():
                return math.inf
            return self.freqs[self.pos] / self.doc[self.docid()]["doclen"]

        def next(self, target=None):
            if not target:
                if not self.is_end_list():
                    self.pos += 1
            else:
                if target > self.docids():
                    try:
                        self.pos = self.docids.index(target, self.pos)
                    except ValueError:
                        self.pos = len(self.docids)

        def is_end_list(self):
            return self.pos == len(self.docids)

        def len(self):
            return len(self.docids)

### Generic Querying Processor

In [59]:
import math
from collections import defaultdict
import pandas as pd


class GenericQueryProcessor:

    def __init__(self, index_file):
        self.lex, self.inv, self.doc, self.stats = InvertedIndexManager.load_index(index_file)
        self.inv_index = GenericInvertedIndex(self.lex, self.inv, self.doc, self.stats)

    # Conjunctive processing
    def boolean_and(self, postings):
        results = []
        # We sort the posting lists from the shortest to the longest
        postings = sorted(postings, key=lambda p: p.len())
        # We scan sequentially through the shortest posting list only
        current_docid = postings[0].docid()
        while current_docid != math.inf:
            found = True
            # We look for the current docid is all remaining posting lists
            for posting in postings[1:]:
                posting.next(current_docid)
                if posting.docid() != current_docid:
                    found = False
                    break
            # If the current docid is in all posting lists, we add it to results # noqa
            if found:
                results.append(current_docid)
            # We move forward in the shortest posting list
            postings[0].next()
            current_docid = postings[0].docid()
        return self.prepare_final_result(docids=results)

    def query_process_and(self, query: str, lang: str = "english"):
        qtokens = set(Preprocessor.preprocess(query, lang))
        qtermids = self.inv_index.get_termids(qtokens)
        postings = self.inv_index.get_postings(qtermids)
        return self.boolean_and(postings)

    # Disjunctive processing
    def min_docid(self, postings):
        min_docid = math.inf
        for p in postings:
            if not p.is_end_list():
                min_docid = min(p.docid(), min_docid)
        return min_docid

    def boolean_or(self, postings):
        results = []
        current_docid = self.min_docid(postings)
        while current_docid != math.inf:
            results.append(current_docid)
            for posting in postings:
                if posting.docid() == current_docid:
                    posting.next()
            current_docid = self.min_docid(postings)
        return self.prepare_final_result(docids=results)

    def query_process_or(self, query: str, lang: str = "english"):
        qtokens = set(Preprocessor.preprocess(query, lang))
        qtermids = self.inv_index.get_termids(qtokens)
        postings = self.inv_index.get_postings(qtermids)
        return self.boolean_or(postings)

    # TAAT Algorithm
    def taat(self, postings, k=10):
        A = defaultdict(float)
        for posting in postings:
            current_docid = posting.docid()
            while current_docid != math.inf:
                A[current_docid] += posting.score()
                posting.next()
                current_docid = posting.docid()
        top = TopQueue(k)
        for docid, score in A.items():
            top.insert(docid, score)
        result = sorted(top.queue, reverse=True)

        return self.prepare_final_result(scores_docids=result)

    def query_process_taat(self, query, lang="english"):
        qtokens = set(Preprocessor.preprocess(query, lang))
        qtermids = self.inv_index.get_termids(qtokens)
        postings = self.inv_index.get_postings(qtermids)
        return self.taat(postings)

    # DAAT Algorithm
    def daat(self, postings, k=10):
        top = TopQueue(k)
        current_docid = self.min_docid(postings)
        while current_docid != math.inf:
            score = 0
            next_docid = math.inf
            for posting in postings:
                if posting.docid() == current_docid:
                    score += posting.score()
                    posting.next()
                if not posting.is_end_list():
                    next_docid = posting.docid()
            top.insert(current_docid, score)
            current_docid = next_docid
        result = sorted(top.queue, reverse=True)
        return self.prepare_final_result(scores_docids=result)

    def query_process_daat(self, query: str, lang: str = "english"):
        qtokens = set(Preprocessor.preprocess(query, lang))
        qtermids = self.inv_index.get_termids(qtokens)
        postings = self.inv_index.get_postings(qtermids)
        return self.daat(postings)

    def prepare_final_result(self, scores_docids=None, docids=None):

        final_result = []
        if docids:
            for docid in docids:
                doc = self.doc[docid]
                final_result.append(
                    {
                        "docid": docid,
                        "title": doc["title"],
                        "url": doc["url"],
                    }
                )
        else:
            for score, docid in scores_docids:
                doc = self.doc[docid]
                final_result.append(
                    {
                        "docid": docid,
                        "title": doc["title"],
                        "url": doc["url"],
                        "score": score,
                    }
                )

        return final_result


### Testing Generic Scoring Function

In [60]:
# Initialize the QueryProcessor with the input folder
input_folder = "/content/index/index.pkl"
query_processor = GenericQueryProcessor(input_folder)

# Run the search engine UI
search_engine_ui(query_processor=query_processor)


VBox(children=(Dropdown(description='Method:', options=('DAAT', 'TAAT'), value='DAAT'), Text(value='', descrip…

## TF-IDF Inverted Index and Generic Query Processor

### TF-IDF Inverted Index

In [61]:
class TFIDFInvertedIndex(GenericInvertedIndex):

    def __init__(self, lex, inv, doc, stats):
        super().__init__(lex, inv, doc, stats)

    def get_posting(self, termid):
        # Find the term corresponding to the given termid
        term = next(key for key, value in self.lexicon.items() if value[0] == termid)

        # Get the doc_freq (document frequency) from the lexicon
        doc_freq = self.lexicon[term][1]

        # Get the total number of documents
        total_docs = self.stats["num_docs"]

        return TFIDFInvertedIndex.PostingListIterator(
            self.inv["docids"][termid],
            self.inv["freqs"][termid],
            self.doc,
            doc_freq,
            total_docs
        )


    class PostingListIterator(GenericInvertedIndex.PostingListIterator):

        def __init__(self, docids, freqs, doc, doc_freq, total_docs):
            super().__init__(docids, freqs, doc)

        def score(self):
            if self.is_end_list():
                    return 0
            docid = self.docids[self.pos]
            tf = self.freqs[self.pos]  # Term frequency in the document
            doclen = self.doc[docid]["doclen"]  # Document length
            if self.doc_freq == 0:  # Avoid division by zero
                return 0
            idf = math.log(self.total_docs / self.doc_freq)
            tf = 1 + (math.log(tf))
            return tf * idf

### TF-IDF Querying Processor

In [55]:
class TFIDFQueryProcessor(GenericQueryProcessor):

    def __init__(self, index_file):
        super().__init__(index_file)
        self.inv_index = TFIDFInvertedIndex(self.lex, self.inv, self.doc, self.stats)

### Testing TF-IDF Scoring Function

In [56]:
# Initialize the QueryProcessor with the input folder
input_folder = "/content/index/index.pkl"
query_processor = TFIDFQueryProcessor(input_folder)

# Run the search engine UI
search_engine_ui(query_processor=query_processor)

VBox(children=(Dropdown(description='Method:', options=('DAAT', 'TAAT'), value='DAAT'), Text(value='', descrip…

IV-3. BM25 Inverted Index

In [37]:
class BM25InvertedIndex(GenericInvertedIndex):

  def __init__(self, lex, inv, doc, stats):
      super().__init__(lex, inv, doc, stats)

  class PostingListIterator(GenericInvertedIndex.PostingListIterator):
    def score(self):
        if self.is_end_list():
            return 0
        docid = self.docids[self.pos]
        tf = self.freqs[self.pos]  # Term frequency in the document
        doclen = self.doc[docid]["doclen"]  # Document length
        idf = math.log((self.total_docs - self.doc_freq + 0.5) / (self.doc_freq + 0.5) + 1)

        # BM25 score calculation
        numerator = tf * (self.k1 + 1)
        denominator = tf + (self.k1 * (1 - self.b + (self.b * (doclen / self.avgdoclen))))
        return idf * (numerator / denominator)

In [27]:
import heapq
import math


class InvertedIndex:

    def __init__(self, lex, inv, doc, stats):
        self.lexicon = lex
        self.inv = inv
        self.doc = doc
        self.stats = stats

    def num_docs(self):
        return self.stats["num_docs"]

    def get_posting(self, termid):
        # Find the term corresponding to the given termid
        term = next(key for key, value in self.lexicon.items() if value[0] == termid)

        # Get the doc_freq (document frequency) from the lexicon
        doc_freq = self.lexicon[term][1]

        # Get the total number of documents
        total_docs = self.stats["num_docs"]

        # Calculate the Avg Document Length
        avgdoclen = self.stats["total_tokens"] / self.stats["num_docs"]

        return InvertedIndex.PostingListIterator(
            self.inv["docids"][termid],
            self.inv["freqs"][termid],
            self.doc,
            doc_freq,
            total_docs,
            avgdoclen
        )


    def get_termids(self, tokens):
        return [
            self.lexicon[token][0] for token in tokens if token in self.lexicon
        ]  # noqa

    def get_postings(self, termids):
        return [self.get_posting(termid) for termid in termids]


    class PostingListIterator:
        def __init__(self, docids, freqs, doc, doc_freq, total_docs, avgdoclen, k1 = 1.2, b = 0.75):
            """
            Initialize the PostingListIterator.

            Parameters:
                docids (list): List of document IDs where the term appears.
                freqs (list): List of term frequencies in the corresponding documents.
                doc (dict): Document index with metadata for each document.
                doc_freq (int): Document frequency of the term.
                total_docs (int): Total number of documents in the collection.
            """
            self.docids = docids
            self.freqs = freqs
            self.pos = 0
            self.doc = doc
            self.doc_freq = doc_freq
            self.total_docs = total_docs
            self.avgdoclen = avgdoclen
            self.k1 = k1
            self.b = b

        def docid(self):
            if self.is_end_list():
                return math.inf
            return self.docids[self.pos]

        def score(self):
            # if self.is_end_list():
            #     return 0
            # docid = self.docids[self.pos]
            # tf = self.freqs[self.pos]  # Term frequency in the document
            # doclen = self.doc[docid]["doclen"]  # Document length
            # if self.doc_freq == 0:  # Avoid division by zero
            #     return 0
            # idf = math.log(self.total_docs / self.doc_freq)
            # tf = 1 + (math.log(tf))
            # return tf * idf

            if self.is_end_list():
                return 0
            docid = self.docids[self.pos]
            tf = self.freqs[self.pos]  # Term frequency in the document
            doclen = self.doc[docid]["doclen"]  # Document length
            idf = math.log((self.total_docs - self.doc_freq + 0.5) / (self.doc_freq + 0.5) + 1)

            # BM25 score calculation
            numerator = tf * (self.k1 + 1)
            denominator = tf + (self.k1 * (1 - self.b + (self.b * (doclen / self.avgdoclen))))
            return idf * (numerator / denominator)

        def next(self, target=None):
            if not target:
                if not self.is_end_list():
                    self.pos += 1
            else:
                if target > self.docids():
                    try:
                        self.pos = self.docids.index(target, self.pos)
                    except ValueError:
                        self.pos = len(self.docids)

        def is_end_list(self):
            return self.pos == len(self.docids)

        def len(self):
            return len(self.docids)

In [34]:
class TopQueue:
    def __init__(self, k=10, threshold=0.0):
        self.queue = []
        self.k = k
        self.threshold = threshold

    def size(self):
        return len(self.queue)

    def would_enter(self, score):
        return score > self.threshold

    def clear(self, new_threshold=None):
        self.queue = []
        if new_threshold:
            self.threshold = new_threshold

    def __repr__(self):
        return f"<{self.size()} items, th={self.threshold} {self.queue}"

    def insert(self, docid, score):
        if score > self.threshold:
            if self.size() >= self.k:
                heapq.heapreplace(self.queue, (score, docid))
            else:
                heapq.heappush(self.queue, (score, docid))
            if self.size() >= self.k:
                self.threshold = max(self.threshold, self.queue[0][0])
            return True
        return False

In [38]:
import math
from collections import defaultdict
import pandas as pd


class QueryProcessor:

    def __init__(self, index_file):
        lex, inv, doc, stats = InvertedIndexManager.load_index(index_file)
        self.inv_index = BM25InvertedIndex(lex, inv, doc, stats)
        self.doc = doc

    # Conjunctive processing
    def boolean_and(self, postings):
        results = []
        # We sort the posting lists from the shortest to the longest
        postings = sorted(postings, key=lambda p: p.len())
        # We scan sequentially through the shortest posting list only
        current_docid = postings[0].docid()
        while current_docid != math.inf:
            found = True
            # We look for the current docid is all remaining posting lists
            for posting in postings[1:]:
                posting.next(current_docid)
                if posting.docid() != current_docid:
                    found = False
                    break
            # If the current docid is in all posting lists, we add it to results # noqa
            if found:
                results.append(current_docid)
            # We move forward in the shortest posting list
            postings[0].next()
            current_docid = postings[0].docid()
        return self.prepare_final_result(docids=results)

    def query_process_and(self, query: str, lang: str = "english"):
        qtokens = set(Preprocessor.preprocess(query, lang))
        qtermids = self.inv_index.get_termids(qtokens)
        postings = self.inv_index.get_postings(qtermids)
        return self.boolean_and(postings)

    # Disjunctive processing
    def min_docid(self, postings):
        min_docid = math.inf
        for p in postings:
            if not p.is_end_list():
                min_docid = min(p.docid(), min_docid)
        return min_docid

    def boolean_or(self, postings):
        results = []
        current_docid = self.min_docid(postings)
        while current_docid != math.inf:
            results.append(current_docid)
            for posting in postings:
                if posting.docid() == current_docid:
                    posting.next()
            current_docid = self.min_docid(postings)
        return self.prepare_final_result(docids=results)

    def query_process_or(self, query: str, lang: str = "english"):
        qtokens = set(Preprocessor.preprocess(query, lang))
        qtermids = self.inv_index.get_termids(qtokens)
        postings = self.inv_index.get_postings(qtermids)
        return self.boolean_or(postings)

    # TAAT Algorithm
    def taat(self, postings, k=10):
        A = defaultdict(float)
        for posting in postings:
            current_docid = posting.docid()
            while current_docid != math.inf:
                A[current_docid] += posting.score()
                posting.next()
                current_docid = posting.docid()
        top = TopQueue(k)
        for docid, score in A.items():
            top.insert(docid, score)
        result = sorted(top.queue, reverse=True)

        return self.prepare_final_result(scores_docids=result)

    def query_process_taat(self, query, lang="english"):
        qtokens = set(Preprocessor.preprocess(query, lang))
        qtermids = self.inv_index.get_termids(qtokens)
        postings = self.inv_index.get_postings(qtermids)
        return self.taat(postings)

    # DAAT Algorithm
    def daat(self, postings, k=10):
        top = TopQueue(k)
        current_docid = self.min_docid(postings)
        while current_docid != math.inf:
            score = 0
            next_docid = math.inf
            for posting in postings:
                if posting.docid() == current_docid:
                    score += posting.score()
                    posting.next()
                if not posting.is_end_list():
                    next_docid = posting.docid()
            top.insert(current_docid, score)
            current_docid = next_docid
        result = sorted(top.queue, reverse=True)
        return self.prepare_final_result(scores_docids=result)

    def query_process_daat(self, query: str, lang: str = "english"):
        qtokens = set(Preprocessor.preprocess(query, lang))
        qtermids = self.inv_index.get_termids(qtokens)
        postings = self.inv_index.get_postings(qtermids)
        return self.daat(postings)

    def prepare_final_result(self, scores_docids=None, docids=None):

        final_result = []
        if docids:
            for docid in docids:
                doc = self.doc[docid]
                final_result.append(
                    {
                        "docid": docid,
                        "title": doc["title"],
                        "url": doc["url"],
                    }
                )
        else:
            for score, docid in scores_docids:
                doc = self.doc[docid]
                final_result.append(
                    {
                        "docid": docid,
                        "title": doc["title"],
                        "url": doc["url"],
                        "score": score,
                    }
                )

        return final_result


In [39]:

input_folder = "/content/index/index.pkl"
query_processor = QueryProcessor(input_folder)

while True:
  # Prompt the user to select the processing method
  print("\nChoose the processing method:")
  print("1. DAAT (Document-at-a-Time)")
  print("2. TAAT (Term-at-a-Time)")
  print("Type 'exit' to quit.")

  choice = input("Enter your choice (1 or 2): ").strip()

  # Exit condition
  if choice.lower() == "exit":
      print("Exiting the search engine. Goodbye!")
      break

  # Validate choice
  if choice not in ["1", "2"]:
      print("Invalid choice. Please enter 1 for DAAT or 2 for TAAT.")
      continue

  # Prompt the user to input a query
  query = input("\nEnter your query: ").strip()


  try:
      # Process the query based on user choice
      if choice == "1":
          print("\nProcessing with DAAT...")
          results = query_processor.query_process_daat(query)
      elif choice == "2":
          print("\nProcessing with TAAT...")
          results = query_processor.query_process_taat(query)

      # Display results
      if not results:
          print("No results found for the query.")
      else:
          # Display results in a table format using pandas
          df = pd.DataFrame(results)
          print("\nSearch Results:\n")
          print(df.to_string(index=False))
  except Exception as e:
    print(f"An error occurred: {e}")


Choose the processing method:
1. DAAT (Document-at-a-Time)
2. TAAT (Term-at-a-Time)
Type 'exit' to quit.
Enter your choice (1 or 2): unipi
Invalid choice. Please enter 1 for DAAT or 2 for TAAT.

Choose the processing method:
1. DAAT (Document-at-a-Time)
2. TAAT (Term-at-a-Time)
Type 'exit' to quit.
Enter your choice (1 or 2): 1

Enter your query: unipi

Processing with DAAT...

Search Results:

 docid                                                             title                                                                 url    score
   420                              Board of the Ph.D. Programme - Ph.D.     https://www.dm.unipi.it/phd/people/board-of-the-ph-d-programme/ 0.139013
   158                          Coordination - Department of Mathematics   https://www.dm.unipi.it/en/department-of-excellence/coordination/ 0.102564
   422                           Representatives and Secretariat - Ph.D. https://www.dm.unipi.it/phd/people/representatives-and-secretariat/ 0.095890
 

# V- Evaluation