Import the Library

In [None]:
# Import the library
import re
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure the required NLTK packages are downloaded.
# 'punkt_tab' is fetched alongside the legacy 'punkt' package because recent
# NLTK releases (3.9+) load the Punkt sentence models used by word_tokenize
# from 'punkt_tab'; without it word_tokenize raises LookupError.
# 'stopwords' stays last so the cell's final expression value is unchanged.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Load the text documents

In [None]:
# Load the text documents
def load_documents():
    """Return the sample corpus as a dict of document ID -> document text.

    IDs are consecutive integers starting at 1, assigned in the order the
    texts are listed below. A fresh dict is built on every call.
    """
    texts = (
        "What problems and concerns are there in making up descriptive titles?",
        "How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information requests?",
        "What is information science? Give definitions where possible.",
        "Image recognition and any other methods of automatically transforming printed text into computer-ready form.",
        "What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retrieval systems?",
    )
    return dict(enumerate(texts, start=1))

Text cleaning pipeline

In [None]:
# Text cleaning Pipeline
def clean_text(text):
    """Normalise raw text into a list of index terms.

    Steps: lowercase, replace punctuation with spaces, tokenize with NLTK's
    ``word_tokenize``, then drop NLTK English stop words.

    Args:
        text: The raw string (a document or a query).

    Returns:
        A list of lowercase tokens in their original order, duplicates
        retained, with stop words removed.
    """
    # Convert to lowercase
    text = text.lower()
    # Replace punctuation with a space instead of deleting it. Deleting fuses
    # hyphenated words ("computer-ready" -> "computerready") and turns
    # contractions into non-words ("don't" -> "dont") that slip past the
    # stop-word filter; splitting yields "computer ready" and "don t", whose
    # fragments are ordinary tokens or stop-list entries.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

Inverted Index

In [None]:
# 4. Inverted Index Creation
def create_indexes(documents):
    """Build an inverted index and a per-document term list.

    Args:
        documents: Mapping of document ID -> raw document text. Each ID is
            expected to occur once, as is guaranteed for a dict.

    Returns:
        A tuple ``(inverted_index, reverse_dictionary)`` of defaultdicts:
        ``inverted_index`` maps each term to the list of document IDs that
        contain it (in iteration order of ``documents``, no duplicates);
        ``reverse_dictionary`` maps each document ID to its cleaned token
        list (duplicates retained).
    """
    inverted_index = defaultdict(list)
    reverse_dictionary = defaultdict(list)

    for doc_id, content in documents.items():
        words = clean_text(content)

        # Reverse dictionary keeps the full token sequence for the document.
        reverse_dictionary[doc_id] = words

        # De-duplicate the document's terms once (dict.fromkeys preserves
        # first-occurrence order) rather than scanning the posting list for
        # doc_id on every token, which is linear in the posting-list length.
        for word in dict.fromkeys(words):
            inverted_index[word].append(doc_id)

    return inverted_index, reverse_dictionary

Boolean Query

In [None]:
# 5. Boolean Query Implementation (AND, OR, and NOT logic)
def boolean_query(query, inverted_index, operator="AND"):
    """Evaluate a single-operator Boolean query against an inverted index.

    The query string is normalised with ``clean_text`` and the surviving
    terms are combined with one operator.

    Args:
        query: Free-text query string.
        inverted_index: Mapping of term -> iterable of document IDs.
        operator: ``"AND"`` (documents containing every term), ``"OR"``
            (documents containing at least one term) or ``"NOT"``
            (indexed documents containing none of the terms).

    Returns:
        A set of document IDs. The set is empty when no query terms remain
        after cleaning or when ``operator`` is not recognised.

    Note:
        For ``"NOT"`` the universe of documents is derived from the index
        itself, so a document with no indexed terms cannot be returned.
    """
    query_words = clean_text(query)

    # If no query words are present after cleaning, return an empty set
    if not query_words:
        return set()

    # Use .get() rather than [] so that an unknown term does not insert an
    # empty posting list into a defaultdict-backed index as a side effect.
    postings = [set(inverted_index.get(word, ())) for word in query_words]

    # AND Logic: Intersect all the sets of documents
    if operator == "AND":
        return set.intersection(*postings)

    # OR Logic: Union all the sets of documents
    if operator == "OR":
        return set.union(*postings)

    # NOT Logic: Exclude documents containing certain words
    if operator == "NOT":
        # The index keys are terms, not document IDs; the set of all
        # documents is the union of every posting list.
        all_docs = set().union(*inverted_index.values())
        return all_docs.difference(*postings)

    # If no valid operator is provided, return an empty set
    return set()

Main function

In [None]:
if __name__ == "__main__":
    # Build the corpus, then derive both lookup structures from it.
    documents = load_documents()
    inverted_index, reverse_dictionary = create_indexes(documents)

    # Dump the term -> document IDs mapping, one term per line.
    print("Inverted Index:")
    for word, doc_ids in inverted_index.items():
        print(f"{word}:", doc_ids)

    # Dump the document ID -> token list mapping, one document per line.
    print("\nReverse Dictionary:")
    for doc_id, words in reverse_dictionary.items():
        print(f"Document {doc_id}:", words)

    # Example queries, one per supported operator:
    #   AND -> documents containing both 'problems' and 'concerns'
    #   OR  -> documents containing either 'data' or 'retrieved'
    #   NOT -> documents not containing 'science'
    query = "problems concerns"
    query_or = "data retrieved"
    query_not = "science"

    # Evaluate the queries in AND, OR, NOT order before reporting anything.
    result = boolean_query(query, inverted_index, "AND")
    result_or = boolean_query(query_or, inverted_index, "OR")
    result_not = boolean_query(query_not, inverted_index, "NOT")

    print(f"\nDocuments matching AND query '{query}': {result}")
    print(f"\nDocuments matching OR query '{query_or}': {result_or}")
    print(f"\nDocuments matching NOT query '{query_not}': {result_not}")

Inverted Index:
problems: [1]
concerns: [1]
making: [1]
descriptive: [1]
titles: [1]
actually: [2]
pertinent: [2]
data: [2]
opposed: [2]
references: [2]
entire: [2]
articles: [2]
retrieved: [2]
automatically: [2, 4]
response: [2]
information: [2, 3, 5]
requests: [2]
science: [3]
give: [3]
definitions: [3]
possible: [3]
image: [4]
recognition: [4]
methods: [4]
transforming: [4]
printed: [4]
text: [4]
computerready: [4]
form: [4]
special: [5]
training: [5]
ordinary: [5]
researchers: [5]
businessmen: [5]
need: [5]
proper: [5]
management: [5]
unobstructed: [5]
use: [5]
retrieval: [5]
systems: [5]

Reverse Dictionary:
Document 1: ['problems', 'concerns', 'making', 'descriptive', 'titles']
Document 2: ['actually', 'pertinent', 'data', 'opposed', 'references', 'entire', 'articles', 'retrieved', 'automatically', 'response', 'information', 'requests']
Document 3: ['information', 'science', 'give', 'definitions', 'possible']
Document 4: ['image', 'recognition', 'methods', 'automatically', 'trans