In [8]:
import re
from collections import defaultdict

In [9]:
# Sample corpus: maps an integer document ID to that document's raw text.
documents = {
 1: "This is the first document. It contains some text.",
 2: "The second document is longer. It also contains some text.",
 3: "This is the third document. It is different from the first two.",
}

In [10]:
def preprocess_document(doc, stop_words=None):
    """Lowercase ``doc``, split it into word tokens, and drop stop words.

    Args:
        doc: Text to tokenize.
        stop_words: Optional iterable of lowercase tokens to discard. When
            omitted, the built-in list ("is", "the", "it", "and", "some")
            is used.

    Returns:
        List of the remaining tokens in their original order; repeated
        words are kept.
    """
    if stop_words is None:
        stop_words = ("is", "the", "it", "and", "some")
    # Tokens are lowercased before filtering, so stop words must be lowercase too.
    excluded = frozenset(stop_words)
    # \w+ keeps runs of letters, digits and underscores; punctuation is dropped.
    return [token for token in re.findall(r'\w+', doc.lower()) if token not in excluded]

In [11]:
# Build the inverted index: token -> list of IDs of the documents containing it.
inverted_index = defaultdict(list)
for doc_id, doc_text in documents.items():
    tokens = preprocess_document(doc_text)
    # dict.fromkeys de-duplicates the tokens while keeping first-seen order, so a
    # document that repeats a word is recorded only once in that word's postings.
    for token in dict.fromkeys(tokens):
        inverted_index[token].append(doc_id)

In [12]:
def retrieve_documents(query):
    """Return the IDs of documents that contain at least one query term.

    The query is run through ``preprocess_document`` and every surviving
    token is looked up in the module-level ``inverted_index``.

    Args:
        query: Free-text query string.

    Returns:
        List of unique document IDs, ordered by first match (query-token
        order, then index order). Empty when nothing matches.
    """
    # A dict serves as an insertion-ordered set, so the result order is
    # deterministic instead of depending on set iteration order.
    matches = {}
    for token in preprocess_document(query):
        # .get() does not insert a key into the defaultdict for unknown tokens.
        for doc_id in inverted_index.get(token, ()):
            matches[doc_id] = None
    return list(matches)

In [13]:
# Example search: collect the IDs of documents that match any term of the query.
query = "document contains text"
matching_documents = retrieve_documents(query)

In [14]:
# Report the search outcome: a header plus one line per hit, or a fallback notice.
if not matching_documents:
    print("No matching documents found.")
else:
    print(f"Matching documents for query '{query}':")
    for doc_id in matching_documents:
        print(f"Document {doc_id}: {documents[doc_id]}")


Matching documents for query 'document contains text':
Document 1: This is the first document. It contains some text.
Document 2: The second document is longer. It also contains some text.
Document 3: This is the third document. It is different from the first two.
