In [8]:
# Paths of the plain-text files that make up the corpus, in document-id order.
file_list = [
    '/content/sample1.txt',
    '/content/sample2.txt',
    '/content/sample3.txt',
    '/content/sample4.txt',
    '/content/sample5.txt',
]

# Maps a 1-based document id (position in file_list) to that file's full text.
documents = {}

for doc_id, file_path in enumerate(file_list, start=1):
    # Explicit encoding: without it, open() decodes with the platform's locale
    # default, so the same file could load differently on another machine.
    with open(file_path, 'r', encoding='utf-8') as f:
        documents[doc_id] = f.read()

In [9]:
import re

# Words removed from every token stream before indexing.
stopwords = {'is', 'the', 'of', 'and', 'a', 'in', 'on', 'to', 'from'}


def preprocess(text):
    """Tokenize ``text`` into lowercase words with the stopwords removed.

    The text is lowercased, then every run of the letters a-z that is
    delimited by regex word boundaries becomes a token. Runs that touch a
    digit or underscore are therefore not matched at all, and punctuation
    splits words (an apostrophe yields two tokens).

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of tokens in their original order, duplicates kept, with any
        token found in ``stopwords`` dropped.
    """
    kept = []
    for token in re.findall(r'\b[a-z]+\b', text.lower()):
        if token in stopwords:
            continue
        kept.append(token)
    return kept


In [11]:
import pprint
import pandas as pd

def build_index(docs):
    """Build the sorted vocabulary and the inverted index for a corpus.

    Args:
        docs: Mapping of document id to that document's raw text. Each text
            is tokenized with ``preprocess``.

    Returns:
        A ``(vocabulary, postings)`` tuple: ``vocabulary`` is the sorted list
        of distinct tokens across all documents, and ``postings`` maps each
        token to the set of document ids whose text contains it.
    """
    postings = {}

    for doc_id, text in docs.items():
        for term in preprocess(text):
            # Create the posting set the first time a term is seen.
            postings.setdefault(term, set()).add(doc_id)

    # Every distinct term is a key of the postings map, so its sorted keys
    # are exactly the vocabulary.
    vocabulary = sorted(postings)
    return vocabulary, postings

# Index the loaded corpus: sorted vocabulary plus term -> document-id sets.
dictionary, inverted_index = build_index(documents)

# Show the vocabulary as a one-column table.
print("Dictionary:")
vocabulary_table = pd.DataFrame(dictionary, columns=["Words"])
display(vocabulary_table)

# Show the full term -> document-id mapping.
print("\nInverted Index:")
pprint.pprint(inverted_index)

Dictionary:


Unnamed: 0,Words
0,about
1,account
2,added
3,advanced
4,agencies
...,...
121,vegetables
122,vendors
123,version
124,week



Inverted Index:
{'about': {1},
 'account': {3},
 'added': {4},
 'advanced': {5},
 'agencies': {5},
 'also': {1, 4, 5},
 'android': {2},
 'announced': {1},
 'any': {1},
 'app': {3},
 'apple': {1},
 'are': {1, 2, 3, 4, 5},
 'astronomers': {5},
 'available': {2, 4},
 'battery': {1},
 'best': {4},
 'better': {1},
 'brighter': {5},
 'bugs': {3},
 'camera': {1},
 'can': {3},
 'carrots': {4},
 'choose': {4},
 'clusters': {5},
 'collaborate': {5},
 'continue': {2},
 'core': {5},
 'crashes': {3},
 'customer': {1, 3},
 'customers': {4},
 'customization': {2},
 'device': {1, 2},
 'devices': {2, 3},
 'discovered': {5},
 'distant': {5},
 'enjoy': {2},
 'excited': {1, 5},
 'explore': {5},
 'farmers': {4},
 'features': {2, 3},
 'fixes': {3},
 'for': {1, 2, 4},
 'fresh': {4},
 'fruits': {4},
 'galaxy': {2, 5},
 'get': {2},
 'happy': {3, 4},
 'harvest': {4},
 'has': {1},
 'help': {2, 3, 4},
 'helps': {1},
 'herbs': {4},
 'improve': {3},
 'improved': {1, 4},
 'improves': {3},
 'includes': {4},
 'ios': 

In [6]:
def boolean_retrieval(query, index, all_doc_ids=None):
    """Evaluate a simple one-operator Boolean query against an inverted index.

    The query is lowercased and split on whitespace. Exactly one kind of
    operator is honoured per query, checked in this order:

    * ``and`` - intersection of the posting sets of all other tokens.
    * ``or``  - union of the posting sets of all other tokens.
    * ``not`` - every known document id minus the posting set of the first
      remaining token (any further tokens are ignored).
    * no operator - the posting set of the first token only.

    Mixed-operator queries are not parsed: once one operator matches, the
    other operator words are looked up as ordinary terms. Query terms are
    used as typed (after lowercasing), so they must match index keys exactly.

    Args:
        query: The query string, e.g. ``"apple and camera"``.
        index: Mapping of term to an iterable (normally a set) of document ids.
        all_doc_ids: Optional iterable of every document id in the corpus,
            used only by ``not``. When omitted, the ids that appear anywhere
            in ``index`` are used, so the function does not depend on notebook
            globals. Documents that contributed no terms are absent from the
            index; pass this argument explicitly to include them.

    Returns:
        A new set of matching document ids. It never aliases a set stored in
        ``index``, so callers may mutate it freely. A query with no search
        terms (empty, or consisting only of operator words) returns an empty
        set.
    """
    tokens = query.lower().split()

    if 'and' in tokens:
        terms = [t for t in tokens if t != 'and']
        if not terms:
            return set()
        # Copy the first posting set so the index itself is never mutated.
        matches = set(index.get(terms[0], ()))
        for term in terms[1:]:
            matches.intersection_update(index.get(term, ()))
        return matches

    if 'or' in tokens:
        matches = set()
        for term in tokens:
            if term != 'or':
                matches.update(index.get(term, ()))
        return matches

    if 'not' in tokens:
        terms = [t for t in tokens if t != 'not']
        if not terms:
            return set()
        if all_doc_ids is None:
            # Derive the universe from the index rather than a global corpus.
            universe = set().union(*index.values())
        else:
            universe = set(all_doc_ids)
        return universe - set(index.get(terms[0], ()))

    if not tokens:
        return set()
    # Copy so the caller cannot corrupt the index through the result.
    return set(index.get(tokens[0], ()))

# Example queries: (text shown in the output, query string actually evaluated)
demo_queries = [
    ("information AND retrieval", "information and retrieval"),
    ("retrieval OR ranking", "retrieval or ranking"),
    ("NOT boolean", "not boolean"),
]

for shown_as, query_text in demo_queries:
    print(f"Query: '{shown_as}' →", boolean_retrieval(query_text, inverted_index))


Query: 'information AND retrieval' → set()
Query: 'retrieval OR ranking' → set()
Query: 'NOT boolean' → {1, 2, 3}
