In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries


In [32]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import os
import re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import logging

# Initialize stopwords and lemmatizer
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading Files

In [33]:
def load_text_files(folder_path):
    data = {}
    doc_id_to_filename = {}
    doc_id = 0
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data[doc_id] = file.read()
                doc_id_to_filename[doc_id] = filename
                logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
                doc_id += 1
    return data, doc_id_to_filename


# Text Cleaning:

In [34]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    tokens = word_tokenize(text)
    cleaned_tokens = [LEMMATIZER.lemmatize(word) for word in tokens if word not in STOPWORDS]
    return cleaned_tokens


# Creating Inverted Index

In [35]:
def build_inverted_index(data):
    inverted_index = defaultdict(set)
    term_frequencies = Counter()  # Track term frequencies
    for doc_id, content in data.items():
        cleaned_tokens = clean_text(content)
        for token in cleaned_tokens:
            inverted_index[token].add(doc_id)
            term_frequencies[token] += 1
    return inverted_index, term_frequencies


# Boolean Operators

In [41]:
#AND Operation
def boolean_and(terms, inverted_index):
    result_set = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result_set = result_set.intersection(inverted_index.get(term, set()))
    return result_set


#OR Operation
def boolean_or(terms, inverted_index):
    result_set = set()
    for term in terms:
        result_set = result_set.union(inverted_index.get(term, set()))
    return result_set

#NOT Operation
def boolean_not(term, inverted_index, total_docs):
    return set(range(total_docs)) - inverted_index.get(term, set())



# Query Processor:

In [36]:
def boolean_query(query, inverted_index, total_docs):
    tokens = query.lower().split()
    if 'and' in tokens:
        terms = [term for term in tokens if term not in ['and', 'or', 'not']]
        return boolean_and(terms, inverted_index)
    elif 'or' in tokens:
        terms = [term for term in tokens if term not in ['and', 'or', 'not']]
        return boolean_or(terms, inverted_index)
    elif 'not' in tokens:
        return boolean_not(tokens[1], inverted_index, total_docs)
    else:
        return inverted_index.get(tokens[0], set())


# Converting Doc IDs to Filenames:

In [37]:
def convert_doc_ids_to_filenames(result_set, doc_id_to_filename):
    return [doc_id_to_filename[doc_id] for doc_id in result_set if doc_id in doc_id_to_filename]


# Writing Query Results to File:

In [38]:
def write_query_results(queries, inverted_index, doc_id_to_filename, total_docs):
    # Define folder and file dynamically
    folder_path = "/content/drive/MyDrive/IR(Assignment)/result"
    results_file_path = os.path.join(folder_path, "query_results.txt")

    with open(results_file_path, "w") as result_file:
        for query in queries:
            result_set = boolean_query(query, inverted_index, total_docs)
            result_filenames = convert_doc_ids_to_filenames(result_set, doc_id_to_filename)
            result_str = f"Results for '{query}': {result_filenames}\n"
            print(result_str)
            result_file.write(result_str)


# Main Function:


In [39]:
def main():
    # Define folder path (for the uploaded files)
    folder_path = '/content/drive/MyDrive/IR(Assignment)/Documents'

    # Load text files
    data, doc_id_to_filename = load_text_files(folder_path)

    # Build inverted index and term frequencies
    inverted_index, term_frequencies = build_inverted_index(data)

    #Getting the total number of document
    total_docs = len(data)
    # Sample queries
    queries = [
        "AI AND computer science",
        "Python OR Machine Learning",
        "NOT Boolean retrieval",
        "Information retrieval AND NOT web development"
    ]

    # Process each query and display the results
    for query in queries:
        result_set = boolean_query(query, inverted_index, len(data))
        result_filenames = convert_doc_ids_to_filenames(result_set, doc_id_to_filename)
        print(f"Results for '{query}': {result_filenames}")

    write_query_results(queries, inverted_index, doc_id_to_filename, total_docs)

if __name__ == "__main__":
    main()



Results for 'AI AND computer science': []
Results for 'Python OR Machine Learning': ['document1.txt', 'document2.txt', 'document3.txt']
Results for 'NOT Boolean retrieval': ['document1.txt', 'document2.txt', 'document3.txt', 'document5.txt']
Results for 'Information retrieval AND NOT web development': []
Results for 'AI AND computer science': []

Results for 'Python OR Machine Learning': ['document1.txt', 'document2.txt', 'document3.txt']

Results for 'NOT Boolean retrieval': ['document1.txt', 'document2.txt', 'document3.txt', 'document5.txt']

Results for 'Information retrieval AND NOT web development': []

