# **Import Libraries and NLTK Setup**

In [40]:
import nltk
# Download the NLTK data packages this notebook relies on. These calls run
# before the corpus is read below; when a package is already installed the
# call only reports that it is up to date (see the cell output).
nltk . download ('stopwords')
nltk . download ('punkt')
nltk . download ('wordnet')
import os
# NOTE(review): `string` is not referenced in the visible cells - confirm
# whether it is still needed before removing it.
import string
import logging
import re
from collections import defaultdict , Counter
from nltk . corpus import stopwords
from nltk . tokenize import word_tokenize
from nltk . stem import WordNetLemmatizer

# Module-level singletons shared by clean_text(): the English stop-word list
# as a set (for fast membership tests) and one reusable lemmatizer instance.
STOPWORDS = set( stopwords . words ('english') )
LEMMATIZER = WordNetLemmatizer ()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Loading Text Files**

Reading all .txt files in a folder and returning a dictionary with filenames as keys and content as values.

In [41]:
def load_text_files(folder_path):
    """Read every ``.txt`` file in *folder_path* into memory.

    Files are visited in sorted filename order so that the doc_id assigned
    to each file is reproducible between runs; ``os.listdir`` on its own
    returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan (its sub-directories are not visited).

    Returns:
        A ``(data, doc_id_to_filename)`` tuple. ``data`` maps each integer
        doc_id (0, 1, 2, ...) to the full text of one file, and
        ``doc_id_to_filename`` maps the same doc_id to that file's name.
    """
    data = {}
    doc_id_to_filename = {}
    text_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )
    for doc_id, filename in enumerate(text_files):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            data[doc_id] = file.read()
        doc_id_to_filename[doc_id] = filename
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logging.info("Loaded file: %s with doc_id: %s", filename, doc_id)
    return data, doc_id_to_filename


# **Text Cleaning Process**
Performing text cleaning i.e., removing special characters, tokenization, stop word removal, and lemmatization.

In [42]:
def clean_text(text):
    """Normalize raw text into a list of lemmatized, non-stop-word tokens.

    Steps, in order: lowercase, delete every character that is not a letter,
    digit or whitespace, tokenize, drop stop words, lemmatize what remains.

    Note that removed characters are deleted rather than replaced by a space,
    so a hyphenated word such as "well-known" becomes the token "wellknown".

    Args:
        text: The raw document text.

    Returns:
        The cleaned tokens, in their original order (duplicates preserved).
    """
    lowered = text.lower()
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)  # Remove special characters
    kept = []
    for word in word_tokenize(stripped):
        # The stop-word test is applied to the surface form, before lemmatizing.
        if word in STOPWORDS:
            continue
        kept.append(LEMMATIZER.lemmatize(word))
    return kept


# **Building Inverted Index**
Building an inverted index from the cleaned text data and tracking the term frequencies.

In [43]:
def build_inverted_index(data):
    """Build the term -> documents index and corpus-wide term counts.

    Args:
        data: Mapping of doc_id to raw document text.

    Returns:
        A ``(inverted_index, term_frequencies)`` tuple. ``inverted_index`` is
        a ``defaultdict(set)`` mapping each cleaned token to the set of
        doc_ids that contain it; ``term_frequencies`` is a ``Counter`` of how
        many times each token occurs across all documents combined.
    """
    postings = defaultdict(set)
    counts = Counter()  # Occurrences summed over the whole corpus
    for doc_id, content in data.items():
        doc_tokens = clean_text(content)
        counts.update(doc_tokens)
        for tok in doc_tokens:
            postings[tok].add(doc_id)
    return postings, counts

# **Boolean Queries: AND Operation**

Performing an AND query on the terms using the inverted index.

In [44]:
def boolean_and(terms, inverted_index):
    """Return the doc_ids that contain every term in *terms*.

    Args:
        terms: Sequence of already-normalized index terms.
        inverted_index: Mapping of term to the set of doc_ids containing it.

    Returns:
        A new set of doc_ids. It is empty when *terms* is empty or when any
        term is missing from the index. The result never aliases a posting
        set stored in the index, so callers may mutate it freely.
    """
    if not terms:
        return set()
    # .get() rather than [] so a defaultdict index is not grown by lookups.
    postings = [inverted_index.get(term, set()) for term in terms]
    # Copy the first posting set so the index's own set is never returned.
    return set(postings[0]).intersection(*postings[1:])

# **OR Operation**
Performing an OR query on the terms using the inverted index.

In [45]:
def boolean_or(terms, inverted_index):
    """Return the doc_ids that contain at least one term in *terms*.

    Args:
        terms: Iterable of already-normalized index terms.
        inverted_index: Mapping of term to the set of doc_ids containing it.

    Returns:
        A new set holding the union of the posting sets; terms missing from
        the index contribute nothing.
    """
    posting_sets = (inverted_index.get(term, set()) for term in terms)
    return set().union(*posting_sets)

# **NOT Operation**
Performing a NOT query on the term using the inverted index.

In [46]:
def boolean_not(term, inverted_index, total_docs):
    """Return the doc_ids that do NOT contain *term*.

    Args:
        term: An already-normalized index term.
        inverted_index: Mapping of term to the set of doc_ids containing it.
        total_docs: Number of documents; the doc_ids considered are
            ``0 .. total_docs - 1``.

    Returns:
        A new set with every doc_id in that range that is absent from the
        term's posting set.
    """
    excluded = inverted_index.get(term, set())
    all_doc_ids = set(range(total_docs))
    return all_doc_ids - excluded

# **Processing Boolean Queries**

Processing boolean queries with AND, OR, and NOT operations.

In [47]:
def _evaluate_and_clause(clause, inverted_index, universe):
    """Intersect the (optionally negated) terms of one OR-free clause."""
    matched = None
    negate = False
    for token in clause:
        if token == 'and':
            # Adjacent terms are ANDed anyway, so the keyword is only a separator.
            continue
        if token == 'not':
            negate = not negate
            continue
        # .get() rather than [] so a defaultdict index is not grown by lookups.
        postings = inverted_index.get(token, set())
        operand = universe - postings if negate else set(postings)
        negate = False
        matched = operand if matched is None else matched & operand
    # A clause without any term (e.g. a dangling operator) matches nothing.
    return set() if matched is None else matched


def boolean_query(query, inverted_index, total_docs):
    """Evaluate a boolean query against the inverted index.

    The query is lowercased and split on whitespace. ``not`` binds tightest
    and negates the term that follows it, ``and`` binds next, and ``or``
    binds loosest, so ``a or b and not c`` means ``a OR (b AND (NOT c))``.
    Terms written next to each other without an operator are ANDed.

    Query terms are only lowercased, whereas indexed tokens are also
    lemmatized by ``clean_text``; a term must therefore be given in its
    lemmatized form to match.

    Args:
        query: The query string, e.g. ``"Life AND NOT Death"``.
        inverted_index: Mapping of term to the set of doc_ids containing it.
        total_docs: Number of documents; ``NOT`` is taken relative to the
            doc_ids ``0 .. total_docs - 1``.

    Returns:
        A new set of matching doc_ids. An empty or operator-only query
        returns an empty set.
    """
    universe = set(range(total_docs))

    # Split on the lowest-precedence operator first: each clause is OR-free.
    clauses = [[]]
    for token in query.lower().split():
        if token == 'or':
            clauses.append([])
        else:
            clauses[-1].append(token)

    result = set()
    for clause in clauses:
        result |= _evaluate_and_clause(clause, inverted_index, universe)
    return result

# **Converting doc_ids to Filenames**
Converting doc_ids back to the original filenames.

In [48]:
def convert_doc_ids_to_filenames(result_set, doc_id_to_filename):
    """Translate doc_ids into the filenames they were loaded from.

    Args:
        result_set: Iterable of doc_ids (typically a query result set).
        doc_id_to_filename: Mapping of doc_id to filename.

    Returns:
        A list of filenames in the iteration order of *result_set*; doc_ids
        with no entry in the mapping are skipped.
    """
    filenames = []
    for doc_id in result_set:
        if doc_id not in doc_id_to_filename:
            continue
        filenames.append(doc_id_to_filename[doc_id])
    return filenames

# **Queries Result Files**
Writing the results of the queries to a new text file in a separate folder.

In [49]:
import os

def write_query_results(queries, inverted_index, doc_id_to_filename, total_docs,
                        output_dir="/content/drive/MyDrive/Resultfrom_queries"):
    """Run each query, print its result, and save all results to a file.

    The results are written to ``query_results.txt`` inside *output_dir*,
    which is created if it does not exist. An existing results file is
    overwritten.

    Args:
        queries: Iterable of boolean query strings.
        inverted_index: Mapping of term to the set of doc_ids containing it.
        doc_id_to_filename: Mapping of doc_id to filename, used to report
            matches by filename.
        total_docs: Number of documents, needed to evaluate NOT queries.
        output_dir: Folder that receives ``query_results.txt``. Defaults to
            the Google Drive folder this notebook originally hard-coded.
    """
    os.makedirs(output_dir, exist_ok=True)
    results_file_path = os.path.join(output_dir, "query_results.txt")

    # Explicit UTF-8 so non-ASCII filenames are written the same way on every
    # platform, matching the encoding used when the documents are read.
    with open(results_file_path, "w", encoding="utf-8") as result_file:
        for query in queries:
            result_set = boolean_query(query, inverted_index, total_docs)
            result_filenames = convert_doc_ids_to_filenames(result_set, doc_id_to_filename)
            result_str = f"Results for '{query}': {result_filenames}\n"
            print(result_str)
            result_file.write(result_str)

# **Main Function**
Defining the folder path (for the uploaded files) and loading the text files, then building the inverted index and term frequencies. Finally, setting the example queries, processing each query, and displaying the results.

In [50]:
def main():
    """Run the full pipeline: load documents, index them, answer the queries.

    Reads the ``.txt`` files from the hard-coded Google Drive folder, builds
    the inverted index, then evaluates the example queries and hands the
    results to ``write_query_results`` for printing and saving.
    """
    # Example queries
    example_queries = ["Life AND Death", "Ambition OR happiness", "NOT human"]

    # Folder holding the uploaded text files
    corpus_dir = '/content/drive/MyDrive/Week2assignment(IRS)'
    documents, id_to_name = load_text_files(corpus_dir)

    # The term frequencies are computed alongside the index but not used here.
    index, _term_counts = build_inverted_index(documents)

    # Process each query and display the results
    write_query_results(example_queries, index, id_to_name, len(documents))


if __name__ == "__main__":
    main()

Results for 'Life AND Death': ['sample4.txt']

Results for 'Ambition OR happiness': ['sample5.txt', 'sample3.txt']

Results for 'NOT human': ['sample1.txt', 'sample5.txt', 'sample4.txt', 'sample3.txt']

