In [112]:
import nltk
import os
import string
import logging
import re # Import regular expressions library
from collections import defaultdict , Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download the 'stopwords' resource
nltk.download('stopwords')
# Initialize the stop words and lemmatizer
STOPWORDS = set(stopwords.words ('english'))
LEMMATIZER = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading Text Files

In [111]:
def load_text_files(folder_path):
    """Reads all files in a folder and returns a dictionary
    with filenames as keys and content as values."""
    data = {}
    doc_id_to_filename = {}
    doc_id = 0

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data[doc_id] = file.read()
                doc_id_to_filename[doc_id] = filename  # Map doc_id to filename
                logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
            doc_id += 1  # Increment document ID for the next file

    return data, doc_id_to_filename

# Example folder path: Replace this with your folder path
folder_path = "/content/drive/MyDrive/Assignment 2: IR"
data, doc_id_to_filename = load_text_files(folder_path)

# Cleaning Dataset and Tokenize

In [113]:
def clean_text(text):
    """Performs text cleaning: removing special characters, digits, tokenization, stopword removal, and lemmatization."""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and punctuation using regular expressions
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Keeps only alphanumeric characters and spaces


    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    #remove digits from dataset
    text = re.sub(r'\d+', '', text)

    # Tokenize the cleaned text
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    cleaned_tokens = [LEMMATIZER.lemmatize(word) for word in tokens if word not in STOPWORDS]

    return cleaned_tokens

# Test cleaning function with a sample text
print(clean_text(data[0]))

['ever', 'since', 'installed', 'latest', 'patch', 'phone', 'overheating', 'like', 'crazy', 'look', 'user', 'report', 'overheating', 'issue', 'installing', 'patch', 'requesting', 'assistance', 'loyalty', 'point', 'credited', 'instantly', 'goal', 'met', 'keep', 'eye', 'confirmation', 'email', 'almost', 'rt', 'support', 'confirms', 'loyalty', 'point', 'available', 'immediately', 'email', 'sent', 'upon', 'reaching', 'goal', 'flyhighairlines', 'thanks', 'info', 'super', 'helpful', 'user', 'thanks', 'flyhighairlines', 'provided', 'assistance', 'indicating', 'response', 'helped', 'query', 'flyhighairlines', 'wondering', 'able', 'upgrade', 'seat', 'moment', 'earn', 'enough', 'point', 'waiting', 'period', 'user', 'seek', 'clarification', 'flyhighairlines', 'regarding', 'upgrade', 'seat', 'reaching', 'required', 'point', 'help', 'please', 'dm', 'u', 'information', 'resolve', 'issue', 'customer', 'support', 'offer', 'help', 'via', 'dm', 'asking', 'company', 'discount', 'information', 'better', 'a

# Inverted Index Construction

In [114]:
def build_inverted_index(data):
    """Builds an inverted index from the cleaned text data and tracks term frequencies."""
    inverted_index = defaultdict(set)
    term_frequencies = Counter()  # Track term frequencies
    for doc_id, content in data.items():
        cleaned_tokens = clean_text(content)
        for token in cleaned_tokens:
            inverted_index[token].add(doc_id)
            term_frequencies[token] += 1
    return inverted_index, term_frequencies

# Boolean Query Processing: AND, OR, and NOT Operations:

In [115]:
def boolean_query(query, inverted_index, doc_id_to_filename):
    """Processes a Boolean query and returns the matching documents."""
    tokens = query.lower().split()
    result_set = set(doc_id_to_filename.keys())  # Start with all documents in the dataset

    if 'and' in tokens:
        terms = [term for term in tokens if term not in ['and', 'or', 'not']]
        result_set = inverted_index.get(terms[0], set())  # Start with the first term
        for term in terms[1:]:
            if term in inverted_index:
                result_set = result_set.intersection(inverted_index[term])
            else:
                result_set = set()  # If a term is missing, no results should be returned

    elif 'or' in tokens:
        result_set = set()  # OR should start with an empty set
        terms = [term for term in tokens if term not in ['and', 'or', 'not']]
        for term in terms:
            if term in inverted_index:
                result_set = result_set.union(inverted_index[term])

    elif 'not' in tokens:
        term = tokens[1]  # Get the term after 'NOT'
        if term in inverted_index:
            result_set = result_set - inverted_index[term]  # Subtract the documents that contain the term
        else:
            pass  # If the term doesn't exist, do nothing as we subtract nothing

    else:
        # If no 'AND', 'OR', 'NOT' operators, check if single query term exists
        if query in inverted_index:
            result_set = inverted_index[query]
        else:
            result_set = set()  # If query term doesn't exist, return empty set

    # Convert doc_ids to filenames
    result_filenames = [doc_id_to_filename[doc_id] for doc_id in result_set if doc_id in doc_id_to_filename]

    logging.info(f"Query '{query}' resulted in: {result_filenames}")

    return result_filenames

In [128]:
def main():
    # Load dataset
    folder_path = "/content/drive/MyDrive/Assignment 2: IR"
    data, doc_id_to_filename = load_text_files(folder_path)

    # Build the inverted index and term frequencies
    inverted_index, term_frequencies = build_inverted_index(data)

    # Example queries
    queries = [
        "Company AND Customer",
        "Phone OR Information",
        "NOT Discount",
    ]

    # Open a file to write the results
    with open("Pawan_results.txt", "w") as result_file:
        for query in queries:
            result = boolean_query(query, inverted_index, doc_id_to_filename)
            result_str = f"Results for '{query}': {result}\n"
            print(result_str)  # Print to console
            result_file.write(result_str)  # Write to file

if __name__ == "__main__":
    main()


Results for 'Company AND Customer': ['doc.4.txt', 'doc.6.txt', 'doc.3.txt', 'doc.1.txt']

Results for 'Phone OR Information': ['doc.4.txt', 'doc.1.txt', 'doc.2.txt']

Results for 'NOT Discount': ['doc.5.txt', 'doc.1.txt', 'doc.7.txt', 'doc.2.txt']

