In [9]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

def load_and_sort_files(directory):
    """
    List the document files in ``directory`` ordered by their numerical ID.

    A file's ID is its name with ``.txt`` removed, parsed as an integer
    (``"12.txt"`` -> ``12``), so ``"10.txt"`` sorts after ``"2.txt"``.
    Entries whose name does not yield an integer (for example ``.DS_Store``
    or a ``.ipynb_checkpoints`` folder) are skipped instead of aborting the
    whole listing with a ``ValueError``.

    Args:
        directory: Path of the folder to scan (passed to ``os.listdir``).

    Returns:
        list[str]: Entry names (not full paths) sorted by ascending ID.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the folder is missing).
    """
    file_pairs = []
    for file in os.listdir(directory):
        try:
            file_id = int(file.replace('.txt', ''))
        except ValueError:
            # Not a "<number>.txt" document; ignore it rather than crash.
            continue
        file_pairs.append((file_id, file))
    file_pairs.sort()  # Sort by numerical ID (ties fall back to the name)
    return [file for _, file in file_pairs]  # Return sorted filenames

def build_inverted_index(directory, sorted_files):
    """
    Build an inverted index from the sorted files in the specified directory.

    Every file is read, split into tokens with ``word_tokenize`` and each
    distinct token is mapped to the IDs of the documents it occurs in.  A
    document's ID is its filename with ``.txt`` removed, parsed as an integer.
    Tokens are indexed exactly as the tokenizer returns them (no case folding
    or stemming happens here).

    Args:
        directory: Folder that contains the files.
        sorted_files: Filenames (relative to ``directory``) in the order the
            documents should be processed; posting lists follow this order.

    Returns:
        dict[str, list[int]]: Terms in sorted order, each mapped to its list
        of document IDs without duplicates.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a filename does not reduce to an integer ID.
    """
    # term -> {doc_id: None}.  The inner dict serves as an insertion-ordered
    # set, so recording a document is O(1) instead of scanning a posting list
    # for every single token.
    postings = {}
    for file in sorted_files:
        with open(os.path.join(directory, file), 'r') as f:
            content = f.read()
        words = word_tokenize(content)
        doc_id = int(file.replace('.txt', ''))
        for word in words:
            postings.setdefault(word, {})[doc_id] = None
    # Terms are unique, so sorting the items orders by term only.
    return {term: list(doc_ids) for term, doc_ids in sorted(postings.items())}

def build_positional_index(directory, sorted_files):
    """
    Build a positional index from the sorted files in the specified directory.

    For every token produced by ``word_tokenize`` the index records, per
    document, the 0-based offsets at which the token appears in that
    document's token sequence.  Document IDs are the filenames with ``.txt``
    removed, parsed as integers.

    Returns a dict ordered by term: ``{term: {doc_id: [positions, ...]}}``.
    """
    index = {}
    for filename in sorted_files:
        doc_id = int(filename.replace('.txt', ''))
        with open(os.path.join(directory, filename), 'r') as handle:
            text = handle.read()
        tokens = word_tokenize(text)
        for offset, token in enumerate(tokens):
            # Create the per-term and per-document containers on first sight.
            index.setdefault(token, {}).setdefault(doc_id, []).append(offset)
    return dict(sorted(index.items()))

def save_index_to_file(index, filename):
    """
    Write an index (inverted or positional) to ``filename`` as text.

    One line is written per entry in the form ``<term> : <value>``, where the
    value is rendered with its normal string formatting.  An existing file is
    overwritten.  A confirmation message is printed once writing is done.
    """
    with open(filename, 'w') as out:
        out.writelines(f"{term} : {postings}\n" for term, postings in index.items())
    print(f"Index saved to {filename}")

def search_query(inverted_index, query):
    """
    Look up a query in the inverted index and print the outcome.

    The whole ``query`` string is passed to a Porter stemmer as a single
    token, and the stemmed form is used as the lookup key.  The matching
    index entry is printed, or a "no results" message when the key is absent.
    Nothing is returned.

    NOTE(review): the lookup uses the stemmed form, so the index presumably
    holds already-stemmed terms -- confirm against how the corpus was prepared.
    """
    key = PorterStemmer().stem(query)
    if key not in inverted_index:
        print(f"No results found for '{query}'")
        return
    print(f"Query Result for '{query}': {inverted_index[key]}")

# List the document files of the corpus folder in numerical-ID order
abstracts_directory = 'Abstracts'
sorted_files = load_and_sort_files(abstracts_directory)

# Build the inverted index (term -> document IDs), report its size, and write it to disk
inverted_index = build_inverted_index(abstracts_directory, sorted_files)
print("Total terms in inverted index:", len(inverted_index))
save_index_to_file(inverted_index, "inverted_index.txt")

# Build the positional index (term -> document ID -> token positions), report its size, and write it to disk
positional_index = build_positional_index(abstracts_directory, sorted_files)
print("Total terms in positional index:", len(positional_index))
save_index_to_file(positional_index, "positional_index.txt")

# Look up a sample term in the inverted index (the query is stemmed before the lookup)
search_query(inverted_index, "autoencoders")

Total terms in inverted index: 4211
Index saved to inverted_index.txt
Total terms in positional index: 4211
Index saved to positional_index.txt
Query Result for 'autoencoders': [187, 273, 279, 325, 333, 405]


In [13]:









# Name of the folder holding the files to load and sort
folder = 'Abstracts'

# Build and save inverted index


# Build and save positional index


# Search for a term


Terms in inverted index: 4211
Index saved to inverted_index.txt
Terms in positional index: 4211
Index saved to positional_index.txt
Results for 'autoencoders': [187, 273, 279, 325, 333, 405]
