# Preprocessing

In [69]:
import os
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [71]:
# Function to convert every document in a directory to lowercase (files are rewritten in place)
def convert_to_lowercase(directory):
    """Lowercase the text of every file in ``directory``, rewriting each file in place.

    Args:
        directory: Path of the folder whose entries are processed. Entries are
            visited in sorted name order and each one is opened as a text file.

    Returns:
        None. A confirmation message is printed once all files are rewritten.
    """
    for entry in sorted(os.listdir(directory)):
        target = os.path.join(directory, entry)
        # Read and lowercase first; the file is reopened for writing only
        # after the read handle has been closed.
        with open(target, 'r') as reader:
            lowered = reader.read().lower()
        with open(target, 'w') as writer:
            writer.write(lowered)
    print("All documents have been converted to lowercase.")

In [73]:
# Function to load stopwords from a file
def load_stopwords(stopword_file):
    """Read a stopword list from a text file containing one entry per line.

    Args:
        stopword_file: Path of the text file to read.

    Returns:
        list[str]: The lines of the file in their original order, with
        surrounding whitespace stripped and blank lines dropped.
    """
    with open(stopword_file, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Lines that are empty after stripping carry no stopword.
        stopwords = [entry for entry in stripped_lines if entry]
    print("Stopwords array created!")
    return stopwords

In [75]:
# Function to remove stopwords from documents
def remove_stopwords(directory, stopwords):
    """Delete stopword tokens from every file in ``directory``, rewriting in place.

    A stopword is removed wherever it appears as a complete
    whitespace-delimited token. That includes the first or last token of the
    file, tokens next to newlines or tabs, and runs of consecutive stopwords.
    Matching is case-sensitive, and a stopword embedded in a longer token
    (e.g. "the" inside "theory") is left untouched.

    Args:
        directory: Path of the folder whose entries are processed, in sorted
            name order. Each entry is opened as a text file.
        stopwords: Iterable of stopword strings. Empty strings are ignored.

    Returns:
        None. A confirmation message is printed once all files are rewritten.
    """
    # One alternation covers every stopword, so each document is scanned once
    # instead of once per stopword.
    escaped = [re.escape(word) for word in stopwords if word]
    pattern = None
    if escaped:
        # (?<!\S) / (?!\S) require whitespace or a text boundary on each side.
        # Because neither lookaround consumes the delimiter, adjacent
        # stopwords are all matched. Trailing spaces/tabs are removed along
        # with the token so no extra gap is left behind.
        pattern = re.compile(
            r'(?<!\S)(?:' + '|'.join(escaped) + r')(?!\S)[ \t]*'
        )
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r') as file:
            content = file.read()
        if pattern is not None:
            content = pattern.sub('', content)
        with open(file_path, 'w') as file:
            file.write(content)
    print("Stopwords removed from all documents!")

In [77]:
# Function to remove punctuation (and every other non-letter character, digits included) from documents
def remove_punctuations(directory):
    """Replace each run of non-letter characters with one space, in every file.

    Any character outside ``A-Z`` / ``a-z`` counts as a non-letter, so digits,
    whitespace and newlines are collapsed along with punctuation. Files are
    rewritten in place.

    Args:
        directory: Path of the folder whose entries are processed, in sorted
            name order. Each entry is opened as a text file.

    Returns:
        None. A confirmation message is printed once all files are rewritten.
    """
    non_letters = re.compile(r'[^A-Za-z]+')
    for entry in sorted(os.listdir(directory)):
        target = os.path.join(directory, entry)
        with open(target, 'r') as reader:
            original_text = reader.read()
        cleaned_text = non_letters.sub(' ', original_text)
        with open(target, 'w') as writer:
            writer.write(cleaned_text)
    print("Punctuations removed from all documents!")

In [79]:
# Function to apply Porter Stemmer to documents
def apply_stemming(directory):
    """Tokenize every file in ``directory`` and rewrite it as space-joined stems.

    Each file is split with ``word_tokenize``; tokens of length 1 or less are
    dropped, the remaining tokens are stemmed with a ``PorterStemmer``, and the
    stems are written back to the same file separated by single spaces.

    Args:
        directory: Path of the folder whose entries are processed, in sorted
            name order. Each entry is opened as a text file.

    Returns:
        None. A confirmation message is printed once all files are rewritten.
    """
    porter = PorterStemmer()
    for entry in sorted(os.listdir(directory)):
        target = os.path.join(directory, entry)
        with open(target, 'r') as reader:
            raw_text = reader.read()
        stems = []
        for token in word_tokenize(raw_text):
            # Single-character tokens are discarded rather than stemmed.
            if len(token) <= 1:
                continue
            stems.append(porter.stem(token))
        with open(target, 'w') as writer:
            writer.write(' '.join(stems))
    print("Porter Stemmer applied successfully!")

In [81]:
# Main function to execute all preprocessing steps
def preprocess_documents(directory, stopword_file):
    """Run the full preprocessing pipeline over every file in ``directory``.

    The steps run in this fixed order: lowercase, load the stopword list,
    remove stopwords, remove non-letter characters, apply stemming. Every step
    rewrites the files in place.

    Args:
        directory: Path of the folder holding the documents.
        stopword_file: Path of the stopword list passed to ``load_stopwords``.

    Returns:
        None.
    """
    convert_to_lowercase(directory)
    # The stopword list is loaded after lowercasing and before removal.
    remove_stopwords(directory, load_stopwords(stopword_file))
    remove_punctuations(directory)
    apply_stemming(directory)

In [83]:
# Corpus directory and stopword list used by the rest of the notebook.
abstract='Abstracts'
stopword='Stopword-List.txt'
# NOTE: every preprocessing step writes its result back to the same file path,
# so this cell overwrites the original documents. Re-running it processes the
# already-processed text again.
preprocess_documents(abstract, stopword)

All documents have been converted to lowercase.
Stopwords array created!
Stopwords removed from all documents!
Punctuations removed from all documents!
Porter Stemmer applied successfully!


# Creating Indexes

In [85]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

def get_files(dir):
    """List the document files in ``dir`` ordered by their numeric id.

    The id of an entry is the integer left after removing ``'.txt'`` from its
    name (``'12.txt'`` -> ``12``). Entries whose name does not reduce to an
    integer, such as ``.DS_Store`` or ``.ipynb_checkpoints``, are skipped so
    that one stray file does not abort the listing with ``ValueError``.

    Args:
        dir: Path of the folder to list.

    Returns:
        list[str]: Entry names sorted by ``(id, name)``, so ``'2.txt'`` comes
        before ``'10.txt'``.
    """
    pairs = []
    for name in os.listdir(dir):
        try:
            doc_id = int(name.replace('.txt', ''))
        except ValueError:
            # Not a numbered document; ignore it.
            continue
        pairs.append((doc_id, name))
    pairs.sort()
    return [name for _, name in pairs]

In [87]:
def save_index(index, file):
    """Write an index to a text file, one ``term : value`` line per entry.

    Args:
        index: Mapping from term to its postings. Each value is written using
            its default string formatting.
        file: Path of the output file; existing content is overwritten.

    Returns:
        None. A confirmation message naming the output file is printed.
    """
    with open(file, 'w') as out:
        out.writelines(
            f"{term} : {docs}\n" for term, docs in index.items()
        )
    print(f"Index saved to {file}")

In [91]:
# File names from the corpus directory, ordered by the integer parsed from each
# name; both index builders below iterate over this list.
files = get_files(abstract)

# Creating Inverted Index

In [93]:
def build_inv_index(dir, files):
    """Build an inverted index mapping each token to the documents containing it.

    Args:
        dir: Path of the folder holding the documents.
        files: File names inside ``dir``. The document id of a file is the
            integer left after removing ``'.txt'`` from its name.

    Returns:
        dict: Keys are tokens produced by ``word_tokenize`` in sorted order.
        Each value is a list of document ids, in the order the documents were
        processed, with no id repeated.
    """
    postings_by_term = {}
    for name in files:
        with open(os.path.join(dir, name), 'r') as handle:
            tokens = word_tokenize(handle.read())
        doc_id = int(name.replace('.txt', ''))
        for token in tokens:
            postings = postings_by_term.setdefault(token, [])
            # Record each document once per token, however often it occurs.
            if doc_id not in postings:
                postings.append(doc_id)
    return dict(sorted(postings_by_term.items()))

In [97]:
# Build the inverted index over the corpus, report how many distinct terms it
# holds, and write it to disk as text.
inv_index = build_inv_index(abstract, files)
print("Terms in inverted index:", len(inv_index))
save_index(inv_index, "inverted_index.txt")

Terms in inverted index: 4227
Index saved to inverted_index.txt


In [101]:
# Search for a term in the inverted index.
def search(index, term):
    """Print the index entry for the stem of ``term``, if there is one.

    The term is stemmed with a ``PorterStemmer`` before the lookup. The
    printed message always quotes the term as it was passed in, not its stem.

    Args:
        index: Mapping from stemmed term to its postings.
        term: Query word to look up.

    Returns:
        None. The outcome is printed.
    """
    key = PorterStemmer().stem(term)
    if key not in index:
        print(f"No results for '{term}'")
        return
    print(f"Results for '{term}': {index[key]}")

In [103]:
# Sample lookup: search() stems the query before checking the inverted index.
search(inv_index, "autoencoders")

Results for 'autoencoders': [187, 273, 279, 325, 333, 405]


# Creating Positional Index

In [105]:
def build_pos_index(dir, files):
    """Build a positional index: token -> document id -> token positions.

    Positions are zero-based offsets into the token sequence that
    ``word_tokenize`` returns for the file as it is stored on disk.

    Args:
        dir: Path of the folder holding the documents.
        files: File names inside ``dir``. The document id of a file is the
            integer left after removing ``'.txt'`` from its name.

    Returns:
        dict: Keys are tokens in sorted order. Each value is a dict mapping a
        document id to the list of positions where the token occurs in that
        document, in increasing order.
    """
    positions_by_term = {}
    for name in files:
        doc_id = int(name.replace('.txt', ''))
        with open(os.path.join(dir, name), 'r') as handle:
            tokens = word_tokenize(handle.read())
        for offset, token in enumerate(tokens):
            # Create the per-term and per-document containers on first use.
            per_doc = positions_by_term.setdefault(token, {})
            per_doc.setdefault(doc_id, []).append(offset)
    return dict(sorted(positions_by_term.items()))

In [107]:
# Build the positional index over the same file list, report how many distinct
# terms it holds, and write it to disk as text.
pos_index = build_pos_index(abstract, files)
print("Terms in positional index:", len(pos_index))
save_index(pos_index, "positional_index.txt")

Terms in positional index: 4227
Index saved to positional_index.txt
