# Preprocessing

In [None]:
import os
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# Function to convert text to lowercase
def convert_to_lowercase(directory):
    """Lowercase every file in ``directory``, overwriting each file in place.

    Entries are visited in sorted name order. Anything that is not a regular
    file (for example a ``.ipynb_checkpoints`` sub-directory) is skipped,
    because calling ``open()`` on a directory raises and would abort the run
    part-way through the corpus.

    Files are read and written with the platform's default text encoding.

    Args:
        directory: Path of the folder whose files are rewritten.

    Raises:
        OSError: If ``directory`` cannot be listed, or a file cannot be read
            or written.
    """
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories and other non-file entries are not documents.
            continue
        with open(file_path, 'r') as file:
            content = file.read()
        with open(file_path, 'w') as file:
            file.write(content.lower())
    print("All documents have been converted to lowercase.")

In [5]:
# Function to load stopwords from a file
def load_stopwords(stopword_file):
    """Read a stopword list that stores one entry per line.

    Surrounding whitespace is trimmed from each line, and lines that are
    empty after trimming are dropped.

    Args:
        stopword_file: Path of the text file to read.

    Returns:
        list[str]: The stopwords, in the order they appear in the file.
    """
    with open(stopword_file, 'r') as handle:
        trimmed_lines = (raw_line.strip() for raw_line in handle)
        # Materialise inside the `with` so the file is still open while read.
        stopwords = [entry for entry in trimmed_lines if entry]
    print("Stopwords array created!")
    return stopwords

In [7]:
# Function to remove stopwords from documents
def remove_stopwords(directory, stopwords):
    """Delete space-delimited stopwords from every file in ``directory``.

    A stopword is removed when it has a space on both sides; the pair of
    spaces collapses to one, exactly as replacing ``" word "`` with ``" "``
    would. Unlike a plain ``str.replace``, runs of consecutive stopwords
    (``" the the the "``) are removed completely: the leading space is only
    looked at, not consumed, so it can also delimit the next match.

    Each file is overwritten in place. Entries that are not regular files
    (for example a ``.ipynb_checkpoints`` sub-directory) are skipped.

    Args:
        directory: Path of the folder whose files are rewritten.
        stopwords: Iterable of words to remove; each is matched literally.

    Raises:
        OSError: If ``directory`` cannot be listed, or a file cannot be read
            or written.
    """
    # "(?<= )word " matches the word plus its trailing space, but only when a
    # space precedes it. Built once here rather than once per file.
    patterns = [
        re.compile(r'(?<= )' + re.escape(word) + ' ')
        for word in stopwords
    ]
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as file:
            content = file.read()
        for pattern in patterns:
            content = pattern.sub('', content)
        with open(file_path, 'w') as file:
            file.write(content)
    print("Stopwords removed from all documents!")

In [9]:
# Function to handle edge cases using regex
def handle_edge_cases(directory, stopwords):
    """Replace whole-word stopword occurrences with a single space.

    Each stopword is matched between ``\\b`` word boundaries, so occurrences
    at the start or end of the text, or next to punctuation or newlines, are
    caught as well. Stopwords are applied one after another in the order
    given, and every match is replaced by one space.

    Each file is overwritten in place. Entries that are not regular files
    (for example a ``.ipynb_checkpoints`` sub-directory) are skipped.

    Args:
        directory: Path of the folder whose files are rewritten.
        stopwords: Iterable of words to remove; each is matched literally.

    Raises:
        OSError: If ``directory`` cannot be listed, or a file cannot be read
            or written.
    """
    # The patterns do not depend on the file, so build them once up front.
    patterns = [
        re.compile(r'\b' + re.escape(word) + r'\b')
        for word in stopwords
    ]
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as file:
            content = file.read()
        for pattern in patterns:
            content = pattern.sub(' ', content)
        with open(file_path, 'w') as file:
            file.write(content)
    print("Edge cases handled!")

In [11]:
# Function to remove punctuations from documents
def remove_punctuations(directory):
    """Collapse every run of non-letter characters to a single space.

    "Non-letter" means anything outside ``A-Z`` / ``a-z``. Besides
    punctuation, this also strips digits, whitespace runs (including
    newlines) and non-ASCII characters.

    Each file is overwritten in place. Entries that are not regular files
    (for example a ``.ipynb_checkpoints`` sub-directory) are skipped.

    Args:
        directory: Path of the folder whose files are rewritten.

    Raises:
        OSError: If ``directory`` cannot be listed, or a file cannot be read
            or written.
    """
    non_letters = re.compile(r'[^A-Za-z]+')
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as file:
            content = file.read()
        with open(file_path, 'w') as file:
            file.write(non_letters.sub(' ', content))
    print("Punctuations removed from all documents!")

In [13]:
# Function to apply Porter Stemmer to documents
def apply_stemming(directory):
    """Tokenize each file, stem the tokens, and write them back in place.

    The text is split with ``word_tokenize``. Tokens of length 1 are
    discarded, the remaining tokens are stemmed with ``PorterStemmer``, and
    the stems are joined with single spaces. The result replaces the file's
    previous content.

    Entries that are not regular files (for example a ``.ipynb_checkpoints``
    sub-directory) are skipped, because ``open()`` on a directory raises and
    would abort the run part-way through the corpus.

    Args:
        directory: Path of the folder whose files are rewritten.

    Raises:
        OSError: If ``directory`` cannot be listed, or a file cannot be read
            or written.
    """
    stemmer = PorterStemmer()
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as file:
            content = file.read()
        tokens = word_tokenize(content)
        stemmed_tokens = [stemmer.stem(word) for word in tokens if len(word) > 1]
        with open(file_path, 'w') as file:
            file.write(' '.join(stemmed_tokens))
    print("Porter Stemmer applied successfully!")

In [15]:
# Main function to execute all preprocessing steps
def preprocess_documents(directory, stopword_file):
    """Run the full preprocessing pipeline over ``directory``.

    Stages run in a fixed order: lowercase, load the stopword list, remove
    space-delimited stopwords, remove remaining whole-word stopwords, strip
    non-letters, then stem. Each stage rewrites the files on disk, so later
    stages see the output of earlier ones.

    Args:
        directory: Path of the folder holding the documents.
        stopword_file: Path of the stopword list (one word per line).
    """
    convert_to_lowercase(directory)
    stop_terms = load_stopwords(stopword_file)

    # Stages that need the stopword list.
    for stage in (remove_stopwords, handle_edge_cases):
        stage(directory, stop_terms)

    # Stages that only need the folder.
    for stage in (remove_punctuations, apply_stemming):
        stage(directory)

In [17]:
# Corpus folder and stopword list; both paths are relative, so they resolve
# against the notebook's current working directory.
abstract = 'Abstracts'
stopword = 'Stopword-List.txt'
preprocess_documents(directory=abstract, stopword_file=stopword)

All documents have been converted to lowercase.
Stopwords array created!
Stopwords removed from all documents!
Edge cases handled!
Punctuations removed from all documents!
Porter Stemmer applied successfully!


# Creating Inverted Index

In [44]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict

In [46]:
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases (3.9+)
# load it from the 'punkt_tab' package, older ones from 'punkt', so request
# both. NOTE(review): nltk.download() is expected to report an unknown
# package id and return False rather than raise - confirm on the oldest
# NLTK version you support.
nltk.download('punkt')
nltk.download('punkt_tab')

# Folder holding the preprocessed documents, named "<doc id>.txt".
abstracts = 'Abstracts'

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Muneeb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
# Document file names ("<n>.txt"), ordered by their integer value so that
# "10.txt" sorts after "9.txt".
files = sorted(
    (entry for entry in os.listdir(abstracts) if entry.endswith('.txt')),
    key=lambda entry: int(entry.replace('.txt', '')),
)

In [50]:
#Build inverted index
# Maps each token to the list of ids of the documents that contain it.
inverted_index = defaultdict(list)
for file in files:
    doc_id = int(file.replace('.txt', ''))
    with open(os.path.join(abstracts, file), 'r') as f:
        tokens = word_tokenize(f.read())
    for word in tokens:
        postings = inverted_index[word]
        # `files` is ordered by document id, so if doc_id is already in this
        # posting list it must be the last element. Checking only that
        # element avoids a linear scan of the list for every token.
        if not postings or postings[-1] != doc_id:
            postings.append(doc_id)

In [52]:
#Save sorted inverted index to file
# One line per term, in sorted term order: "<term>: <id>, <id>, ..."
with open('inverted_index.txt', 'w') as f:
    f.writelines(
        f"{word}: {', '.join(str(doc_id) for doc_id in inverted_index[word])}\n"
        for word in sorted(inverted_index)
    )

In [54]:
# Vocabulary size: the number of distinct tokens used as keys in the inverted index.
print("Total terms in index:", len(inverted_index))

Total terms in index: 4211


In [56]:
#Query processing with Porter Stemmer
# The documents were stemmed during preprocessing, so the query term has to
# be stemmed the same way before it is looked up.
porter_stemmer = PorterStemmer()
query = "autoencoders"
stemmed_query = porter_stemmer.stem(query)
print(f"Stemmed Query: {stemmed_query}")

# Test membership first: indexing a defaultdict with a missing key would
# insert an empty posting list for it.
if stemmed_query not in inverted_index:
    print("No results found")
else:
    print("Query result:", inverted_index[stemmed_query])

Stemmed Query: autoencod
Query result: [187, 273, 279, 325, 333, 405]


# Creating Positional Index

In [58]:
#Sort files by their numeric prefix
# Keep only the ".txt" entries and order them by the integer in the name,
# not lexically.
files = sorted(
    filter(lambda name: name.endswith('.txt'), os.listdir(abstracts)),
    key=lambda name: int(name.replace('.txt', '')),
)

In [60]:
#Build the positional index
# term -> {doc_id -> [0-based token offsets of the term in that document]}
positional_index = defaultdict(lambda: defaultdict(list))

for file in files:
    doc_id = int(file.replace('.txt', ''))
    with open(os.path.join(abstracts, file), 'r') as f:
        terms = word_tokenize(f.read())
    # enumerate supplies the running token offset for each term.
    for position, term in enumerate(terms):
        positional_index[term][doc_id].append(position)

In [62]:
#save the positional index to a file
# One line per term, in sorted term order: "<term>: {doc_id: [positions], ...}"
with open('positional_Index.txt', 'w') as f:
    f.writelines(
        f"{term}: {dict(positional_index[term])}\n"
        for term in sorted(positional_index)
    )
print("Positional Index Saved")

Positional Index Saved


In [64]:
# Vocabulary size: the number of distinct tokens used as keys in the positional index.
print("Total terms in index:", len(positional_index))

Total terms in index: 4211
