# Preprocessing

In [1]:
import os
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [19]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Muneeb\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Muneeb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Function to convert text to lowercase
def convert_to_lowercase(directory):
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r') as file:
            content = file.read()
        content = content.lower()
        with open(file_path, 'w') as file:
            file.write(content)
    print("All documents have been converted to lowercase.")

In [5]:
# Function to load stopwords from a file
def load_stopwords(stopword_file):
    stopwords = []
    with open(stopword_file, 'r') as file:
        for line in file:
            cleaned_word = line.strip()
            if cleaned_word:
                stopwords.append(cleaned_word)
    print("Stopwords array created!")
    return stopwords

In [7]:
# Function to remove stopwords from documents
def remove_stopwords(directory, stopwords):
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r') as file:
            content = file.read()
        for word in stopwords:
            content = content.replace(f' {word} ', ' ')
        with open(file_path, 'w') as file:
            file.write(content)
    print("Stopwords removed from all documents!")

In [23]:
# Function to remove punctuations from documents
def remove_punctuations(directory):
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r') as file:
            content = file.read()
        content = re.sub(r'[^A-Za-z]+', ' ', content)
        with open(file_path, 'w') as file:
            file.write(content)
    print("Punctuations removed from all documents!")

In [21]:
# Function to apply Lemmatization to documents 
def apply_lemmatization(directory):
    lemmatizer = WordNetLemmatizer()
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        tokens = word_tokenize(content)
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if len(word) > 1]
        lemmatized_content = ' '.join(lemmatized_tokens)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(lemmatized_content)
    print("Lemmatization applied successfully!")

In [25]:
# Main function to execute all preprocessing steps
def preprocess_documents(directory, stopword_file):
    convert_to_lowercase(directory)
    stopwords = load_stopwords(stopword_file)
    remove_stopwords(directory, stopwords)
    remove_punctuations(directory)
    apply_lemmatization(directory)

In [27]:
abstract='Abstracts'
stopword='Stopword-List.txt'
preprocess_documents(abstract, stopword)

All documents have been converted to lowercase.
Stopwords array created!
Stopwords removed from all documents!
Punctuations removed from all documents!
Lemmatization applied successfully!


# Creating Indexes

In [32]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

In [34]:
lemmatizer = WordNetLemmatizer()

In [36]:
def get_files(dir):
    files = os.listdir(dir)
    pairs = [(int(f.replace('.txt', '')), f) for f in files]
    pairs.sort()
    return [p[1] for p in pairs]

def save_index(index, file):
    with open(file, 'w') as f:
        for term, docs in index.items():
            f.write(f"{term} : {docs}\n")
    print(f"Index saved to {file}")

In [46]:
#1 Inverted Index
def build_inv_index(dir, files):
    inv_index = {}
    for f in files:
        with open(os.path.join(dir, f), 'r') as file:
            text = file.read()
        words = word_tokenize(text)
        doc_id = int(f.replace('.txt', ''))
        for word in words:
            if word not in inv_index:
                inv_index[word] = [doc_id]
            elif doc_id not in inv_index[word]:
                inv_index[word].append(doc_id)
    return dict(sorted(inv_index.items()))

In [48]:
#2. Term-to-Index Mapping for Vector Space
def build_term_index_map(dir, files):
    term_index = {}
    vocab = set()
    
    # First pass to build vocabulary
    for f in files:
        with open(os.path.join(dir, f), 'r') as file:
            text = file.read()
        words = word_tokenize(text)
        vocab.update(words)
    
    # Create term to index mapping
    for idx, term in enumerate(sorted(vocab)):
        term_index[term] = idx
    
    return term_index

In [50]:
# 3. Vector Space Index (tf-idf vectors)
def build_vector_space_index(dir, files, term_index):
    num_docs = len(files)
    vocab_size = len(term_index)
    doc_vectors = []
    df = {}  # document frequency
    
    # Initialize document frequency counts
    for term in term_index:
        df[term] = 0
    
    # First pass to calculate document frequencies
    for f in files:
        with open(os.path.join(dir, f), 'r') as file:
            text = file.read()
        words = word_tokenize(text)
        unique_words = set(words)
        for term in unique_words:
            if term in term_index:
                df[term] += 1
    
    # Second pass to create vectors
    for f in files:
        doc_id = int(f.replace('.txt', ''))
        with open(os.path.join(dir, f), 'r') as file:
            text = file.read()
        words = word_tokenize(text)
        
        # Initialize document vector
        vector = np.zeros(vocab_size)
        tf = {}  # term frequency in this doc
        
        # Calculate term frequencies
        for word in words:
            if word in term_index:
                if word in tf:
                    tf[word] += 1
                else:
                    tf[word] = 1
        
        # Calculate tf-idf weights
        for term, freq in tf.items():
            term_idx = term_index[term]
            idf = np.log(num_docs / (1 + df[term]))  # smoothed idf
            vector[term_idx] = freq * idf
        
        doc_vectors.append(vector)
    
    return np.array(doc_vectors)

In [52]:
abstract = 'Abstracts'
files = get_files(abstract)

In [54]:
# 1. Building inverted index
inv_index = build_inv_index(abstract, files)
print("Terms in inverted index:", len(inv_index))
save_index(inv_index, "inverted_index.txt")

Terms in inverted index: 5860
Index saved to inverted_index.txt


In [56]:
# 2. Build term-to-index mapping
term_index = build_term_index_map(abstract, files)
print("Vocabulary size:", len(term_index))
save_index(term_index, "term_index_map.txt")

Vocabulary size: 5860
Index saved to term_index_map.txt


In [58]:
 # 3. Build vector space index
vector_space = build_vector_space_index(abstract, files, term_index)
print("Vector space shape:", vector_space.shape)
np.save("vector_space_index.npy", vector_space)

Vector space shape: (448, 5860)


In [62]:
# Example search in vector space
def vector_space_search(term, term_index, vector_space, files):
    if term in term_index:
        term_idx = term_index[term]
        print(f"\nTerm '{term}' found at index {term_idx}")
        print("Document vectors (tf-idf weights) for this term:")
        for doc_idx in range(len(files)):
            weight = vector_space[doc_idx][term_idx]
            if weight > 0:
                print(f"Doc {files[doc_idx]}: {weight:.4f}")
    else:
        print(f"Term '{term}' not found in vocabulary")

In [70]:
vector_space_search("bootstrap", term_index, vector_space, files)


Term 'bootstrap' found at index 594
Document vectors (tf-idf weights) for this term:
Doc 181.txt: 14.1555
Doc 193.txt: 28.3110
Doc 379.txt: 4.7185
