In [18]:
!pip install nltk jellyfish pyspellchecker




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
# importing libraries 
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import jellyfish
from spellchecker import SpellChecker

# Download the NLTK resources used by the preprocessing code below.
# 'punkt' / 'punkt_tab' are presumably the tokenizer data that
# nltk.word_tokenize relies on (which one is needed depends on the installed
# NLTK version -- TODO confirm); 'stopwords' backs stopwords.words("english").
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# --- TEXT PREPROCESSING PIPELINE ---

def preprocess_text(text, verbose=True):
    """Tokenize, filter, stem and Soundex-encode a piece of text.

    Pipeline:
      1. lower-case the text and tokenize it with ``nltk.word_tokenize``;
      2. keep only alphanumeric tokens that are not English stopwords;
      3. stem every remaining token with the Porter stemmer;
      4. Soundex-encode every stem with ``jellyfish.soundex``.

    Args:
        text: Raw text of a document or a query.
        verbose: When True (the default, which matches the previous
            behaviour) print a short trace of every stage, limited to the
            first 60 characters / first 15 items. Pass False to silence it.

    Returns:
        A tuple ``(stemmed, soundex_codes)``: the list of stemmed tokens and
        the list of their Soundex codes (same length, same order).
    """
    if verbose:
        print("\n Original text snippet:", text[:60], "...")

    # tokenize
    tokens = nltk.word_tokenize(text.lower())
    if verbose:
        print("Tokens:", tokens[:15])  # show first 15 tokens

    # remove stopwords; isalnum() also drops punctuation-only tokens and
    # tokens containing punctuation (e.g. "amazon.com")
    stop_words = set(stopwords.words("english"))
    filtered = [w for w in tokens if w.isalnum() and w not in stop_words]
    if verbose:
        print(" After stopword removal:", filtered[:15])

    # stemming
    ps = PorterStemmer()
    stemmed = [ps.stem(w) for w in filtered]
    if verbose:
        print("  After stemming:", stemmed[:15])

    # soundex encoding (computed on the stems, not on the surface forms)
    soundex_codes = [jellyfish.soundex(w) for w in stemmed]
    if verbose:
        print("  Example Soundex:", soundex_codes[:15])

    return stemmed, soundex_codes


# --- READ CORPUS FILES ---

def read_corpus(folder_path):
    """Read and preprocess every ``.txt`` file directly inside a folder.

    Args:
        folder_path: Directory to scan (sub-directories are not descended).

    Returns:
        dict mapping each file name to a dict with the keys ``"original"``
        (raw file text), ``"processed"`` and ``"soundex"`` (the two lists
        returned by ``preprocess_text`` for that text).

    Raises:
        OSError: if ``folder_path`` cannot be listed or a file cannot be read.
    """
    corpus = {}
    # os.listdir() returns entries in arbitrary order; sort (case-insensitive,
    # with the exact name as tie-breaker) so document order is reproducible.
    for filename in sorted(os.listdir(folder_path), key=lambda name: (name.lower(), name)):
        filepath = os.path.join(folder_path, filename)
        # only regular text files -- a directory named "*.txt" would make open() fail
        if not filename.endswith(".txt") or not os.path.isfile(filepath):
            continue

        print(f"\n Processing file: {filename}")
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()

        # The file is already closed here; preprocessing only needs the text.
        processed, soundexed = preprocess_text(text)
        corpus[filename] = {
            "original": text,
            "processed": processed,
            "soundex": soundexed
        }
    return corpus


# --- SPELLING CORRECTION FEATURE ---

def correct_query(query, vocab):
    """Spell-correct the words of a query against the corpus vocabulary.

    The query is lower-cased and tokenized with ``nltk.word_tokenize``.
    Each token is then handled as follows:

    * tokens without any letter (punctuation, numbers) are kept unchanged --
      there is nothing to spell-check;
    * tokens that are in ``vocab``, or whose Porter stem is in ``vocab``, are
      kept unchanged (the vocabulary in this notebook is built from stemmed
      tokens, so the raw surface form alone would rarely match);
    * every other token is replaced by ``SpellChecker.correction``; if that
      yields no suggestion, by the vocabulary entry with the smallest
      Levenshtein distance (ties resolved alphabetically); if the vocabulary
      is empty as well, the token is kept as typed.

    Args:
        query: Raw query string.
        vocab: Collection of known (stemmed) terms.

    Returns:
        List of tokens, one per input token, in the original order.
    """
    spell = SpellChecker()
    stemmer = PorterStemmer()
    # Sorted once so that Levenshtein ties are broken the same way on every
    # run (iteration order of a set of strings is not stable across runs).
    candidates = sorted(vocab)
    corrected = []

    for word in nltk.word_tokenize(query.lower()):
        # Punctuation / purely numeric tokens: pass through untouched.
        if not any(ch.isalpha() for ch in word):
            corrected.append(word)
            continue

        # Known either as typed or after stemming.
        if word in vocab or stemmer.stem(word) in vocab:
            corrected.append(word)
            continue

        suggestion = spell.correction(word)
        if suggestion is None and candidates:
            # fallback: closest vocabulary entry by edit distance
            suggestion = min(
                candidates,
                key=lambda v: jellyfish.levenshtein_distance(word, v),
            )
        corrected.append(suggestion if suggestion is not None else word)

    return corrected



if __name__ == "__main__":
    # Preprocess every document found in the corpus folder.
    folder = "Corpus"  # put your folder name here
    data = read_corpus(folder)

    # The vocabulary is the union of the stemmed tokens of all documents.
    vocab = set()
    for doc in data.values():
        vocab |= set(doc["processed"])

    # Exercise the spelling corrector on a deliberately misspelled query.
    query = "enviroment protecion"
    print("original query:", query)
    print("corrected query:", correct_query(query, vocab))



 Processing file: Adobe.txt

 Original text snippet: what is adobe?

The company was founded in 1982 by John Warn ...
Tokens: ['what', 'is', 'adobe', '?', 'the', 'company', 'was', 'founded', 'in', '1982', 'by', 'john', 'warnock', 'and', 'charles']
 After stopword removal: ['adobe', 'company', 'founded', '1982', 'john', 'warnock', 'charles', 'geschke', 'employed', 'xerox', 'corporation', 'palo', 'alto', 'california', 'research']
  After stemming: ['adob', 'compani', 'found', '1982', 'john', 'warnock', 'charl', 'geschk', 'employ', 'xerox', 'corpor', 'palo', 'alto', 'california', 'research']
  Example Soundex: ['A310', 'C515', 'F530', '1000', 'J500', 'W652', 'C640', 'G200', 'E514', 'X620', 'C616', 'P400', 'A430', 'C416', 'R262']

 Processing file: Amazon.txt

 Original text snippet: What is amazon?

Amazon.com, online retailer, manufacturer o ...
Tokens: ['what', 'is', 'amazon', '?', 'amazon.com', ',', 'online', 'retailer', ',', 'manufacturer', 'of', 'electronic', 'book', 'readers', ',']

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Charv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Charv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Charv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


 After stopword removal: ['nike', 'nike', 'champion', 'brand', 'builder', 'advertising', 'bo', 'knows', 'finish', 'line', 'moved', 'beyond', 'advertising', 'popular', 'expression']
  After stemming: ['nike', 'nike', 'champion', 'brand', 'builder', 'advertis', 'bo', 'know', 'finish', 'line', 'move', 'beyond', 'advertis', 'popular', 'express']
  Example Soundex: ['N200', 'N200', 'C515', 'B653', 'B436', 'A316', 'B000', 'K500', 'F520', 'L500', 'M100', 'B530', 'A316', 'P146', 'E216']

 Processing file: nokia.txt

 Original text snippet: what is nokia?

Nokia Corporation is a Finnish multinational ...
Tokens: ['what', 'is', 'nokia', '?', 'nokia', 'corporation', 'is', 'a', 'finnish', 'multinational', 'telecommunications', ',', 'information', 'technology', ',']
 After stopword removal: ['nokia', 'nokia', 'corporation', 'finnish', 'multinational', 'telecommunications', 'information', 'technology', 'consumer', 'electronics', 'company', 'founded', 'nokia', 'main', 'headquarters']
  After stemming

In [20]:
# --- INVERTED INDEX IMPLEMENTATION ---

import json
import pickle
from collections import defaultdict, Counter

def build_inverted_index(corpus_data):
    """Build a term -> {"df", "postings"} index from preprocessed documents.

    Args:
        corpus_data: Mapping of document id to a dict whose ``"processed"``
            entry is the list of that document's terms.

    Returns:
        A plain dict keyed by term. Each value holds ``"df"`` (number of
        documents containing the term) and ``"postings"`` (list of
        ``(doc_id, term_frequency)`` tuples, in document iteration order).
    """
    print("\n=== BUILDING INVERTED INDEX ===")

    index = {}
    for doc_id, doc_info in corpus_data.items():
        # Term frequencies within this single document
        counts = Counter(doc_info["processed"])
        print(f"Processing {doc_id}: {len(counts)} unique terms")

        for term, freq in counts.items():
            entry = index.get(term)
            if entry is None:
                # First time this term is seen anywhere in the corpus
                entry = index[term] = {"df": 0, "postings": []}
            entry["postings"].append((doc_id, freq))
            entry["df"] += 1

    print(f"Index built with {len(index)} unique terms")
    return index


def display_index_stats(inverted_index):
    """Print a short summary of an inverted index.

    Prints the number of terms, up to ten terms with the highest document
    frequency, and the df / postings of the first three terms in the index's
    own iteration order.

    Args:
        inverted_index: Mapping of term to a dict with ``"df"`` and
            ``"postings"`` entries.
    """
    print("\n=== INDEX STATISTICS ===")
    print(f"Total unique terms: {len(inverted_index)}")

    entries = list(inverted_index.items())

    # Highest document frequency first; ties keep index order (stable sort)
    by_df = sorted(entries, key=lambda pair: pair[1]["df"], reverse=True)
    most_common = by_df[:10]

    print("\nTop 10 terms by document frequency:")
    for term, info in most_common:
        print(f"  {term}: appears in {info['df']} documents")

    print("\nSample postings (first 3 terms):")
    for term, info in entries[:3]:
        print(f"  {term}:")
        print(f"    df: {info['df']}")
        print(f"    postings: {info['postings']}")


def save_index_json(inverted_index, filename="inverted_index.json"):
    """Write the inverted index to ``filename`` as indented UTF-8 JSON.

    Any exception raised while writing or while measuring the file is caught
    and reported on stdout instead of being propagated.

    Args:
        inverted_index: The index dictionary to serialise.
        filename: Output filename.
    """
    print(f"\n=== SAVING INDEX TO {filename} ===")

    try:
        with open(filename, mode='w', encoding='utf-8') as out:
            json.dump(inverted_index, out, indent=2, ensure_ascii=False)
        print(f"Index successfully saved to {filename}")

        # Report the on-disk size in kilobytes
        import os
        size_kb = os.path.getsize(filename) / 1024
        print(f"File size: {size_kb:.2f} KB")

    except Exception as err:
        print(f"Error saving index: {err}")


def load_index_json(filename="inverted_index.json"):
    """
    Read an inverted index back from a JSON file.

    Failures are reported on stdout rather than raised.

    Args:
        filename: Input filename

    Returns:
        inverted_index: The loaded index dictionary, or None when the file
        is missing or cannot be read / parsed.
    """
    print(f"\n=== LOADING INDEX FROM {filename} ===")

    try:
        with open(filename, mode='r', encoding='utf-8') as src:
            index = json.load(src)

        print(f"Index successfully loaded from {filename}")
        print(f"Loaded {len(index)} terms")

    except FileNotFoundError:
        print(f"Error: {filename} not found")
        return None
    except Exception as err:
        print(f"Error loading index: {err}")
        return None

    return index


def save_index_pickle(inverted_index, filename="inverted_index.pkl"):
    """
    Serialise the inverted index to a pickle file.

    Any exception raised while writing or while measuring the file is caught
    and reported on stdout instead of being propagated.

    Args:
        inverted_index: The index dictionary
        filename: Output filename
    """
    print(f"\n=== SAVING INDEX TO {filename} (PICKLE) ===")

    try:
        with open(filename, mode='wb') as out:
            pickle.dump(inverted_index, out)
        print(f"Index successfully saved to {filename}")

        # Report the on-disk size in kilobytes
        import os
        size_kb = os.path.getsize(filename) / 1024
        print(f"File size: {size_kb:.2f} KB")

    except Exception as err:
        print(f"Error saving index: {err}")


def load_index_pickle(filename="inverted_index.pkl"):
    """
    Read an inverted index back from a pickle file.

    Failures are reported on stdout rather than raised.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source.

    Args:
        filename: Input filename

    Returns:
        inverted_index: The loaded index dictionary, or None when the file
        is missing or cannot be read / unpickled.
    """
    print(f"\n=== LOADING INDEX FROM {filename} (PICKLE) ===")

    try:
        with open(filename, mode='rb') as src:
            index = pickle.load(src)

        print(f"Index successfully loaded from {filename}")
        print(f"Loaded {len(index)} terms")

    except FileNotFoundError:
        print(f"Error: {filename} not found")
        return None
    except Exception as err:
        print(f"Error loading index: {err}")
        return None

    return index


def search_term_in_index(inverted_index, term):
    """
    Look a single term up in the inverted index and print what is stored.

    Prints the term's document frequency and postings, or a "not found"
    message when the term is not a key of the index.

    Args:
        inverted_index: The index dictionary
        term: Term to search for (looked up exactly as given)
    """
    if term not in inverted_index:
        print(f"\nTerm '{term}' not found in index")
        return

    entry = inverted_index[term]
    print(f"\nTerm: '{term}'")
    print(f"Document frequency: {entry['df']}")
    print(f"Postings: {entry['postings']}")


# --- MAIN EXECUTION CODE ---
if __name__ == "__main__":
    # This cell relies on `data`, the corpus dict produced by the
    # preprocessing step. To run it on its own, rebuild that first:
    #
    #     folder = "Corpus"
    #     data = read_corpus(folder)

    print("Building inverted index from corpus data...")
    inverted_index = build_inverted_index(data)

    display_index_stats(inverted_index)

    # Persist the index in both supported formats
    save_index_json(inverted_index)
    save_index_pickle(inverted_index)

    # Round-trip check: read the JSON copy back
    print("\n" + "=" * 50, "TESTING LOAD FUNCTIONALITY", sep="\n")

    loaded_index = load_index_json()
    if loaded_index:
        print(f"Loaded index has {len(loaded_index)} terms")

    # Look up a few example stemmed terms (replace with terms from your corpus)
    print("\n" + "=" * 50, "TESTING SEARCH FUNCTIONALITY", sep="\n")

    test_terms = ["environ", "protect", "research"]
    for term in test_terms:
        search_term_in_index(inverted_index, term)

Building inverted index from corpus data...

=== BUILDING INVERTED INDEX ===
Processing Adobe.txt: 309 unique terms
Processing Amazon.txt: 292 unique terms
Processing apple.txt: 225 unique terms
Processing Binance.txt: 244 unique terms
Processing bing.txt: 262 unique terms
Processing blackberry.txt: 344 unique terms
Processing canva.txt: 185 unique terms
Processing Dell.txt: 405 unique terms
Processing Discord.txt: 244 unique terms
Processing flipkart.txt: 238 unique terms
Processing google.txt: 511 unique terms
Processing HP.txt: 282 unique terms
Processing huawei.txt: 271 unique terms
Processing instagram.txt: 247 unique terms
Processing Lenovo.txt: 237 unique terms
Processing levis.txt: 219 unique terms
Processing messenger.txt: 285 unique terms
Processing microsoft.txt: 358 unique terms
Processing motorola.txt: 348 unique terms
Processing nike.txt: 370 unique terms
Processing nokia.txt: 242 unique terms
Processing Ola.txt: 375 unique terms
Processing operating.txt: 243 unique terms

In [None]:
import json
import math

def load_index_json(filename="inverted_index.json"):
    """Read the inverted index from a UTF-8 JSON file and return it.

    Errors from ``open()`` or ``json.load()`` propagate to the caller.
    """
    with open(filename, mode='r', encoding='utf-8') as src:
        index = json.load(src)
    return index

def compute_doc_vector_lengths(inverted_index):
    """Compute the Euclidean length of every document's log-tf vector.

    Each posting ``(doc_id, tf)`` contributes the weight ``1 + log10(tf)``
    (0 when ``tf`` is not positive) for that term in that document. A
    document's length is the square root of the sum of its squared weights.

    Args:
        inverted_index: Mapping of term to a dict whose ``'postings'`` entry
            is a sequence of ``(doc_id, tf)`` pairs.

    Returns:
        dict mapping each doc_id that occurs in any posting to its length.
    """
    # doc_id -> {term: log-tf weight}
    weights_by_doc = {}
    for term, entry in inverted_index.items():
        for doc_id, tf in entry['postings']:
            log_tf = 1 + math.log10(tf) if tf > 0 else 0
            weights_by_doc.setdefault(doc_id, {})[term] = log_tf

    return {
        doc_id: math.sqrt(sum(w**2 for w in weights.values()))
        for doc_id, weights in weights_by_doc.items()
    }

def save_doc_lengths(doc_lengths, filename="doc_lengths.json"):
    """Write the doc_id -> vector length mapping to an indented JSON file."""
    with open(filename, mode='w', encoding='utf-8') as out:
        json.dump(doc_lengths, out, indent=2)

# Recompute the per-document vector lengths from the saved index and persist them.
inverted_index = load_index_json(filename="inverted_index.json")
doc_lengths = compute_doc_vector_lengths(inverted_index=inverted_index)
save_doc_lengths(doc_lengths=doc_lengths, filename="doc_lengths.json")
print("Document vector lengths saved to doc_lengths.json")

Document vector lengths saved to doc_lengths.json


In [21]:
import json
import math
from collections import Counter

def preprocess_query(query):
    """Run the query through ``preprocess_text`` and keep only the stemmed tokens."""
    stems, _soundex = preprocess_text(query)
    return stems

def load_index_json(filename="inverted_index.json"):
    """Read the inverted index from a UTF-8 JSON file and return it.

    Errors from ``open()`` or ``json.load()`` propagate to the caller.
    """
    with open(filename, mode='r', encoding='utf-8') as src:
        index = json.load(src)
    return index

def load_doc_lengths(filename="doc_lengths.json"):
    """Read the doc_id -> vector length mapping from a UTF-8 JSON file.

    Errors from ``open()`` or ``json.load()`` propagate to the caller.
    """
    with open(filename, mode='r', encoding='utf-8') as src:
        lengths = json.load(src)
    return lengths

def search_vsm(query, inverted_index, doc_lengths, top_k=10):
    """Rank documents against a free-text query with the vector space model.

    Query side: each term is weighted ``(1 + log10(tf)) * log10(N / df)``,
    where ``N`` is the number of entries in ``doc_lengths``, and the query
    vector is then cosine-normalised. Terms missing from the index get
    weight 0.
    Document side: each term is weighted ``1 + log10(tf)`` and divided by
    the document's precomputed vector length.
    A document's score is the sum, over the query terms it contains, of
    query weight times document weight.

    Note that a term occurring in every document has idf 0, so documents
    matched only through such terms are still returned, with score 0.

    Args:
        query: Raw query string; it is run through ``preprocess_query``.
        inverted_index: Mapping of term to a dict with ``'df'`` and
            ``'postings'`` (sequence of ``(doc_id, tf)`` pairs).
        doc_lengths: Mapping of doc_id to document vector length.
        top_k: Maximum number of results to return.

    Returns:
        List of ``(doc_id, score)`` pairs sorted by descending score, then
        ascending doc_id, truncated to ``top_k``. Empty when there are no
        documents or nothing matches.

    Raises:
        KeyError: if a posting refers to a doc_id absent from ``doc_lengths``.
    """
    N = len(doc_lengths)
    query_terms = preprocess_query(query)

    # With no documents there is nothing to rank, and log10(N / df) below
    # would raise ValueError (log of zero) for any indexed term.
    if N == 0:
        return []

    tf_query = Counter(query_terms)

    # Compute ltc weights for query
    query_weights = {}
    for term, tf in tf_query.items():
        if term in inverted_index:
            df = inverted_index[term]['df']
            idf = math.log10(N / df) if df > 0 else 0
            query_weights[term] = (1 + math.log10(tf)) * idf
        else:
            query_weights[term] = 0

    # Normalize query vector (skipped when every weight is zero)
    qlen = math.sqrt(sum(w**2 for w in query_weights.values()))
    if qlen > 0:
        for term in query_weights:
            query_weights[term] /= qlen

    # Score each document
    scores = {}
    for term, q_weight in query_weights.items():
        if term not in inverted_index:
            continue
        for doc_id, tf in inverted_index[term]['postings']:
            # lnc for document: weight = 1 + log10(tf), cosine-normalised
            d_weight = 1 + math.log10(tf) if tf > 0 else 0
            doc_len = doc_lengths[doc_id]
            # a zero length would divide by zero; leave the weight unscaled
            d_weight /= doc_len if doc_len > 0 else 1
            scores[doc_id] = scores.get(doc_id, 0) + q_weight * d_weight

    # Sort by score desc, then doc_id asc
    ranked = sorted(scores.items(), key=lambda x: (-x[1], x[0]))
    return ranked[:top_k]

# --- Example usage ---
# Read the persisted index and the matching document vector lengths
inverted_index = load_index_json(filename="inverted_index.json")
doc_lengths = load_doc_lengths(filename="doc_lengths.json")

# Ask for a query interactively and rank the documents against it
query = input("Enter your search query: ")
results = search_vsm(query, inverted_index, doc_lengths, top_k=10)

# One line per hit: document id and score to four decimals
print(f"Top results for query: '{query}'")
for doc_id, score in results:
    print(f"{doc_id}: {score:.4f}")


 Original text snippet: mobile and phone ...
Tokens: ['mobile', 'and', 'phone']
 After stopword removal: ['mobile', 'phone']
  After stemming: ['mobil', 'phone']
  Example Soundex: ['M140', 'P500']
Top results for query: 'mobile and phone'
nokia.txt: 0.1660
skype.txt: 0.1429
huawei.txt: 0.1223
whatsapp.txt: 0.1115
messenger.txt: 0.0950
operating.txt: 0.0873
zomato.txt: 0.0820
blackberry.txt: 0.0674
google.txt: 0.0601
instagram.txt: 0.0587
