In [1]:
!pip install nltk jellyfish pyspellchecker

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# TEXT PREPROCESSING PIPELINE
 
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import jellyfish
from spellchecker import SpellChecker

# download nltk data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    print("\n Original text snippet:", text[:60], "...")
    #tokenize
    tokens = nltk.word_tokenize(text.lower())
    # remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered = [w for w in tokens if w.isalnum() and w not in stop_words]

    # stemming
    ps = PorterStemmer()
    stemmed = [ps.stem(w) for w in filtered]

    # soundex encoding
    soundex_codes = [jellyfish.soundex(w) for w in stemmed]

    return stemmed, soundex_codes

def read_corpus(folder_path):
    corpus = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):   # only text files
            filepath = os.path.join(folder_path, filename)
            print(f"\n Processing file: {filename}")
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()
                processed, soundexed = preprocess_text(text)
                corpus[filename] = {
                    "original": text,
                    "processed": processed,
                    "soundex": soundexed
                }
    return corpus


#SPELLING CORRECTION

def correct_query(query, vocab):
    spell = SpellChecker()
    tokens = nltk.word_tokenize(query.lower())
    corrected = []

    for word in tokens:
        if word not in vocab:  
            suggestion = spell.correction(word)
            if suggestion is None:  # fallback using levenshtein distance
                # find word in vocab with min distance
                min_dist = float("inf")
                best = word
                for v in vocab:
                    d = jellyfish.levenshtein_distance(word, v)
                    if d < min_dist:
                        min_dist = d
                        best = v
                corrected.append(best)
            else:
                corrected.append(suggestion)
        else:
            corrected.append(word)
    return corrected



if __name__ == "__main__":
    folder = "Corpus"  # folder name here
    data = read_corpus(folder)

    # build vocab from all processed words
    vocab = set()
    for doc in data.values():
        vocab.update(doc["processed"])

    # testing query correction
    query = "enviroment protecion"
    print("original query:", query)
    print("corrected query:", correct_query(query, vocab))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navu2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\navu2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navu2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



 Processing file: Adobe.txt

 Original text snippet: what is adobe?

The company was founded in 1982 by John Warn ...

 Processing file: Amazon.txt

 Original text snippet: What is amazon?

Amazon.com, online retailer, manufacturer o ...

 Processing file: apple.txt

 Original text snippet: what is apple?

Apple Inc. is an American multinational tech ...

 Processing file: Binance.txt

 Original text snippet: What is binance?

The Binance Exchange is a leading cryptocu ...

 Processing file: bing.txt

 Original text snippet: What Is Bing and How to Use It
Google isn't the only search  ...

 Processing file: blackberry.txt

 Original text snippet: what is blackberry?

BlackBerry is a brand of smartphones, t ...

 Processing file: canva.txt

 Original text snippet: What is Canva? A guide to the graphic design platform's feat ...

 Processing file: Dell.txt

 Original text snippet: What is dell?

The company, first named PC’s Limited, was fo ...

 Processing file: Discord.txt

 Original 

In [None]:
# INVERTED INDEX IMPLEMENTATION

import json
import pickle
from collections import defaultdict, Counter

def build_inverted_index(corpus_data):
    print("\n=== BUILDING INVERTED INDEX ===")
    
    # initialize the inverted index
    inverted_index = defaultdict(lambda: {"df": 0, "postings": []})
    
    # process each document
    for doc_id, doc_info in corpus_data.items():
        processed_terms = doc_info["processed"]  # Get stemmed tokens
        
        term_freq = Counter(processed_terms)
        
        print(f"Processing {doc_id}: {len(term_freq)} unique terms")
        
        # Add to inverted index
        for term, freq in term_freq.items():
            inverted_index[term]["postings"].append((doc_id, freq))
            inverted_index[term]["df"] += 1
    
    # Convert defaultdict to regular dict for cleaner output
    inverted_index = dict(inverted_index)
    
    print(f"Index built with {len(inverted_index)} unique terms")
    return inverted_index


def display_index_stats(inverted_index):
    """Display statistics about the inverted index."""
    print("\n=== INDEX STATISTICS ===")
    print(f"Total unique terms: {len(inverted_index)}")
    
    # Find terms with highest document frequency
    top_terms = sorted(inverted_index.items(), 
                      key=lambda x: x[1]["df"], 
                      reverse=True)[:10]
    
    print("\nTop 10 terms by document frequency:")
    for term, info in top_terms:
        print(f"  {term}: appears in {info['df']} documents")
    
    # Sample postings for first few terms
    print("\nSample postings (first 3 terms):")
    for i, (term, info) in enumerate(list(inverted_index.items())[:3]):
        print(f"  {term}:")
        print(f"    df: {info['df']}")
        print(f"    postings: {info['postings']}")


def save_index_json(inverted_index, filename="inverted_index.json"):
    print(f"\n=== SAVING INDEX TO {filename} ===")
    
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(inverted_index, f, indent=2, ensure_ascii=False)
        print(f"Index successfully saved to {filename}")
        
        # Display file size
        import os
        file_size = os.path.getsize(filename) / 1024  # KB
        print(f"File size: {file_size:.2f} KB")
        
    except Exception as e:
        print(f"Error saving index: {e}")


def load_index_json(filename="inverted_index.json"):
    print(f"\n=== LOADING INDEX FROM {filename} ===")
    
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            inverted_index = json.load(f)
        
        print(f"Index successfully loaded from {filename}")
        print(f"Loaded {len(inverted_index)} terms")
        return inverted_index
        
    except FileNotFoundError:
        print(f"Error: {filename} not found")
        return None
    except Exception as e:
        print(f"Error loading index: {e}")
        return None


def save_index_pickle(inverted_index, filename="inverted_index.pkl"):
    print(f"\n=== SAVING INDEX TO {filename} (PICKLE) ===")
    
    try:
        with open(filename, 'wb') as f:
            pickle.dump(inverted_index, f)
        print(f"Index successfully saved to {filename}")
        
        # Display file size
        import os
        file_size = os.path.getsize(filename) / 1024  # KB
        print(f"File size: {file_size:.2f} KB")
        
    except Exception as e:
        print(f"Error saving index: {e}")


def load_index_pickle(filename="inverted_index.pkl"):
    print(f"\n=== LOADING INDEX FROM {filename} (PICKLE) ===")
    
    try:
        with open(filename, 'rb') as f:
            inverted_index = pickle.load(f)
        
        print(f"Index successfully loaded from {filename}")
        print(f"Loaded {len(inverted_index)} terms")
        return inverted_index
        
    except FileNotFoundError:
        print(f"Error: {filename} not found")
        return None
    except Exception as e:
        print(f"Error loading index: {e}")
        return None


def search_term_in_index(inverted_index, term):
    if term in inverted_index:
        info = inverted_index[term]
        print(f"\nTerm: '{term}'")
        print(f"Document frequency: {info['df']}")
        print(f"Postings: {info['postings']}")
    else:
        print(f"\nTerm '{term}' not found in index")


if __name__ == "__main__":
    
    # Build the inverted index
    print("Building inverted index from corpus data...")
    inverted_index = build_inverted_index(data)  # 'data' from preprocessing step
    
    # Display statistics
    display_index_stats(inverted_index)
    
    # Save the index (both formats)
    save_index_json(inverted_index)
    save_index_pickle(inverted_index)
    
    # Test loading
    print("\n" + "="*50)
    print("TESTING LOAD FUNCTIONALITY")
    
    loaded_index = load_index_json()
    if loaded_index:
        print(f"Loaded index has {len(loaded_index)} terms")
    
    # Test search functionality
    print("\n" + "="*50)
    print("TESTING SEARCH FUNCTIONALITY")
    
    test_terms = ["environ", "protect", "research"] 
    for term in test_terms:
        search_term_in_index(inverted_index, term)

Building inverted index from corpus data...

=== BUILDING INVERTED INDEX ===
Processing Adobe.txt: 309 unique terms
Processing Amazon.txt: 292 unique terms
Processing apple.txt: 225 unique terms
Processing Binance.txt: 244 unique terms
Processing bing.txt: 262 unique terms
Processing blackberry.txt: 344 unique terms
Processing canva.txt: 185 unique terms
Processing Dell.txt: 405 unique terms
Processing Discord.txt: 244 unique terms
Processing flipkart.txt: 238 unique terms
Processing google.txt: 511 unique terms
Processing HP.txt: 282 unique terms
Processing huawei.txt: 271 unique terms
Processing instagram.txt: 247 unique terms
Processing Lenovo.txt: 237 unique terms
Processing levis.txt: 219 unique terms
Processing messenger.txt: 285 unique terms
Processing microsoft.txt: 358 unique terms
Processing motorola.txt: 348 unique terms
Processing nike.txt: 370 unique terms
Processing nokia.txt: 242 unique terms
Processing Ola.txt: 375 unique terms
Processing operating.txt: 243 unique terms

In [4]:
import json
import math

def load_index_json(filename="inverted_index.json"):
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def compute_doc_vector_lengths(inverted_index):
    
    doc_vectors = {}  
    for term, info in inverted_index.items():
        for doc_id, tf in info['postings']:
            if doc_id not in doc_vectors:
                doc_vectors[doc_id] = {}
            weight = 1 + math.log10(tf) if tf > 0 else 0
            doc_vectors[doc_id][term] = weight

    doc_lengths = {}
    for doc_id, vec in doc_vectors.items():
        length = math.sqrt(sum(w**2 for w in vec.values()))
        doc_lengths[doc_id] = length
    return doc_lengths

def save_doc_lengths(doc_lengths, filename="doc_lengths.json"):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(doc_lengths, f, indent=2)

inverted_index = load_index_json("inverted_index.json")
doc_lengths = compute_doc_vector_lengths(inverted_index)
save_doc_lengths(doc_lengths, "doc_lengths.json")
print("Document vector lengths saved to doc_lengths.json")

Document vector lengths saved to doc_lengths.json


In [5]:
import json
import math
from collections import Counter
from IPython.display import display, Markdown
import os

# --- Helper functions ---
def preprocess_query(query):
    if isinstance(query, list):
        tokens = query
    else:
        tokens, _ = preprocess_text(query)   # uses earlier preprocessing pipeline
    return tokens

def load_index_json(filename="inverted_index.json"):
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_doc_lengths(filename="doc_lengths.json"):
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def build_vocab_from_index(inverted_index):
    return set(inverted_index.keys())

def search_vsm(query, inverted_index, doc_lengths, top_k=10):
    N = len(doc_lengths)
    query_terms = preprocess_query(query)
    tf_query = Counter(query_terms)

    query_weights = {}
    for term, tf in tf_query.items():
        if term in inverted_index:
            df = inverted_index[term]['df']
            idf = math.log10(N / df) if df > 0 else 0
            weight = (1 + math.log10(tf)) * idf
            query_weights[term] = weight
        else:
            query_weights[term] = 0

    qlen = math.sqrt(sum(w**2 for w in query_weights.values()))
    if qlen > 0:
        for term in query_weights:
            query_weights[term] /= qlen

    scores = {}
    for term, q_weight in query_weights.items():
        if term in inverted_index:
            for doc_id, tf in inverted_index[term]['postings']:
                d_weight = 1 + math.log10(tf) if tf > 0 else 0
                d_weight /= doc_lengths[doc_id] if doc_lengths[doc_id] > 0 else 1
                scores[doc_id] = scores.get(doc_id, 0) + q_weight * d_weight

    ranked = sorted(scores.items(), key=lambda x: (-x[1], x[0]))
    return ranked[:top_k]

# --- Load index and lengths ---
inverted_index = load_index_json("inverted_index.json")
doc_lengths = load_doc_lengths("doc_lengths.json")
vocab = build_vocab_from_index(inverted_index)

# Directly map doc_id to actual corpus file path
corpus_dir = "corpus"
doc_map = {fname: os.path.join(corpus_dir, fname) for fname in os.listdir(corpus_dir) if fname.endswith(".txt")}

# --- Query interface ---
query = input("Enter your search query: ")
print("user query:", query)
if query:
    # Spell correction
    corrected_tokens = correct_query(query, vocab)
    corrected_query = " ".join(corrected_tokens)
    print("Corrected query:", corrected_query)

    results = search_vsm(corrected_tokens, inverted_index, doc_lengths, top_k=10)

    display(Markdown("## Top 10 Results"))
    for doc_id, score in results:
        link = doc_map.get(doc_id, None)
        if link:
            display(Markdown(f"**[{doc_id}]({link})** — Score: `{score:.4f}`"))
        else:
            display(Markdown(f"**{doc_id}** — Score: `{score:.4f}` (no file found)"))

user query: The company is headquartered in Mountaine View, Californea
Corrected query: the company is headquartered in mountain view , california


## Top 10 Results

**[google.txt](corpus\google.txt)** — Score: `0.0729`

**[Adobe.txt](corpus\Adobe.txt)** — Score: `0.0494`

**[Discord.txt](corpus\Discord.txt)** — Score: `0.0379`

**[yahoo.txt](corpus\yahoo.txt)** — Score: `0.0354`

**[apple.txt](corpus\apple.txt)** — Score: `0.0330`

**[reddit.txt](corpus\reddit.txt)** — Score: `0.0308`

**[HP.txt](corpus\HP.txt)** — Score: `0.0307`

**[youtube.txt](corpus\youtube.txt)** — Score: `0.0294`

**[whatsapp.txt](corpus\whatsapp.txt)** — Score: `0.0246`

**[zomato.txt](corpus\zomato.txt)** — Score: `0.0227`