In [2]:
!pip install nltk jellyfish pyspellchecker

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting jellyfish
  Downloading jellyfish-1.2.0-cp313-cp313-win_amd64.whl.metadata (2.6 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Collecting click (from nltk)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.9.1-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 10.7 MB/s eta 0:00:00
Downloading jellyfish-1.2.0-cp313-cp313-win_amd64.whl (216 kB)
Downloading pys


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# importing libraries 
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import jellyfish
from spellchecker import SpellChecker

# Fetch the NLTK resources this notebook needs, one download call per package.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# --- TEXT PREPROCESSING PIPELINE ---

def preprocess_text(text):
    """Run one document through the preprocessing pipeline.

    Stages, each followed by a progress print of its first 15 items:
    lower-case and tokenise, keep alphanumeric non-stopword tokens,
    Porter-stem them, then Soundex-encode each stem.

    Args:
        text: Raw document text.

    Returns:
        tuple: ``(stems, codes)`` where ``stems`` is the list of stemmed
        tokens and ``codes`` is the Soundex code of each stem, in the same
        order (the two lists have equal length).
    """
    print("\n Original text snippet:", text[:60], "...")

    # Stage 1: tokenise the lower-cased text.
    lowered = text.lower()
    words = nltk.word_tokenize(lowered)
    print("Tokens:", words[:15])  # show first 15 tokens

    # Stage 2: drop punctuation-like tokens and English stopwords.
    english_stops = set(stopwords.words("english"))
    kept = []
    for token in words:
        if not token.isalnum() or token in english_stops:
            continue
        kept.append(token)
    print(" After stopword removal:", kept[:15])

    # Stage 3: reduce every surviving token to its Porter stem.
    stemmer = PorterStemmer()
    stems = list(map(stemmer.stem, kept))
    print("  After stemming:", stems[:15])

    # Stage 4: phonetic (Soundex) code for each stem.
    codes = list(map(jellyfish.soundex, stems))
    print("  Example Soundex:", codes[:15])

    return stems, codes


# --- READ CORPUS FILES ---

def read_corpus(folder_path):
    """Load and preprocess every ``.txt`` file directly inside a folder.

    Args:
        folder_path: Directory that holds the corpus documents.

    Returns:
        dict: Maps each file name to a dict with the keys ``"original"``
        (raw file text), ``"processed"`` and ``"soundex"`` (the two lists
        returned by ``preprocess_text``). Entries are inserted in sorted
        file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    corpus = {}
    # os.listdir() returns entries in arbitrary order; sort them so the
    # corpus (and everything derived from it) is reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        # Only regular text files: a directory named "something.txt" would
        # otherwise make open() fail.
        if not filename.endswith(".txt") or not os.path.isfile(filepath):
            continue

        print(f"\n Processing file: {filename}")
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()

        # The file is already closed here; preprocessing only needs the text.
        processed, soundexed = preprocess_text(text)
        corpus[filename] = {
            "original": text,
            "processed": processed,
            "soundex": soundexed,
        }
    return corpus


# --- SPELLING CORRECTION FEATURE ---

def correct_query(query, vocab):
    """Spell-correct a free-text query against a known vocabulary.

    The query is lower-cased and tokenised with ``nltk.word_tokenize``.
    Tokens already present in ``vocab`` are kept as they are. Any other token
    is replaced by the suggestion from ``SpellChecker.correction``; when that
    returns ``None`` the token falls back to the ``vocab`` entry with the
    smallest Levenshtein distance.

    Args:
        query: Raw query string.
        vocab: Collection of known words. Tokens are looked up in their
            lower-cased, unstemmed form, so entries should be stored the
            same way.

    Returns:
        list[str]: One output token per query token, in query order.
    """
    spell = SpellChecker()
    corrected = []

    for word in nltk.word_tokenize(query.lower()):
        if word in vocab:
            corrected.append(word)
            continue

        suggestion = spell.correction(word)
        if suggestion is None:
            # Fallback: closest vocabulary entry by edit distance. Ranking on
            # (distance, entry) breaks ties alphabetically, so the result does
            # not depend on the iteration order of ``vocab`` (sets of strings
            # iterate in a different order from run to run). With an empty
            # vocabulary the token is returned unchanged.
            suggestion = min(
                vocab,
                key=lambda candidate: (
                    jellyfish.levenshtein_distance(word, candidate),
                    candidate,
                ),
                default=word,
            )
        corrected.append(suggestion)
    return corrected



if __name__ == "__main__":
    folder = "Corpus"  # put your folder name here
    data = read_corpus(folder)

    # Build the spelling vocabulary from the unstemmed, lower-cased tokens of
    # each document. correct_query() looks up raw query tokens, so a vocabulary
    # of Porter stems (doc["processed"]) would treat correctly spelled corpus
    # words as unknown and could "correct" a query word into a stem.
    vocab = set()
    for doc in data.values():
        vocab.update(
            token
            for token in nltk.word_tokenize(doc["original"].lower())
            if token.isalnum()
        )

    # testing query correction
    query = "enviroment protecion"
    print("original query:", query)
    print("corrected query:", correct_query(query, vocab))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navu2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\navu2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navu2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



 Processing file: Adobe.txt

 Original text snippet: what is adobe?

The company was founded in 1982 by John Warn ...
Tokens: ['what', 'is', 'adobe', '?', 'the', 'company', 'was', 'founded', 'in', '1982', 'by', 'john', 'warnock', 'and', 'charles']
 After stopword removal: ['adobe', 'company', 'founded', '1982', 'john', 'warnock', 'charles', 'geschke', 'employed', 'xerox', 'corporation', 'palo', 'alto', 'california', 'research']
  After stemming: ['adob', 'compani', 'found', '1982', 'john', 'warnock', 'charl', 'geschk', 'employ', 'xerox', 'corpor', 'palo', 'alto', 'california', 'research']
  Example Soundex: ['A310', 'C515', 'F530', '1000', 'J500', 'W652', 'C640', 'G200', 'E514', 'X620', 'C616', 'P400', 'A430', 'C416', 'R262']

 Processing file: Amazon.txt

 Original text snippet: What is amazon?

Amazon.com, online retailer, manufacturer o ...
Tokens: ['what', 'is', 'amazon', '?', 'amazon.com', ',', 'online', 'retailer', ',', 'manufacturer', 'of', 'electronic', 'book', 'readers', ',']