Stemming

In [10]:


import nltk
from nltk.stem import PorterStemmer

# Tải bộ tokenizer cần thiết
nltk.download("punkt")

# Khởi tạo Porter Stemmer
ps = PorterStemmer()

# Danh sách từ để thử nghiệm
example_words = ["program", "programming", "programer", "programs", "programmed"]

# Thực hiện stemming
print("{0:20}{1:20}".format("--Word--", "--Stem--"))
for word in example_words:
    print("{0:20}{1:20}".format(word, ps.stem(word)))


--Word--            --Stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program             


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Stopping

In [11]:
import nltk
from nltk.corpus import stopwords

# Tải stopwords
nltk.download('stopwords')

# Danh sách stopwords tiếng Anh
stop_words = set(stopwords.words('english'))

# Hàm loại bỏ stopwords
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Văn bản thử nghiệm
text = "This is an example of query transformation and refinement in NLP."

# Loại bỏ stopwords
filtered_text = remove_stopwords(text)

print("Original Text:", text)
print("Without Stopwords:", filtered_text)


Original Text: This is an example of query transformation and refinement in NLP.
Without Stopwords: example query transformation refinement NLP.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
!pip install pyspellchecker --quiet


Spellchacker

In [13]:

from spellchecker import SpellChecker

# Khởi tạo SpellChecker
spell = SpellChecker()

# Hàm kiểm tra và sửa lỗi chính tả
def correct_spelling(text):
    words = text.split()
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return ' '.join(corrected_words)

# Văn bản thử nghiệm
text = "Ths is an exmple of query preprocesing and refinment"

# Kiểm tra lỗi chính tả
corrected_text = correct_spelling(text)

print("Original Text:", text)
print("Corrected Text:", corrected_text)


Original Text: Ths is an exmple of query preprocesing and refinment
Corrected Text: the is an example of query reprocessing and refinement


In [14]:
from spellchecker import SpellChecker

# Khởi tạo SpellChecker
spell = SpellChecker()

# Hàm đưa ra gợi ý sửa lỗi chính tả
def suggest_corrections(text):
    words = text.split()
    suggestions = {word: spell.candidates(word) for word in words if word not in spell}
    return suggestions

# Văn bản thử nghiệm với lỗi chính tả
text = "Ths is an exmple of query refinment and preprocesing"

# Lấy danh sách từ gợi ý
suggestions = suggest_corrections(text)

print("Original Text:", text)
print("Suggestions for misspelled words:")
for word, suggested_words in suggestions.items():
    print(f"- {word}: {suggested_words}")


Original Text: Ths is an exmple of query refinment and preprocesing
Suggestions for misspelled words:
- Ths: {'ohs', "t's", 'ts', "th's", 'th', 'thy', 'thus', 'the', 'hts', 'this', 'tho', 'thu'}
- exmple: {'example'}
- refinment: {'refinement'}
- preprocesing: {'reprocessing'}


query expansion


In [17]:
import nltk
from nltk.corpus import wordnet

# Tải dữ liệu WordNet
nltk.download('wordnet')
nltk.download('omw-1.4')

# Hàm mở rộng truy vấn bằng cách lấy từ đồng nghĩa
def expand_query(query):
    expanded_terms = set()
    words = query.split()

    for word in words:
        synonyms = wordnet.synsets(word)  # Lấy danh sách từ đồng nghĩa
        for syn in synonyms:
            for lemma in syn.lemmas():
                expanded_terms.add(lemma.name())  # Lấy tất cả các từ đồng nghĩa

    return ' '.join(expanded_terms)

# Câu truy vấn ban đầu
query = "buy a car"

# Mở rộng truy vấn
expanded_query = expand_query(query)

print("Original Query:", query)
print("Expanded Query:", expanded_query)


Original Query: buy a car
Expanded Query: A car ampere bargain motorcar grease_one's_palms deoxyadenosine_monophosphate railroad_car antiophthalmic_factor purchase angstrom steal gondola machine vitamin_A automobile bribe type_A corrupt a elevator_car cable_car angstrom_unit axerophthol group_A railcar amp auto buy adenine railway_car


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
