In [2]:
import nltk
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Download the NLTK data this script relies on.
# 'punkt' and 'punkt_tab' both back word_tokenize: recent NLTK releases
# (3.9+) look up the 'punkt_tab' resource instead of the older pickled
# 'punkt' models, so both are fetched to work across versions.
# 'stopwords' provides the English stop-word list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Tokenize *text* and return its cleaned, lowercased tokens.

    Tokens are produced by NLTK's ``word_tokenize``. A token is kept only
    when it consists entirely of alphabetic characters and its lowercase
    form is not in NLTK's English stop-word list.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercase tokens, in their original order (duplicates
        are preserved).
    """
    tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        # Skip anything containing a non-letter character.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in english_stopwords:
            continue
        kept.append(lowered)
    return kept

def compute_tf(word_counts):
    """Return the term frequency of every word in *word_counts*.

    Args:
        word_counts: Mapping of word -> number of occurrences
            (for example a ``collections.Counter``).

    Returns:
        A dict mapping each word to its count divided by the sum of all
        counts. An empty mapping yields an empty dict.
    """
    total = sum(word_counts.values())
    return {term: occurrences / total for term, occurrences in word_counts.items()}

def compute_idf(corpus, words):
    """Return a smoothed inverse document frequency for each word.

    The value for a word is ``log(N / (1 + df)) + 1`` where ``N`` is the
    number of documents in *corpus* and ``df`` is the number of documents
    that contain the word. The ``1 +`` in the denominator keeps the
    division defined for words that appear in no document.

    Args:
        corpus: Sequence of documents. Each document is normally an
            iterable of tokens; a ``str`` document is matched by
            substring, exactly as the ``in`` operator does.
        words: Iterable of words to score.

    Returns:
        A dict mapping each word to its IDF value. Empty *words* yields
        an empty dict.

    Raises:
        ValueError: If *corpus* is empty while *words* is not, because
            IDF is undefined without documents.
    """
    num_documents = len(corpus)
    vocabulary = list(words)
    if not vocabulary:
        return {}
    if num_documents == 0:
        # log(0) would otherwise fail with an opaque "math domain error".
        raise ValueError("corpus must contain at least one document to compute IDF")

    # Build one membership structure per document up front so each lookup
    # is O(1) instead of rescanning a token list for every word.
    lookups = []
    for doc in corpus:
        if isinstance(doc, (str, set, frozenset, dict)):
            # Already efficient, or (for str) substring semantics must be kept.
            lookups.append(doc)
            continue
        try:
            lookups.append(set(doc))
        except TypeError:
            # Unhashable tokens: fall back to the document as given.
            lookups.append(doc)

    idf_values = {}
    for word in vocabulary:
        # Number of documents that contain this word.
        doc_count = sum(1 for lookup in lookups if word in lookup)
        idf_values[word] = math.log(num_documents / (1 + doc_count)) + 1
    return idf_values

def compute_tfidf(tf_values, idf_values):
    """Combine term frequencies and IDF values into TF-IDF scores.

    Args:
        tf_values: Mapping of word -> term frequency.
        idf_values: Mapping of word -> inverse document frequency. It must
            contain every word present in *tf_values*.

    Returns:
        A dict mapping each word in *tf_values* to ``tf * idf``.

    Raises:
        KeyError: If a word in *tf_values* is missing from *idf_values*.
    """
    return {term: weight * idf_values[term] for term, weight in tf_values.items()}

def extract_top_keywords(text, top_n=10, reference_texts=None):
    """Return the *top_n* words of *text* ranked by TF-IDF score.

    Args:
        text: The document to extract keywords from.
        top_n: Maximum number of (word, score) pairs to return.
        reference_texts: Optional iterable of additional raw texts used as
            background documents when computing IDF. When omitted, the
            corpus holds only *text* itself; every word then receives the
            same IDF, so the ranking is equivalent to ranking by term
            frequency.

    Returns:
        A list of ``(word, score)`` tuples sorted by descending score.
        Words with equal scores keep their order of first appearance.
    """
    # Preprocess the text and count occurrences of each remaining word.
    words = preprocess_text(text)
    word_counts = Counter(words)

    # The corpus always contains the target document; background documents
    # are appended so that IDF can actually discriminate between words.
    corpus = [words]
    if reference_texts is not None:
        corpus.extend(preprocess_text(other) for other in reference_texts)

    tf_values = compute_tf(word_counts)
    idf_values = compute_idf(corpus, word_counts.keys())
    tfidf_values = compute_tfidf(tf_values, idf_values)

    # Highest scores first, then keep the leading top_n entries.
    ranked = sorted(tfidf_values.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Test example: a sample review used to exercise the keyword extractor.
text = """
My recent high tea experience at The Fullerton Hotel Singapore was nothing short of exceptional. 
From the moment I stepped into the historic hotel, I was greeted with an ambiance of elegance and sophistication. 
The beautifully restored colonial architecture, combined with modern luxury, set the perfect tone for an indulgent afternoon. 
The high tea spread was a delightful assortment of both traditional and contemporary offerings. 
Each item was meticulously crafted and presented, showcasing the culinary expertise of the hotel's chefs. 
The savory selections, from delicate finger sandwiches to exquisite quiches, were bursting with flavor and freshness. 
The sweet treats were equally impressive, with an array of pastries, scones, and cakes that were as delicious as they were visually stunning. 
The freshly brewed teas, with a wide variety of blends to choose from, complemented the food perfectly. 
A special mention must be made of Joey, one of the service crew members, who truly elevated our experience. 
Joey's attention to detail, warmth, and professionalism were evident throughout our visit. 
He was attentive without being intrusive, ensuring that our teapots were always filled and that we had everything we needed. 
Joey's knowledge of the menu and his recommendations were spot-on, enhancing our enjoyment of the high tea session. 
Joey's exceptional service, combined with the luxurious setting and exquisite food, made this high tea experience at The Fullerton Hotel truly memorable. 
I highly recommend it to anyone looking to indulge in a sophisticated and delightful afternoon
"""

# Extract the keywords with the default top_n (10) and print each word with
# its score formatted to four decimal places. The printed header is Chinese
# for "The ten most important words by TF-IDF:".
top_keywords = extract_top_keywords(text)
print("TF-IDF 最重要的十个词：")
for keyword, score in top_keywords:
    print(f"{keyword}: {score:.4f}")

TF-IDF 最重要的十个词：
high: 0.0088
tea: 0.0088
hotel: 0.0088
joey: 0.0088
experience: 0.0066
fullerton: 0.0044
exceptional: 0.0044
combined: 0.0044
afternoon: 0.0044
delightful: 0.0044


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\heyh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\heyh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
