In [2]:
import json
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
import re

# 加载 BERT 模型
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_keywords(text, top_n=5):
    # 简单分句（可用更好分词器替换）
    words = re.findall(r'\b\w+\b', text)
    unique_words = list(set(words))

    # 生成每个词的向量
    embeddings = model.encode(unique_words)
    
    # 聚类
    if len(unique_words) < top_n:
        return unique_words
    kmeans = KMeans(n_clusters=top_n)
    kmeans.fit(embeddings)
    
    # 取每个簇的中心词
    cluster_centers = kmeans.cluster_centers_
    closest_words = []
    for center in cluster_centers:
        distances = np.linalg.norm(embeddings - center, axis=1)
        closest_index = np.argmin(distances)
        closest_words.append(unique_words[closest_index])
    
    return closest_words

# 加载你的 JSON 数据
with open("papers_standardized.json", "r") as f:
    data = json.load(f)

for paper in data:
    abstract = paper.get("abstract", "")
    keywords = extract_keywords(abstract)
    print(f"Paper ID: {paper['paper_id']}")
    print(f"Title: {paper['title']}")
    print(f"Keywords: {keywords}")
    print("-" * 40)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Paper ID: 9203077
Title: Finite W-algebras
Keywords: ['versions', '2', 'reducible', 'In', 'is']
----------------------------------------
Paper ID: 9203063
Title: The Spectrum of Sl(2, R)/U(1) Black Hole Conformal Field Theory
Keywords: ['described', 'form', 'of', 'a', 'vertex']
----------------------------------------
Paper ID: 9212146
Title: Charged black holes in effective string theory
Keywords: ['hole', 'show', 'of', 'action', 'a']
----------------------------------------
Paper ID: 9204042
Title: Asymptotic Behavior of 2-d Black Holes
Keywords: ['large', 'field', 'the', 'with', 'consider']
----------------------------------------
Paper ID: 9210037
Title: Generalized Drinfeld-Sokolov Reductions and KdV Type Hierarchies
Keywords: ['approach', 'in', 'that', 'generated', 'a']
----------------------------------------
Paper ID: 9210023
Title: Infinite Braided Tensor Products and 2-D quantum Gravity
Keywords: ['that', 'in', 'group', 'also', 'used']
----------------------------------------

KeyboardInterrupt: 

In [4]:
from keybert import KeyBERT
import json

# 准备摘要数据（模拟你有100条）
with open("papers_standardized.json", "r") as f:
    data = json.load(f)

# 只提取前一百条摘要
abstracts = [paper.get("abstract", "") for paper in data[:10]]

# 拼接所有摘要
all_text = " ".join(abstracts)

# 初始化 KeyBERT 模型
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

# 提取关键词
keywords = kw_model.extract_keywords(
    all_text,
    top_n=20,
    stop_words='english',
    use_maxsum=True,
    nr_candidates=50
)

# 只提取关键词词汇部分
keyword_list = [kw[0] for kw in keywords]
keyword_list


KeyboardInterrupt: 