In [12]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
# import nltk
# nltk.download('wordnet')

# 获取单词的词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatizing(data_samples_raw):
    lemmatizer = WordNetLemmatizer()
    data_samples_cleaned=[]
    for ind_doc in tqdm(range(0,len(data_samples_raw))):  # 使用 tqdm 显示进度百分比
        sentence=data_samples_raw[ind_doc].lower()
        tokens = word_tokenize(sentence)  # 分词
        tagged_sent = pos_tag(tokens)     # 获取单词词性
        lemmas_sent = []
        for tag in tagged_sent:
            wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
            lemmas_sent.append(lemmatizer.lemmatize(tag[0], pos=wordnet_pos)) # 词形还原
    #     data_samples_cleaned.append(lemmas_sent)
        data_samples_cleaned.append(' '.join(lemmas_sent))
    return(data_samples_cleaned)

In [9]:
import os
def find_data_directory(file_name):
    current_directory = os.getcwd()
    parent_directory = os.path.dirname(current_directory)

    data_directory = None  # 初始化data文件夹地址为None

    for file_or_dir in os.listdir(parent_directory):
        full_path = os.path.join(parent_directory, file_or_dir)  # 获取文件或文件夹的完整路径
        if os.path.isdir(full_path) and file_or_dir == file_name:
            data_directory = full_path  # 如果是文件夹并且名称为"data"，则将其地址赋值给data_directory变量
            break  # 找到了就退出循环

    if data_directory:
        print("找到了名为",file_name,"的文件夹，地址为：", data_directory)
        return data_directory
    else:
        print("找到了名为",file_name,"的文件夹！")
        return None
# print([find_data_directory('Data')+"\\IEEE\\Abstract"])

In [10]:
"""
=======================================================================================
Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation
=======================================================================================

This is an example of applying Non-negative Matrix Factorization
and Latent Dirichlet Allocation on a corpus of documents and
extract additive models of the topic structure of the corpus.
The output is a list of topics, each represented as a list of terms
(weights are not shown).

The default parameters (n_samples / n_features / n_topics) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is polynomial in NMF. In LDA, the time complexity is
proportional to (n_samples * iterations).
"""

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
# from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
# dataset = fetch_20newsgroups(shuffle=True, random_state=1,
#                              remove=('headers', 'footers', 'quotes'))
# data_samples = dataset.data
import os

data_samples_raw=[]
for dirpath, dirnames, filenames in os.walk(find_data_directory('Data')+"\\IEEE\\Abstract"):
#     print('Directory', dirpath)
    for filename in filenames:
#         print(' File', filename)
        try:
            dir=os.path.join(dirpath, filename)
            f = open(dir,'r',encoding='utf-8')    # 打开文件
#             data = f.readline()                   # 读取文件内容
#             print(' File', data)
            lines = f.readlines()  # 读取所有行
#             print(type(lines))
            data_samples_raw.append(lines[0])  # 取第4行为abstract
#             last_line = lines[-1]  # 取最后一行为keywords
            
        finally:
            if f:
                f.close()                     # 确保文件被关闭
# print(' File', data_samples)


print("done in %0.3fs." % (time() - t0))



Loading dataset...
找到了名为 Data 的文件夹，地址为： E:\CDNMF\Experiment\Data
done in 0.186s.


In [13]:
data_samples=lemmatizing(data_samples_raw)

100%|█████████████████████████████████████████████████████████████████████████████| 2058/2058 [00:17<00:00, 116.14it/s]


In [17]:
n_samples = len(data_samples)
print(n_samples)


2058


In [36]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
n_samples = len(data_samples)
n_features = 6000
n_topics = 18
n_top_words = 20
def load_stopwords( inpath = find_data_directory('text')+"\\stopwords_yy20220521.txt" ):
	"""
	Load stopwords from a file into a set.
	"""
	stopwords = set()
	with open(inpath, 'r', encoding='utf-8') as f:
		lines = f.readlines()
		for l in lines:
			l = l.strip()
			if len(l) > 0:
				stopwords.add(l)
	return stopwords

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words=load_stopwords(), ngram_range = (1,3))
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(init='nndsvda', n_components=n_topics, random_state=0, alpha_W=0,alpha_H=0, l1_ratio=1).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print_top_words(nmf, tfidf_feature_names, n_top_words)

找到了名为 text 的文件夹，地址为： E:\CDNMF\Experiment\text
Extracting tf-idf features for NMF...
done in 1.136s.
Fitting the NMF model with tf-idf features,n_samples=2058 and n_features=6000...
done in 0.959s.

Topics in NMF model:
Topic #0:
controller power uncertainty mode output input loop simulation plant subsystem track effectiveness logic grid actuator base operation feedback delay closed loop
Topic #1:
robot mobile mobile robot localization environment task map multirobot robotic formation leader move robot move communication human robot task robot cycle perform robot motion skin
Topic #2:
1998 1998 1999 1999 italic italic 1998 1999 italic 1998 1999 1998 1999 1998 1999 1998 1999 1998 article formulatype formulatype 1998 formulatype 1998 1999 observation para notion game notion 1998 1999 notion 1998 norm
Topic #3:
wafer tool cluster tool cluster schedule cycle chamber lot wafer delay delay armed processing wafer lot armed cluster armed cluster tool cyclic manufacturing backward variation wafe

In [37]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words=load_stopwords())
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf features for LDA...
done in 0.768s.
Fitting LDA models with tf features, n_samples=2058 and n_features=6000...
done in 2.535s.

Topics in LDA model:
Topic #0:
calibration camera image afm chiller measurement scan drift nanostructure vo imu instrument scene helical band odometry deflection neck monocular microswimmers
Topic #1:
leader agent game follower auction formation mechanism slip parking rotation price player locomotion homogeneous sand procurement vcg bipedal volume phase
Topic #2:
policy schedule disassembly price service cloud customer rbc eol job minimize inventory base electricity park provider formulate spike makespan charge
Topic #3:
phase volume micro successful cubic perturbation able highly meso delivery robotic cell perishable intervention supply transport automatic produce protein constructive
Topic #4:
grasp hand gait object grasping exoskeleton walk finger tolerance suture manipulation robotic limb rehabilitation hip assistance cage contact walking for

In [40]:
import os
def find_root_directory():
    current_directory = os.getcwd()
    root_directory = current_directory

    while os.path.dirname(root_directory) != root_directory:
        root_directory = os.path.dirname(root_directory)

    print("当前工作目录的根目录为：", root_directory)
    return root_directory
find_root_directory()

当前工作目录的根目录为： E:\


'E:\\'