In [None]:
# %pip install lda pandas nltk gensim numpy scikit-learn

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models, matutils
import nltk
import numpy as np

In [2]:
np.random.seed(42)  # seed NumPy's global RNG so the randomized LDA result stays fixed
nltk.download('stopwords')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource; without it the
# tokenizer raises LookupError even when the legacy 'punkt' package is installed.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load data.csv; header=0 takes the column names from the file's first row
data = pd.read_csv("data.csv", header=0)

In [15]:
# Preview the first rows of the loaded table
data.head()

Unnamed: 0,번호,전체 청구항
0,US9276168,1. A component comprising a substrate comprisi...
1,US9276048,"1. A method of detecting infrared (IR), compri..."
2,US9274264,"1. A light source module, comprising:a light g..."
3,US9273851,"1. A backlight module, which comprises:a diffu..."
4,US9260655,1. A composition comprising:(a) a quantum dot ...


In [5]:
# Build the corpus: pull the '전체 청구항' column out as a plain Python list,
# one entry per row of `data`.
corpus = data.loc[:, '전체 청구항'].to_list()

In [6]:
# Remove stop words, single-character tokens and non-alphabetic tokens.
# NOTE(review): the saved topic table is led by claim boilerplate such as 'wherein',
# 'claim' and 'said'; consider extending stop_words with domain-specific terms.
stop_words = set(stopwords.words('english'))


def _clean_text(text):
    """Return `text` lower-cased, tokenized and filtered, re-joined with single spaces.

    A token is kept only when it is longer than one character, purely
    alphabetic and absent from `stop_words`.

    Non-string input (for example the float NaN pandas stores for an empty
    CSV cell) yields an empty string instead of raising AttributeError on
    `.lower()`, so the resulting list keeps one entry per input document.
    """
    if not isinstance(text, str):
        return ''
    kept = [
        token
        for token in word_tokenize(text.lower())
        if len(token) > 1 and token.isalpha() and token not in stop_words
    ]
    return ' '.join(kept)


corpus = [_clean_text(sentence) for sentence in corpus]

In [7]:
# Build the document-term count matrix (one row per document), keeping at most 1000 features
vectorizer = CountVectorizer(max_features=1000)
dtm = vectorizer.fit_transform(corpus)

In [8]:
# Convert the DTM into a gensim corpus.
# Sparse2Corpus reads one document per column (documents_columns=True is its default),
# so the documents x terms matrix is handed over transposed.
corpus_gensim = matutils.Sparse2Corpus(dtm.T, documents_columns=True)

In [9]:
# LDA hyperparameters, gathered in one cell for tuning.
num_topics = 50  # number of topics
num_iterations = 500  # iteration count; forwarded to LdaModel's `iterations` argument
burnin = 100  # burn-in period; NOTE(review): not read by any cell visible in this notebook
alpha = 0.01  # alpha setting; forwarded to LdaModel's `alpha` argument
eta = 0.01  # eta setting; forwarded to LdaModel's `eta` argument

In [10]:
# Build the LDA model.
# random_state gives the model its own seeded RNG, so re-running just this cell
# reproduces the same topics instead of depending on how much of NumPy's global
# random stream has been consumed since it was seeded. 42 matches the seed used
# for np.random.seed earlier in the notebook.
# NOTE(review): in gensim, `iterations` caps the per-document inference loop, while the
# number of training sweeps over the corpus is `passes` (left at its default here) —
# confirm that a single pass is intended.
id2word = dict(enumerate(vectorizer.get_feature_names_out()))  # column index -> term
lda_model = models.LdaModel(
    corpus_gensim,
    num_topics=num_topics,
    id2word=id2word,
    iterations=num_iterations,
    alpha=alpha,
    eta=eta,
    random_state=42,
)

In [11]:
# Results: for every topic, list the whole vocabulary ordered by descending weight.
vocab_size = len(vectorizer.get_feature_names_out())  # same for every topic, computed once
topic_keywords = []

for topic_idx in range(num_topics):
    weighted_terms = lda_model.show_topic(topic_idx, topn=vocab_size)
    # Order the (term, weight) pairs from heaviest to lightest, then drop the weights.
    ranked_terms = sorted(weighted_terms, key=lambda pair: pair[1], reverse=True)
    topic_keywords.append([term for term, _weight in ranked_terms])

In [12]:
# Store the results as a DataFrame: one column per topic, one row per keyword rank.
topic_labels = [f'Topic {n}' for n in range(1, num_topics + 1)]
# Rows are labelled per topic first, so the transpose turns those labels into column names.
topic_word = pd.DataFrame(topic_keywords, index=topic_labels).T

In [14]:
# Preview the top-ranked keywords of each topic
topic_word.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 41,Topic 42,Topic 43,Topic 44,Topic 45,Topic 46,Topic 47,Topic 48,Topic 49,Topic 50
0,wherein,layer,light,group,particle,wherein,said,claim,light,layer,...,said,data,layer,claim,claim,second,claim,wherein,tunneling,metal
1,group,said,wherein,quantum,light,layer,light,wherein,first,light,...,semiconductor,processing,wherein,wherein,wherein,first,wherein,claim,method,claimed
2,claim,step,layer,compound,first,claim,claim,light,wherein,claim,...,quantum,first,claim,said,layer,material,said,method,producing,method
3,nanoparticle,semiconductor,claim,dots,second,first,wherein,first,second,quantum,...,layer,wherein,method,one,cell,light,method,light,vertical,claim
4,comprises,film,emitting,mixture,claim,second,method,system,claim,wherein,...,claim,second,electrode,light,solar,claim,layer,said,transistor,solution


In [13]:
# Save the result to a CSV file (index=False leaves the row index out)
topic_word.to_csv('Topic word.csv', index=False)