In [52]:
import pandas as pd
import numpy as np
import re
from bertopic import BERTopic
import spacy
from sentence_transformers import SentenceTransformer
import altair as alt
from ugtm import eGTM
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
import nltk
from itertools import product
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.preprocessing import normalize
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel


[nltk_data] Downloading package punkt to /Users/woosu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/woosu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/woosu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/woosu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# NLTK setup: download the resources used by the tokenizer, lemmatizer,
# stop-word list and POS tagger further down.
for nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

In [53]:

# Load the data from CSV
csv_file_path = "/Users/woosu/Desktop/project/port_patent_data.csv"
df = pd.read_csv(csv_file_path)
# Replace missing values in the 'sum' column with empty strings
df = df.assign(**{'sum': df['sum'].fillna('')})
# Text column passed on to preprocessing, coerced to str
text_data = df['sum'].astype(str)


In [54]:
# Stop-word list: NLTK's English stop words plus a few extra terms,
# de-duplicated through a set and stored as a list.
stop_words = list(set(stopwords.words('english') + ["first", "may", "one", "second"]))

# Lemmatizer instance
lemmatizer = WordNetLemmatizer()


In [55]:
def wordnet_pos_tags(treebank_tag):
    """Map a Treebank-style POS tag to a WordNet POS letter.

    Tags starting with J, V, N or R map to 'a', 'v', 'n' and 'r'
    respectively; every other tag falls back to 'n'.
    """
    prefix_to_wordnet = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    for prefix, wordnet_tag in prefix_to_wordnet:
        if treebank_tag.startswith(prefix):
            return wordnet_tag
    return 'n'  # default to noun


In [56]:
def preprocess_for_bertopic(documents):
    """Turn raw documents into lemmatized, stop-word-filtered strings.

    Each document is lower-cased, has whitespace runs collapsed to a single
    space, is tokenized with ``word_tokenize``, filtered down to alphabetic
    tokens that are not in the module-level ``stop_words``, POS-tagged, and
    lemmatized with ``WordNetLemmatizer`` using the tag mapped through
    ``wordnet_pos_tags``.

    Args:
        documents: Iterable of strings (each item must support ``.lower()``).

    Returns:
        list: One space-joined string of lemmas per input document, in
        input order.
    """
    lemmatizer = WordNetLemmatizer()
    # stop_words is stored as a list at module level; build a set once so the
    # per-token membership test below is a hash lookup, not a list scan.
    stop_word_set = set(stop_words)

    preprocessed_docs = []
    for document in documents:
        # Lower-case and collapse whitespace. Non-alphabetic tokens are
        # dropped later by the isalpha() filter, not here.
        document = re.sub(r'\s+', ' ', document.lower()).strip()

        # Tokenize, keeping only alphabetic tokens that are not stop words
        tokens = [
            token
            for token in word_tokenize(document)
            if token.isalpha() and token not in stop_word_set
        ]

        # POS-tag, then lemmatize each token with its mapped WordNet POS.
        # NOTE(review): the tagger receives the already-filtered token list,
        # not the full sentence; this may affect tag quality - confirm
        # whether tagging before filtering is preferable.
        pos_tags = pos_tag(tokens)
        lemmatized_tokens = [
            lemmatizer.lemmatize(token, wordnet_pos_tags(tag))
            for token, tag in pos_tags
        ]

        # Store the preprocessed document as a single string
        preprocessed_docs.append(" ".join(lemmatized_tokens))
    return preprocessed_docs


In [57]:
# Preprocess the data: run every entry of text_data through
# preprocess_for_bertopic and keep the resulting list of strings.
preprocessed_docs = preprocess_for_bertopic(text_data)

In [104]:
# Fixed seed for UMAP. Without it the low-dimensional layout, and therefore
# the clusters and topics derived from it, change from run to run.
RANDOM_SEED = 42

# Custom embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Custom UMAP model
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.01,
    spread=1.0,
    random_state=RANDOM_SEED  # makes the dimensionality reduction reproducible
)
# Custom HDBSCAN model
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    prediction_data=True,
    min_samples=5,
    alpha=1.0
)
# CountVectorizer configuration
vectorizer_model = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 1)
)

# Initialize the BERTopic model (training happens in a later cell)
# NOTE(review): a custom hdbscan_model is passed, so min_topic_size presumably
# has no effect and min_cluster_size above governs cluster size - confirm
# against the BERTopic documentation.
topic_model = BERTopic(
    language="english",  # language setting
    calculate_probabilities=True,  # whether to compute probabilities
    nr_topics=10,  # limit on the number of topics
    top_n_words=10,  # number of top words per topic
    min_topic_size=5,  # minimum topic size
    vectorizer_model=vectorizer_model,  # vectorizer model
    embedding_model=embedding_model,  # embedding model
    umap_model=umap_model,  # UMAP model
    hdbscan_model=hdbscan_model,  # HDBSCAN model
    ctfidf_model=None,  # c-TF-IDF model
    verbose=True  # whether to print progress
)


In [105]:

# Train the BERTopic model on the preprocessed documents; the two values
# returned by fit_transform are bound to topics and probabilities.
topics, probabilities = topic_model.fit_transform(preprocessed_docs)


2024-05-30 11:10:32,873 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/48 [00:00<?, ?it/s]

2024-05-30 11:10:35,410 - BERTopic - Embedding - Completed ✓
2024-05-30 11:10:35,410 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-30 11:10:37,376 - BERTopic - Dimensionality - Completed ✓
2024-05-30 11:10:37,377 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-30 11:10:37,546 - BERTopic - Cluster - Completed ✓
2024-05-30 11:10:37,546 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-30 11:10:37,641 - BERTopic - Representation - Completed ✓
2024-05-30 11:10:37,641 - BERTopic - Topic reduction - Reducing number of topics
2024-05-30 11:10:37,718 - BERTopic - Topic reduction - Reduced number of topics from 71 to 10


In [106]:
# Show the topic information table. The frame is left as the cell's last
# expression so the notebook renders it as a table; print() wrapped and
# truncated its columns.
topic_info = topic_model.get_topic_info()
topic_info

   Topic  Count                                     Name  \
0     -1    577          -1_system_data_container_device   
1      0    350             0_network_device_data_system   
2      1    248             1_item_vehicle_method_system   
3      2    195         2_container_cargo_include_system   
4      3     90                3_vessel_marine_ship_sail   
5      4     25           4_image_container_include_form   
6      5     16  5_radiation_source_detector_quasistatic   
7      6     15        6_carrier_aspect_compute_waveform   
8      7      6                 7_shaft_actuate_lock_fit   
9      8      5             8_light_optical_object_lidar   

                                      Representation  \
0  [system, data, container, device, include, met...   
1  [network, device, data, system, signal, includ...   
2  [item, vehicle, method, system, include, deliv...   
3  [container, cargo, include, system, lock, sens...   
4  [vessel, marine, ship, sail, system, position,...   
5  

In [107]:
# Print one line per topic id (ascending) listing that topic's words.
topic_ids = sorted(topic_model.get_topics())
for topic_num in topic_ids:
    # Each entry returned by get_topic is unpacked as (word, score); keep the word.
    words = [entry[0] for entry in topic_model.get_topic(topic_num)]
    print(f"Topic {topic_num}: {' '.join(words)}")


Topic -1: system data container device include method information item use provide
Topic 0: network device data system signal include information method wireless communication
Topic 1: item vehicle method system include delivery data order storage provide
Topic 2: container cargo include system lock sensor control device position door
Topic 3: vessel marine ship sail system position image control include least
Topic 4: image container include form layer develop toner unit print map
Topic 5: radiation source detector quasistatic detect field neutron container ionize radioactive
Topic 6: carrier aspect compute waveform srgb ue transmission slice prefix synchronization
Topic 7: shaft actuate lock fit rotate corner strap clamp pawl unit
Topic 8: light optical object lidar head pulse scatter configure portion include


In [266]:
# Topic assigned to each document: pair the text in text_data with the
# topic ids in topics. The frame is left as the cell's last expression so
# the notebook renders it as a table instead of plain print() text.
doc_topics = pd.DataFrame({"Document": text_data, "Topic": topics})
doc_topics

                                               Document  Topic
0     The present specification discloses systems an...      6
1     An inspection system based upon an imaging enc...     10
2     A worldwide logistics network includes a proce...      0
3     Mobile collection and vetting of user supplied...      0
4     Systems and methods can secure freight contain...      5
...                                                 ...    ...
1522  The present disclosure is directed to systems ...     -1
1523  The present disclosure relates generally to me...     -1
1524  A system for monitoring objects and individual...      1
1525  Provided are an active radio frequency identif...      7
1526  A method used in the acquisition of a voice si...     -1

[1527 rows x 2 columns]
