In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import time

In [2]:
def read_large_file(file_path):
    """Read a UTF-8 text file and return its lines as a list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line of the file. Line endings are kept,
        so every element except possibly the last ends with a newline.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # Iterating a text file yields it line by line; list() collects them all.
        lines = list(handle)
    return lines

def cluster_texts(file_path, num_clusters=10):
    """Cluster the lines of a text file using TF-IDF features and KMeans.

    Every non-blank line of the file is treated as one document. The
    documents are vectorized with ``TfidfVectorizer`` (English stop words
    removed) and grouped with ``KMeans`` using a fixed ``random_state`` so
    repeated runs on the same input give the same assignment.

    Args:
        file_path: Path of a UTF-8 text file with one document per line.
        num_clusters: Number of clusters to form.

    Returns:
        A dict with one key per cluster index in ``range(num_clusters)``;
        each value is the list of original lines (line endings kept)
        assigned to that cluster.

    Raises:
        ValueError: If the file contains fewer non-blank lines than
            ``num_clusters``.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Whitespace-only lines contain no terms: they would only contribute
    # term-less vectors to the clustering and blank entries to the result.
    texts = [line for line in read_large_file(file_path) if line.strip()]

    # Fail early with a clear message instead of an error from deep inside
    # scikit-learn when there is not enough data for the requested clusters.
    if len(texts) < num_clusters:
        raise ValueError(
            f"Cannot form {num_clusters} clusters from {len(texts)} "
            f"non-empty line(s) in {file_path!r}"
        )

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(texts)

    # KMeans clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10, max_iter=300)
    kmeans.fit(tfidf_matrix)

    # Group the documents by their assigned label; labels_ is aligned with
    # the order of `texts`.
    clustered_texts = {i: [] for i in range(num_clusters)}
    for label, text in zip(kmeans.labels_, texts):
        # Convert the NumPy integer label to a plain int for the dict lookup.
        clustered_texts[int(label)].append(text)

    elapsed_time = time.perf_counter() - start_time
    print(f"Text Clustering Completed in {elapsed_time:.2f} seconds")
    return clustered_texts

In [4]:
if __name__ == "__main__":
    # Corpus to cluster (one document per line) and the number of clusters.
    input_file = 'AI 도입효과2.txt'
    num_clusters = 10

    clustered_texts = cluster_texts(input_file, num_clusters)

    # Write the report: a header line per cluster followed by its members,
    # each stripped of surrounding whitespace and indented with a tab.
    with open('clustered_texts.txt', 'w', encoding='utf-8') as file:
        for cluster, texts in clustered_texts.items():
            file.write(f"Cluster {cluster}:\n")
            file.writelines(f"\t{text.strip()}\n" for text in texts)

    print("클러스터링 결과가 'clustered_texts.txt'에 저장되었습니다.")

Text Clustering Completed in 1.22 seconds
클러스터링 결과가 'clustered_texts.txt'에 저장되었습니다.
