In [6]:
import time
from multiprocessing import Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
def read_large_file(file_path):
    """Read a text file and return its lines as a list.

    Args:
        file_path: Path of the file to open; its content is decoded as UTF-8.

    Returns:
        list[str]: One entry per line, in file order. Line endings are kept
        exactly as the file object yields them.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # Iterating a text file yields the same lines as readlines().
        lines = list(handle)
    return lines


In [8]:
def calculate_similarity(chunk, threshold=0.8):
    """Find text pairs in one chunk whose TF-IDF cosine similarity exceeds a threshold.

    Args:
        chunk: An ``(indices, texts)`` pair. ``texts`` is the sequence of
            documents to compare and ``indices[k]`` is the identifier that
            is reported for ``texts[k]``.
        threshold: A pair is kept only when its similarity is strictly
            greater than this value. Defaults to 0.8.

    Returns:
        list[tuple]: ``(indices[i], indices[j], similarity)`` for every pair
        with ``i < j`` whose similarity is above ``threshold``, ordered by
        ``i`` and then ``j``. Texts are only compared with other texts of
        the same chunk. An empty list is returned when the chunk holds
        fewer than two texts.
    """
    indices, texts = chunk
    num_texts = len(texts)

    # With fewer than two texts there is no pair to compare, so skip the
    # vectorizer instead of fitting it on a degenerate chunk (for example a
    # final chunk that holds a single blank line, which may fail to fit).
    if num_texts < 2:
        return []

    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    results = []
    # Only the upper triangle (i < j) is scanned, so each pair is reported
    # once and a text is never paired with itself.
    for i in range(num_texts - 1):
        for j in range(i + 1, num_texts):
            score = similarity_matrix[i, j]
            if score > threshold:
                results.append((indices[i], indices[j], score))
    return results

In [9]:
def process_in_chunks(file_path, chunk_size):
    """Read a file and split its lines into consecutive chunks.

    Args:
        file_path: Path passed to ``read_large_file``.
        chunk_size: Maximum number of lines per chunk; must be greater
            than zero.

    Returns:
        list[tuple[list[int], list[str]]]: One ``(indices, texts)`` pair per
        chunk, where ``indices`` are the zero-based positions of the lines
        in the file and ``texts`` are the lines themselves. The last chunk
        may be shorter than ``chunk_size``. An empty file gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not greater than zero.
    """
    # Reject invalid sizes before touching the file: a negative step would
    # otherwise silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    texts = read_large_file(file_path)
    num_texts = len(texts)

    chunks = []
    for start in range(0, num_texts, chunk_size):
        stop = min(start + chunk_size, num_texts)
        chunks.append((list(range(start, stop)), texts[start:stop]))
    return chunks

In [10]:
def test_chunk_size(file_path, chunk_sizes):
    """다양한 청크 크기에 대해 실행 시간을 측정."""
    results = []
    for chunk_size in chunk_sizes:
        print(f"Testing chunk size: {chunk_size}")
        start_time = time.time()

        # 청크 생성
        chunks = process_in_chunks(file_path, chunk_size)

        # 멀티프로세싱을 통한 유사도 계산
        with Pool() as pool:
            pool.map(calculate_similarity, chunks)

        elapsed_time = time.time() - start_time
        print(f"Chunk size {chunk_size}: {elapsed_time:.2f} seconds")
        results.append((chunk_size, elapsed_time))

    return results

In [None]:
def main(input_file='AI 도입효과2.txt', chunk_sizes=None):
    """Benchmark several chunk sizes, print the timings and plot them.

    Args:
        input_file: Path of the input text file.
        chunk_sizes: Chunk sizes to benchmark. When ``None``, the sizes
            100, 500, 1000, 2000, 5000 and 10000 are used.

    The timings come from ``test_chunk_size``. The plot is drawn only when
    there is at least one result and matplotlib can be imported; otherwise
    a message is printed (import failure) or plotting is skipped (no
    results).
    """
    if chunk_sizes is None:
        chunk_sizes = [100, 500, 1000, 2000, 5000, 10000]

    # Measure execution time for each chunk size
    results = test_chunk_size(input_file, chunk_sizes)

    # Print the results
    print("\nPerformance Results:")
    for chunk_size, elapsed_time in results:
        print(f"Chunk size {chunk_size}: {elapsed_time:.2f} seconds")

    # Nothing to plot; unpacking zip(*results) below would fail on an
    # empty result list.
    if not results:
        return

    # Visualize the results; matplotlib is treated as optional
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("Matplotlib is not installed. Skipping visualization.")
        return

    # Separate names so the chunk_sizes argument is not shadowed.
    sizes, times = zip(*results)
    _, ax = plt.subplots()
    ax.plot(sizes, times, marker='o')
    ax.set_xlabel('Chunk Size')
    ax.set_ylabel('Execution Time (seconds)')
    ax.set_title('Chunk Size vs Execution Time')
    ax.grid()
    plt.show()

# Entry point: run the benchmark only when this code executes as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Testing chunk size: 100
