In [1]:
!pip install networkx scikit-learn



In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx  # (pip install networkx)
import numpy as np


In [3]:
# Fetch the NLTK data the notebook relies on: the Punkt tokenizer models
# ('punkt' / 'punkt_tab') used by sent_tokenize and word_tokenize, and the
# stop-word corpus read by stopwords.words().
# The final call is the cell's last expression, so its return value is what
# the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# English stop words, built once and stored as a set so the per-token
# membership test in the preprocessing helper stays cheap.
stop_words = set(stopwords.words("english"))

In [5]:
# --- 2. Text preprocessing helper (reused from 2-2) ---
def preprocess_for_vectorizing(text):
  """Normalize raw text into a space-separated token string for TF-IDF.

  The text is lowercased and split with ``word_tokenize``. A token is kept
  only if it is purely alphabetic and is not in the module-level
  ``stop_words`` set.

  Args:
    text: Raw input string.

  Returns:
    The kept tokens joined by single spaces, or an empty string when no
    token survives the filter.
  """
  kept_tokens = []
  for token in word_tokenize(text.lower()):
    # Skip punctuation, numbers, mixed tokens, and stop words.
    if not token.isalpha() or token in stop_words:
      continue
    kept_tokens.append(token)

  # The vectorizer expects one whitespace-joined string per document,
  # not a list of tokens.
  return " ".join(kept_tokens)

In [6]:
# --- 3. Text summarizer function (TextRank) ---

def summarize_text_textrank(text, top_n=3):
  """Summarize ``text`` extractively with TextRank (PageRank over sentences).

  Each sentence is preprocessed and turned into a TF-IDF vector. The pairwise
  cosine-similarity matrix of those vectors is used as a weighted graph, and
  PageRank scores on that graph select the ``top_n`` sentences. The selected
  sentences are emitted in their original document order.

  Args:
    text: The document to summarize.
    top_n: Number of sentences to keep (default 3).

  Returns:
    Checked in this order:
      * ``text`` unchanged when it contains at most ``top_n`` sentences;
      * an empty string when ``top_n`` is zero or negative;
      * an empty string when TF-IDF fitting raises ``ValueError`` (e.g. every
        sentence consists only of stop words);
      * otherwise the selected original sentences joined by single spaces.
  """
  # 1. Split the text into sentences; the originals are kept verbatim so the
  #    summary is built from unmodified sentences.
  original_sentences = sent_tokenize(text)

  if len(original_sentences) <= top_n:
    return text  # Nothing to summarize.

  # A non-positive top_n selects nothing. Without this guard a negative value
  # would reach the slice below, where [:-k] means "all but the last k" and
  # would silently return most of the document.
  if top_n <= 0:
    return ""

  # 2. Preprocess each sentence for TF-IDF vectorization.
  processed_sentences = [preprocess_for_vectorizing(s) for s in original_sentences]

  # 3. TF-IDF vectorization (used instead of the GloVe approach from 2-2
  #    because it is simpler).
  vectorizer = TfidfVectorizer()
  try:
    tfidf_matrix = vectorizer.fit_transform(processed_sentences)
  except ValueError:
    # No vocabulary could be built, e.g. every sentence was only stop words.
    return ""

  # 4. Pairwise cosine similarity between sentences: an N x N matrix
  #    (N = number of sentences) that serves as the sentence graph.
  # NOTE(review): the diagonal (self-similarity) is left in place, so
  # sentences with a non-empty vector presumably get a self-loop in the
  # graph; many TextRank variants zero it. Kept as-is to preserve rankings.
  similarity_matrix = cosine_similarity(tfidf_matrix)

  # 5. Convert the similarity matrix into a NetworkX graph.
  G = nx.from_numpy_array(similarity_matrix)

  # 6. PageRank gives each sentence an importance score; the role of
  #    "heavily linked page" is played by "sentence similar to many others".
  scores = nx.pagerank(G)  # {0: 0.05, 1: 0.12, ...}

  # 7. Rank sentence indices by descending score. Ties are broken by
  #    position (earlier sentence first) rather than by sentence text.
  ranked_indices = sorted(
    range(len(original_sentences)),
    key=lambda i: (-scores[i], i),
  )

  # 8. Put the winners back into document order; listing them by score
  #    would wreck the flow of the summary.
  top_indices = sorted(ranked_indices[:top_n])

  # 9. Build and return the final summary.
  return " ".join(original_sentences[i] for i in top_indices)

In [7]:


# --- 4. "Application" test ---

# A longer sample passage used to exercise the summarizer.
example_long_text = """
Natural Language Processing (NLP) is a dynamic subfield of artificial intelligence (AI)
that focuses on the interaction between computers and human language.
The ultimate goal of NLP is to enable computers to process, understand, and generate
human language in a way that is both meaningful and valuable.
NLP has many real-world applications, including machine translation,
sentiment analysis, and virtual assistants like Siri and Alexa.
Core tasks in NLP involve tokenization, parsing, and stop-word removal,
which are foundational steps. More advanced techniques rely on machine learning
and deep learning models. For example, Recurrent Neural Networks (RNNs) were
popular for sequence tasks, but Transformers have now become the industry standard.
These models are incredibly large but provide state-of-the-art results.
The field continues to evolve rapidly, driving innovation across many sectors.
"""

# Show the original text under a header, followed by a 30-dash rule.
print("--- 원본 텍스트 ---", example_long_text, "-" * 30, sep="\n")

--- 원본 텍스트 ---

Natural Language Processing (NLP) is a dynamic subfield of artificial intelligence (AI) 
that focuses on the interaction between computers and human language. 
The ultimate goal of NLP is to enable computers to process, understand, and generate 
human language in a way that is both meaningful and valuable. 
NLP has many real-world applications, including machine translation, 
sentiment analysis, and virtual assistants like Siri and Alexa. 
Core tasks in NLP involve tokenization, parsing, and stop-word removal, 
which are foundational steps. More advanced techniques rely on machine learning 
and deep learning models. For example, Recurrent Neural Networks (RNNs) were 
popular for sequence tasks, but Transformers have now become the industry standard. 
These models are incredibly large but provide state-of-the-art results. 
The field continues to evolve rapidly, driving innovation across many sectors.

------------------------------


In [8]:

# Ask for a two-sentence summary of the example passage.
summary = summarize_text_textrank(text=example_long_text, top_n=2)

# Print a header line, then the summary itself.
print("--- 2문장 요약 결과 ---")
print(summary)

--- 2문장 요약 결과 ---
The ultimate goal of NLP is to enable computers to process, understand, and generate 
human language in a way that is both meaningful and valuable. NLP has many real-world applications, including machine translation, 
sentiment analysis, and virtual assistants like Siri and Alexa.
