In [None]:
%pip install transformers torch datasets evaluate rouge-score sentencepiece accelerate pandas matplotlib

In [None]:
from transformers import (
    pipeline,                          # 고수준 API - 가장 쉬운 방법
    AutoTokenizer,                     # 자동 토크나이저
    AutoModelForSeq2SeqLM,            # 자동 모델 로더 (Seq2Seq 작업용)
    T5TokenizerFast,                  # T5 전용 고속 토크나이저
    T5ForConditionalGeneration,       # T5 모델 클래스
    BartForConditionalGeneration,     # BART 모델 클래스
    PreTrainedTokenizerFast,          # 사전학습 토크나이저 기본 클래스
    DataCollatorForSeq2Seq,           # Seq2Seq 학습용 데이터 콜레이터
    Seq2SeqTrainingArguments,         # Seq2Seq 학습 하이퍼파라미터
    Seq2SeqTrainer,                   # Seq2Seq 전용 트레이너
)

In [None]:
# Suppress warning messages: every Python warning is ignored from this point on,
# which keeps the notebook output readable.
import warnings
warnings.filterwarnings('ignore')
import os
import torch
import numpy as np
from datetime import datetime

In [None]:
# 데이터셋 라이브러리
from datasets import load_dataset

In [None]:
# Version check: report the installed PyTorch and transformers versions.
torch_version = torch.__version__
# transformers is looked up by module name because only individual names
# (not the package itself) are imported in this notebook.
transformers_version = __import__('transformers').__version__
print(f'pytorch 버전 : {torch_version}')
print(f"transformer 버전 :{transformers_version}")

In [None]:
# Simple document summarization using the high-level `pipeline` API.
#
# Steps the pipeline performs:
#   1. model download and loading
#   2. tokenization (text -> numbers)
#   3. model inference (summary generation)
#   4. decoding (numbers -> text)
#
# Pros: runs with very little code (2-3 lines)
# Cons: fine-grained control is difficult
#
# When to use it:
#   - rapid prototyping
#   - simple demos
#   - performance testing

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
summarizer_device = 0 if torch.cuda.is_available() else -1

summarizer = pipeline(
    task='summarization',             # task type
    model="facebook/bart-large-cnn",  # summarization model developed by Facebook (generative)
    device=summarizer_device,
)
# Sample English news article used as the input text for the summarization
# examples in this notebook. The literal starts with a space and spans several
# lines; later cells measure len(ARTICLE) on this exact text.
ARTICLE = """ The drama, emotion and shock of Congress’s sudden demand for transparency on the Jeffrey Epstein drama has ceded to uncertainty and suspicion that President Donald Trump will try to wriggle off the political hook.
The victims of Epstein, an accused sex trafficker, have tragically seen justice postponed before. They’d been quick to warn after an unprecedented Republican revolt against Trump on Tuesday that it was just one more step in their long quest for vindication. Their wisdom was obvious once the euphoria of their triumph began to clear.
Washington waited all day Wednesday to find out that Trump had signed the bill passed with a 427-1 majority in the House and waved through on unanimous consent by the Senate, which had no desire to be drawn into the president’s self-inflicted political crisis. The measure requires the Justice Department to release all investigative files, documents and other materials about Epstein, with some exceptions, within 30 days of it becoming law. It’s hardly surprising that Trump didn’t rush to sign it, since, despite his late embrace of the legislation, it repudiated his weekslong bid to keep the evidence secret.
Trump was in a box. Any presidential veto was likely to be overturned, judging by the congressional majorities for the bill. But now his Justice Department — in theory at least — is bound to implement it. Any foot-dragging or obstruction would certainly worsen already disastrous public disapproval of his handling of the issue and would only fuel the question that got him into his current mess: What is he trying to hide?
"""
# Generation settings passed to the summarization pipeline.
summary_options = {
    'max_length': 130,   # maximum summary length
    'min_length': 30,    # minimum summary length
    'do_sample': False,  # no sampling -> same result on every run (reproducibility)
}
summary_result = summarizer(ARTICLE, **summary_options)

# The pipeline returns a list with one dict per input; show the generated text.
print(summary_result[0]['summary_text'])


In [None]:
# Size statistics comparing the original article with the pipeline summary.
summary_text = summary_result[0]['summary_text']

# Character and whitespace-separated word counts, computed once and reused.
article_chars, summary_chars = len(ARTICLE), len(summary_text)
article_words, summary_words = len(ARTICLE.split()), len(summary_text.split())

# Summary length as a percentage of the article length (in characters).
compression_ratio = summary_chars / article_chars * 100
# Number of words dropped by summarizing.
word_reduced = article_words - summary_words

print(f'원문 길이 : {article_chars:4d} 문자 ({article_words:3d} 단어)')
print(f'요약문 길이 : {summary_chars:4d} 문자 ({summary_words:3d} 단어)')
print(f'압축률 : {compression_ratio:.1f}%')
print(f'단어감소 : {word_reduced} 단어 절감')


In [None]:
# Document summarization with a T5 model loaded through the Auto* classes (Google, 2019).
# T5 unifies every NLP task into a text -> text format:
#   summarization  : "summarize: [source] -> [summary]"
#   translation    : "translate English to French: [source] -> [translation]"
#   classification : "sentiment [source] -> [class]"   e.g. i love this! -> positive
#
# Drawback: the task prefix is mandatory (quality drops sharply without it).
# The prefix is a short phrase, as in the examples above, that tells the model
# what kind of problem it is and is prepended to the input.
MODEL_NAME = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Move the model to the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Preprocessing
#   1. strip leading/trailing whitespace
#   2. replace line breaks with spaces (the model does not handle line breaks well)
#   3. prepend the task prefix  **required**
preprocess_text = ARTICLE.strip().replace('\n', " ")
input_text = f"summarize: {preprocess_text}"

# Tokenization: encode to a PyTorch tensor, truncating at max_length=512,
# and place it on the same device as the model.
tokenized_text = tokenizer.encode(
    input_text, return_tensors='pt', truncation=True, max_length=512
).to(device)

print(f'토큰수 : {tokenized_text.shape[1]}')
print(f'텐서형태 : {tokenized_text.shape} (배치크기=1, 시퀀스길이={tokenized_text.shape[1]})')
print(f'첫 10개토큰 : {tokenized_text[0][:10].tolist()}')

# Decoding (numbers -> text): turn a short prefix of the ids back into text as
# a sanity check. The slice and the printed label share one variable so they
# cannot drift apart (the label previously said 210 while 20 ids were decoded).
preview_token_count = 20
decoded_sample = tokenizer.decode(
    tokenized_text[0][:preview_token_count], skip_special_tokens=False
)
print(f'첫 {preview_token_count}개토큰 디코딩 : {decoded_sample}')

In [None]:
# Summary generation using beam search: several candidate sequences are
# explored at the same time to find the best summary.
# Settings: beam=4, min_length=30, max_length=100
generation_options = dict(
    num_beams=4,
    no_repeat_ngram_size=3,   # prevent any 3-gram from being repeated
    min_length=30,
    max_length=100,
    early_stopping=True,      # stop the beam search once num_beams finished candidates exist
)
summary_ids = model.generate(tokenized_text, **generation_options)

# Generation finished: decode the first returned sequence, dropping special tokens.
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Report character-level statistics for the generated summary.
article_length = len(ARTICLE)
output_length = len(output)
print('요약 통계')
print(f'원문의 길이 : {article_length} 문자')
print(f'요약문의 길이 : {output_length} 문자')
print(f'압축률 : {output_length / article_length*100:.1f}%')
print(f'생성 토큰수 : {summary_ids.shape[1]} 개')
print(f'요약문 : {output}')

In [None]:
# T5 model
# Task prefix
#