#### BART 모델 연습 (교재)<hr>
p.424

In [1]:
import numpy as np
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# datasets 모듈의 뉴스 요약 데이터셋 사용

news= load_dataset("argilla/news-summary", split='test')
# 불러온 데이터를 text, prediction 열만 5000개 랜덤 추출
df= news.to_pandas().sample(5000, random_state=42)[['text', 'prediction']]
# ??  => 점수 부분 빼고 예측 요약만 추출
df['prediction']=df['prediction'].map(lambda x: x[0]['text'])
# 데이터 분리
train, valid, test= np.split(
    df.sample(frac=1, random_state=42), [int(0.6*len(df)), int(0.8*len(df))]
)

len(train), len(test), len(valid)

(3000, 1000, 1000)

In [3]:
news['prediction'][:3]

[[{'score': 1.0,
   'text': "Trump ends 'Dreamer' immigration program, places onus on Congress"}],
 [{'score': 1.0,
   'text': 'Russian tycoon, fresh from jail, announces presidential bid'}],
 [{'score': 1.0,
   'text': 'U.S. not started assessment on any Trump intel disclosure to Russia: Coats'}]]

In [4]:
import torch
from transformers import BartTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence


## BART 토큰나이저를 통해 입력 텐서 생성 (전처리)
def make_dataset(data, tokenizer, device):
    # 토큰화
    tokenized= tokenizer(text= data.text.tolist(),
                         padding='longest',
                         truncation=True,
                         return_tensors='pt')
    labels=[]
    input_ids= tokenized['input_ids'].to(device)
    attention_mask= tokenized['attention_mask'].to(device)
    for target in data.prediction:
        labels.append(tokenizer.encode(target, return_tensors='pt').squeeze())
    labels= pad_sequence(labels, batch_first=True, padding_value=-100).to(device)
    return TensorDataset(input_ids, attention_mask, labels)
# 입력한 데이터 셋을 샘플링하여 데이터로더로 만들기기
def get_dataloader(dataset, sampler, batch_size):
    data_sampler= sampler(dataset)
    dataloader= DataLoader(dataset, sampler=data_sampler, batch_size=batch_size)
    return dataloader
# 파라미터 설정
epochs= 3
batch_size= 8
device= 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer= BartTokenizer.from_pretrained(pretrained_model_name_or_path="facebook/bart-base")
# 데이터 셋 및 데이터로더 생성
trainDS= make_dataset(train, tokenizer, device)
trainDL= get_dataloader(trainDS, RandomSampler, batch_size)

validDS= make_dataset(valid, tokenizer, device)
validDL= get_dataloader(validDS, SequentialSampler, batch_size)

testDS= make_dataset(valid, tokenizer, device)
testDL= get_dataloader(validDS, SequentialSampler, batch_size)

print(trainDS[0])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(tensor([   0,  495, 1889,  ...,    1,    1,    1]), tensor([1, 1, 1,  ..., 0, 0, 0]), tensor([    0, 35891,   161,    56,  5616, 10405,    19,   140,    23,  5490,
         3564,     2,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100]))


In [5]:
from torch import optim
# 질의응답과 같은 조건부 생성 작업에 특화된 모델
from transformers import BartForConditionalGeneration

# 모델 생성
model= BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path='facebook/bart-base').to(device)
optimizer= optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)



In [6]:
model.parameters
## BART 모델 구조
# 임베딩 -> 인코더(6개층) -> 디코더(6개층) -> layernorm_embeding

<bound method Module.parameters of BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=7

In [7]:
import evaluate   #허깅페이스의 생성형 모델 평가 기법

In [8]:
def calc_rouge(preds, labels):
    preds = preds.argmax(axis=-1)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge2 = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    return rouge2["rouge2"]

def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        val_loss, val_rouge = 0.0, 0.0

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            logits = outputs.logits
            loss = outputs.loss

            logits = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            rouge = calc_rouge(logits, label_ids)
            
            val_loss += loss
            val_rouge += rouge

    val_loss = val_loss / len(dataloader)
    val_rouge = val_rouge / len(dataloader)
    return val_loss, val_rouge



In [9]:
# 학습
rouge_score = evaluate.load("rouge", tokenizer=tokenizer)
best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, trainDL)
    val_loss, val_accuracy = evaluation(model, validDL)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val Rouge {val_accuracy:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "../models/BartForConditionalGeneration.pt")
        print("Saved the model weights")

IndexError: index out of range in self

In [None]:
# 모델 평가
model= BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path='facebook/bart-base'
).to(device)
model.load_state_dict()