In [None]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sample inputs for the tokenize -> ids -> decode round trip.
sentences = [
    'Hello, world!',
    'unbelievable performance!',
    'COVID-19 pandemic',
]

for sentence in sentences:
    # Run all three conversions first, then report them together.
    # 1) Tokenization: split the raw text into tokens.
    tokens = tokenizer.tokenize(sentence)
    # 2) ID conversion: map each token to its vocabulary id.
    ids = tokenizer.convert_tokens_to_ids(tokens)
    # 3) Inverse conversion: turn the ids back into a string.
    decoded_string = tokenizer.decode(ids)

    print(f'원문 : {sentence}')
    print(f'토큰 : {tokens}')
    print(f'ID : {ids}')
    print(f'역변환 : {decoded_string}\n')

In [None]:
# 2. Attention mask: real tokens are marked 1, padding is marked 0.
from transformers import BertTokenizer

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Two sentences of clearly different length, so the shorter one needs padding.
sentences = [
    'short sentence',
    'This is a much longer sentence with more words',
]

# Tokenize all sentences in a single call; padding=True pads every sentence
# to the length of the longest one in the batch.
encoded = tokenizer(
    sentences,
    padding=True,
    return_tensors='pt',
)

# Bare expression: in a notebook this displays the encoding, which is where
# the attention mask described above can be inspected.
encoded


In [None]:
# token_type_ids: when two sentences are fed in together, these ids tell the
# first sentence apart from the second.
from transformers import BertTokenizer

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sentence_A = "The weather is nice"
sentence_B = "Let's go for a walk"

# Encode the two sentences as one paired input.
encoded = tokenizer(sentence_A, sentence_B, padding=True, return_tensors="pt")

# Only one pair was encoded, so row 0 holds the entire sequence.
input_ids = encoded["input_ids"][0]
segment_ids = encoded["token_type_ids"][0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# Special tokens get their own label regardless of their segment id.
special_labels = {"[SEP]": "구분자", "[CLS]": "시작"}

for token, token_id, type_id in zip(tokens, input_ids, segment_ids):
    # Segment id 0 is reported as sentence A, anything else as sentence B.
    sentence_label = "문장 A" if type_id == 0 else "문장 B"
    segment = special_labels.get(token, sentence_label)
    print(f'{token:20s} {token_id.item():6d} {type_id.item():6d} ({segment})')

In [None]:
# [CLS] token pooling: BERT's first token, [CLS], serves as a summary of the
# whole input. For classification, only this token's output is taken and
# connected to a classifier.
import torch
from transformers import BertTokenizer, BertModel

# Load the tokenizer and the encoder from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
sentence = "BERT is amazing for NLP tasks!"

# Encode the sentence into model inputs.
inputs = tokenizer(sentence, return_tensors='pt')

# Run the encoder without tracking gradients (inference only).
with torch.no_grad():
    outputs = model(**inputs)

# Inspect the output shape. Read the sizes from the tensor itself instead of
# hard-coding them, so the report stays correct for other inputs/checkpoints.
last_hidden_state = outputs.last_hidden_state
batch_size, sequence_length, hidden_size = last_hidden_state.shape
print(f'입력문장 : {sentence}')
print(f'last_hidden_state 형태 : {last_hidden_state.shape}')
print(f'batch_size = {batch_size} sequence_length = {sequence_length} '
      f'  hidden_size = {hidden_size}')

# Extract the [CLS] token: position 0 along the sequence axis.
cls_embedding = last_hidden_state[:, 0, :]
print(f'cls_embedding 형태 : {cls_embedding.shape}')

# Classifier (2 classes). Its input width follows the embedding width rather
# than a literal 768. NOTE: this layer is freshly constructed and never
# trained here, so the prediction below only demonstrates the data flow.
classifier = torch.nn.Linear(cls_embedding.shape[-1], 2)
logits = classifier(cls_embedding)  # (batch, number of classes)
probs = torch.softmax(logits, dim=-1)
print(f'logits : {logits}')
print(f'probs : {probs}')

# Reduce over the class axis explicitly; argmax without `dim` would operate on
# the flattened tensor. Index 0 selects the single sentence in the batch.
predicted = torch.argmax(probs, dim=-1)
print(f'predicted class : {predicted[0].item()}')


In [None]:
# Fine-tuning (미세 조정 학습)
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
# Tiny sentiment corpus; each entry pairs a review with its label
# (1 = positive, 0 = negative).
labeled_reviews = [
    ("This movie is fantastic!", 1),
    ("Terrible film, waste of time.", 0),
    ("Amazing plot and great acting.", 1),
    ("Boring and predictable.", 0),
]
texts = [review for review, _ in labeled_reviews]
labels = [sentiment for _, sentiment in labeled_reviews]

# The tokenizer and the classification model are loaded from the same
# checkpoint; the model is configured for two output labels.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# 데이터셋
class SimpleDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with their labels.

    All texts are tokenized once, at construction time, using the
    module-level ``tokenizer`` (with truncation and padding, returned as
    PyTorch tensors). Each item is a dict holding one row of every encoding
    field plus a ``'labels'`` tensor.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` as one batch and keep ``labels`` alongside."""
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th row of every encoding field plus its label."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# Wrap the corpus in a dataset and iterate over it in batches of two.
dataset = SimpleDataset(texts, labels)
loader = DataLoader(dataset, batch_size=2)

# Training setup: use the GPU when one is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

# Fine-tuning loop.
model.train()
for epoch in range(20):
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        # Everything except the labels is passed to the model as keyword inputs.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        # Use a distinct name here: rebinding `labels` would replace the
        # module-level label list with a batch tensor and break any later
        # re-creation of the dataset from `texts` and `labels`.
        batch_labels = batch['labels'].to(device)
        outputs = model(**inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean loss over the batches of this epoch.
    print(f'epoch : {epoch+1}, loss : {total_loss/len(loader)}')

