In [6]:
from transformers import BertTokenizer
# 토크나이져 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentences = [
    'Hello, world!',
    'unbelievable performance!',
    'COVID-19 pandemic'
]
for sentence in sentences:
   # 토큰화
   tokens =  tokenizer.tokenize(sentence)
   print(f'원문 : {sentence}')
   print(f'토큰 : {tokens}')

   #ID 변환
   ids = tokenizer.convert_tokens_to_ids(tokens)
   print(f'ID : {ids}')

   # 역변환
   decoded_string = tokenizer.decode(ids)
   print(f'역변환 : {decoded_string}\n')

원문 : Hello, world!
토큰 : ['hello', ',', 'world', '!']
ID : [7592, 1010, 2088, 999]
역변환 : hello, world!

원문 : unbelievable performance!
토큰 : ['unbelievable', 'performance', '!']
ID : [23653, 2836, 999]
역변환 : unbelievable performance!

원문 : COVID-19 pandemic
토큰 : ['co', '##vid', '-', '19', 'pan', '##de', '##mic']
ID : [2522, 17258, 1011, 2539, 6090, 3207, 7712]
역변환 : covid - 19 pandemic



In [7]:
# 2. Attention Mask : 실제단어 1  , 패딩은 0
from transformers import BertTokenizer
# 토크나이져 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentences = [
    'short sentence',
    'Thiis is a much longer sentence with more words'
]
# 여러문장을 한꺼번에 토크나이징하고 가장 긴 문장길이에 맞춰 자동 패딩 수행
encoded = tokenizer(
    sentences,
    padding=True,
    return_tensors='pt'
)
encoded


{'input_ids': tensor([[  101,  2460,  6251,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 16215,  6137,  2015,  2003,  1037,  2172,  2936,  6251,  2007,
          2062,  2616,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [18]:
# token_type_ids : 두 문장을 입력할때 첫번째 ,두번째 구분
from transformers import BertTokenizer
# 토크나이져 로드
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sentence_A = "The weather is nice"
sentence_B = "Let's go for a walk"
# 두 문장을 하나의 입력으로 인코딩
encoded = tokenizer(
    sentence_A,
    sentence_B,
    padding=True,
    return_tensors="pt"
)
tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
for token,token_id, type_id in zip(tokens, encoded["input_ids"][0],encoded["token_type_ids"][0]):
  segment = "문장 A" if type_id == 0 else "문장 B"
  if token == "[SEP]":
    segment = "구분자"
  elif token == "[CLS]":
    segment = "시작"
  print(f'{token:20s} {token_id.item():6d} {type_id.item():6d} ({segment})')

[CLS]                   101      0 (시작)
the                    1996      0 (문장 A)
weather                4633      0 (문장 A)
is                     2003      0 (문장 A)
nice                   3835      0 (문장 A)
[SEP]                   102      0 (구분자)
let                    2292      1 (문장 B)
'                      1005      1 (문장 B)
s                      1055      1 (문장 B)
go                     2175      1 (문장 B)
for                    2005      1 (문장 B)
a                      1037      1 (문장 B)
walk                   3328      1 (문장 B)
[SEP]                   102      1 (구분자)


In [28]:
# [CLS] Token Pooling : BERT 첫번째 토큰 [CLS] 문서 전체의 요약 => 분류 작업을 할때
# 이 토큰의 출력만 가져와서 분류기(classifier)에 연결
import torch
from transformers import BertTokenizer, BertModel
# 토크나이져 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
sentence = "BERT is amazing for NLP tasks!"
# 인코딩
inputs =  tokenizer(sentence,return_tensors='pt')
# BERT 통과
with torch.no_grad():
  outputs = model(**inputs)
#   출력 형태 확인
last_hidden_state = outputs.last_hidden_state
print(f'입력문장 : {sentence}')
print(f'last_hidden_state 형태 : {last_hidden_state.shape}')
print(f'batch_size = 1 sequence_length = {last_hidden_state.shape[1]} \
  hidden_size = {last_hidden_state.shape[2]}')
# [CLS]토큰 추출
cls_embedding = last_hidden_state[:, 0, :]
print(f'cls_embedding 형태 : {cls_embedding.shape}')
# 분류기 (2-class)
classifier = torch.nn.Linear(768,2)
logits = classifier(cls_embedding)  # (1,2)  (batch, class개수) [[0.85,0.65]]
probs = torch.softmax(logits, dim=-1)
print(f'logits : {logits}')
print(f'probs : {probs}')
print(f'predicted class : {torch.argmax(probs).item()}')


입력문장 : BERT is amazing for NLP tasks!
last_hidden_state 형태 : torch.Size([1, 10, 768])
batch_size = 1 sequence_length = 10   hidden_size = 768
cls_embedding 형태 : torch.Size([1, 768])
logits : tensor([[-0.0271,  0.3037]], grad_fn=<AddmmBackward0>)
probs : tensor([[0.4181, 0.5819]], grad_fn=<SoftmaxBackward0>)
predicted class : 1


In [7]:
# 미세 조정 학습 Fine-turning
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
texts = [
    "This movie is fantastic!",
    "Terrible film, waste of time.",
    "Amazing plot and great acting.",
    "Boring and predictable."
]
labels = [1, 0, 1, 0]  # 1=positive, 0=negative

# 토크나이져
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 모델
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
# 데이터셋
class SimpleDataset(Dataset):
  def __init__(self, texts, labels):
    self.encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    self.labels = labels
  def __getitem__(self, idx):
    item = {key: val[idx] for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item
  def __len__(self):
    return len(self.labels)
dataset = SimpleDataset(texts,labels)
loader = DataLoader(dataset, batch_size=2)
# 학습설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
# 미세조정(학습)
model.train()
for epoch in range(30):
  total_loss = 0
  for batch in loader:
    optimizer.zero_grad()
    inputs = { k:v.to(device) for k,v in batch.items() if k != 'labels'}
    labels = batch['labels'].to(device)
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f'epoch : {epoch+1}, loss : {total_loss/len(loader)}')



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


epoch : 1, loss : 0.6602306365966797
epoch : 2, loss : 0.6472322344779968
epoch : 3, loss : 0.7354162931442261
epoch : 4, loss : 0.6062647104263306
epoch : 5, loss : 0.6358301639556885
epoch : 6, loss : 0.4682944118976593
epoch : 7, loss : 0.4825413376092911
epoch : 8, loss : 0.467064306139946
epoch : 9, loss : 0.442411333322525
epoch : 10, loss : 0.39650824666023254
epoch : 11, loss : 0.39948126673698425
epoch : 12, loss : 0.3394634574651718
epoch : 13, loss : 0.2713230699300766
epoch : 14, loss : 0.23300963640213013
epoch : 15, loss : 0.18635407835245132
epoch : 16, loss : 0.1873004287481308
epoch : 17, loss : 0.17201333492994308
epoch : 18, loss : 0.16242393851280212
epoch : 19, loss : 0.16074499487876892
epoch : 20, loss : 0.1480371654033661
epoch : 21, loss : 0.10125178471207619
epoch : 22, loss : 0.11364258080720901
epoch : 23, loss : 0.08941005915403366
epoch : 24, loss : 0.08148659020662308
epoch : 25, loss : 0.08742554113268852
epoch : 26, loss : 0.05688166245818138
epoch : 27

In [20]:
# 추론
model.eval() # 평가모드
sample_sentences = [
"I am really disappointed with the result.",
"The service was terrible and not worth the money.",
"I don't like this product at all."
]
# 토큰화
inputs =  tokenizer(
    sample_sentences,
    truncation=True,
    padding=True,
    return_tensors='pt'
)
# gpu/cpu 설정
inputs = { k: v.to(device) for k,v in inputs.items()}
 # 추론
with torch.no_grad():
   outputs = model(**inputs)
   logits = outputs.logits
   probs = torch.softmax(logits, dim=-1)
   pred = torch.argmax(probs, dim=-1).detach().numpy()
   print(pred,probs)
# probs, pred  # 1=positive, 0=negative


[1 0 1] tensor([[0.3561, 0.6439],
        [0.6361, 0.3639],
        [0.2792, 0.7208]])
