In [1]:
import torch
from torch import nn
from transformers import ElectraModel, ElectraTokenizer

In [2]:
# Single source of truth for the pretrained checkpoint, so the model and the
# tokenizer are always loaded from the same checkpoint id.
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

# Load the pretrained ELECTRA encoder and its matching tokenizer.
model = ElectraModel.from_pretrained(MODEL_NAME)
tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['electra.embeddings.position_ids']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Show which concrete classes the two from_pretrained calls returned,
# one per line.
print(type(model), type(tokenizer), sep="\n")

<class 'transformers.modeling_electra.ElectraModel'>
<class 'transformers.tokenization_electra.ElectraTokenizer'>


In [25]:
# Two example Korean sentences used as tokenizer input in the cells below.
sentence1, sentence2 = (
    "오늘은 불금이지만 난 집에 박혀 코딩을 하고 있다",
    "그래도 난 행복해 길게 길게 길게 만들자",
)

In [26]:
# Encode the two sentences together as one sentence-pair input.
#   return_tensors='pt'     -> return PyTorch tensors (with a leading batch dim)
#   truncation=True         -> cut the pair down if it exceeds max_length
#   max_length=50           -> target sequence length for truncation and padding
#   padding='max_length'    -> pad shorter inputs up to max_length; replaces the
#                              deprecated `pad_to_max_length=True` flag
#   add_special_tokens=True -> add the special marker tokens around the sentences
inputs = tokenizer(
    sentence1, sentence2,
    return_tensors='pt',
    truncation=True,
    max_length=50,
    padding='max_length',
    add_special_tokens=True
)

In [27]:
# Display the full encoding returned by the tokenizer
# (input_ids, token_type_ids and attention_mask).
inputs

{'input_ids': tensor([[    2,  6451,  4112, 27123,  6965,  4172,  2239,  3354,  4073, 19556,
         26843,  4292, 14227,  3249,  4176,     3,  7505,  2239,  7003,  4151,
          2139,  4325,  2139,  4325,  2139,  4325,  6284,  4195,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}

In [34]:
# The tensor that is fed to the model as its input.
inputs['input_ids']

tensor([[    2,  6451,  4112, 27123,  6965,  4172,  2239,  3354,  4073, 19556,
         26843,  4292, 14227,  3249,  4176,     3,  7505,  2239,  7003,  4151,
          2139,  4325,  2139,  4325,  2139,  4325,  6284,  4195,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [30]:
# Ids that tell the sentences apart -> all 0 for a single sentence,
# a mix of 0 and 1 for a sentence pair.
inputs['token_type_ids']

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])

In [31]:
# Mask that separates padding from content -> 0 marks padding, 1 marks a real token.
inputs['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])

In [36]:
# Tokenize the input sentence and print the resulting list of tokens.
sentence1_tokens = tokenizer.tokenize(sentence1)
print(sentence1_tokens)

['오늘', '##은', '불금', '##이지', '##만', '난', '집', '##에', '박혀', '코딩', '##을', '하고', '있', '##다']


In [37]:
# Convert the sentence to its list of vocabulary ids and print it.
sentence1_ids = tokenizer.encode(sentence1)
print(sentence1_ids)

[2, 6451, 4112, 27123, 6965, 4172, 2239, 3354, 4073, 19556, 26843, 4292, 14227, 3249, 4176, 3]


In [38]:
# Round trip: sentence -> ids -> text, to see what the ids decode back to.
encoded_ids = tokenizer.encode(sentence1)
decoded_text = tokenizer.decode(encoded_ids)
print(decoded_text)

[CLS] 오늘은 불금이지만 난 집에 박혀 코딩을 하고 있다 [SEP]
