In [1]:
# hugging face transformers 라이브러리 설치
!pip install transformers



In [2]:
import torch


In [3]:
from transformers import AutoTokenizer, TFBertModel
import tensorflow as tf
# The tokenizer and the model are loaded from the same checkpoint name:
# their versions have to match.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFBertModel.from_pretrained(checkpoint)

# Encode one sentence as TensorFlow tensors and run it through the model.
inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(inputs)

# Hidden states of the last layer; attach a dense layer on top to use them as a head.
last_hidden_states = outputs.last_hidden_state

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [4]:

# Sample sentence reused by the next cells.
txt = "Hello, my dog is cute"

In [5]:
# Tokenize the sample sentence and return TensorFlow tensors.
inputs = tokenizer(txt, return_tensors="tf")

In [6]:
# Forward pass of the tokenized sentence through the model.
outputs = model(inputs)

In [7]:
# List the fields available on the model output.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

# 네이버 리뷰 데이터를 활용한 한국어 감성 분석
네이버 영화 리뷰 데이터(Naver Sentiment Movie Corpus, NSMC)를 활용해서 감성 분석을 수행했습니다.

In [8]:
import os

import numpy as np
import pandas as pd

from datetime import datetime
import json
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## 데이터 불러오기

In [9]:
# Read both NSMC splits from the upstream GitHub repository.
# The files are tab-separated with a header row; quoting=3 is csv.QUOTE_NONE,
# so quote characters inside a review are kept as literal text.
nsmc_read_options = {'header': 0, 'delimiter': '\t', 'quoting': 3}

train = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt', **nsmc_read_options)
test = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt', **nsmc_read_options)

In [10]:
# Preview the first rows of the training and test splits.
display(train.head(), test.head())

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [11]:
# (rows, columns) of the training and test splits as loaded.
train.shape, test.shape

((150000, 3), (50000, 3))

## 데이터 탐색

In [12]:
# Column dtypes and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


In [13]:
# Column dtypes and non-null counts of the test split.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [14]:
# Check for duplicates: number of distinct review texts and of distinct labels.
train['document'].nunique(), train['label'].nunique()

(146182, 2)

In [15]:
# Remove rows whose review text repeats an earlier row.
# Reassigning instead of using `inplace=True` matches how `test` is cleaned later
# in this notebook and keeps the cell chainable.
train = train.drop_duplicates(subset=['document'])

In [16]:
# Class balance: number of reviews per label.
train['label'].value_counts()

label
0    73342
1    72841
Name: count, dtype: int64

In [17]:
# Count missing values in each column.
train.isna().sum()

id          0
document    1
label       0
dtype: int64

In [18]:
# Drop rows that contain a missing value.
# Reassignment (rather than `inplace=True`) mirrors the `test = test.dropna()`
# cell further down, so both splits are cleaned the same way.
train = train.dropna()

### Tokenizer를 이용한 인코딩

- 한국어 BERT :  https://huggingface.co/snunlp/KR-BERT-char16424 (참고용 링크이며, 이 노트북의 아래 셀에서는 klue/bert-base 체크포인트를 사용합니다.)

- Fast Tokenizer : https://huggingface.co/docs/transformers/main_classes/tokenizer

In [19]:
# hugging face transformers 설치
# !pip install transformers

In [20]:
from transformers import AutoConfig
# Load the configuration of the klue/bert-base checkpoint.
config = AutoConfig.from_pretrained("klue/bert-base")
# config.save_pretrained("bert-base")

In [21]:
# Display the loaded configuration.
config

BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

In [22]:
from transformers import BertTokenizerFast, BertModel, AutoTokenizer

# tokenizer = BertTokenizerFast.from_pretrained("klue/bert-base")
# Tokenizer for BERT: a subword tokenizer that splits words into smaller units,
# e.g. working -> work, ##ing
# This mitigates the out-of-vocabulary problem.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

In [23]:
# Display the tokenizer and its settings.
tokenizer

BertTokenizerFast(name_or_path='klue/bert-base', vocab_size=32000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [24]:
# Drop test rows with missing values before encoding, then report how many rows remain.
test = test.dropna()
print(test.shape[0])

49997


In [25]:
# Shrink the datasets: keep the first `num_train` training rows
# and the first `num_test` test rows.
num_train, num_test = 10000, 5000

train = train.iloc[:num_train]
test = test.iloc[:num_test]

In [26]:
# Confirm the sizes of the reduced splits.
train.shape, test.shape

((10000, 3), (5000, 3))

In [27]:
# Pull the review texts and the labels out of each split as plain Python lists.
X_train_list, y_train = (train[column].tolist() for column in ('document', 'label'))
X_test_list, y_test = (test[column].tolist() for column in ('document', 'label'))

In [28]:
# Peek at the first two raw review strings.
X_train_list[:2]

['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나']

In [29]:
# Plain tokenizer call on two reviews: no padding and no tensor conversion.
tokenizer(X_train_list[:2])

{'input_ids': [[2, 1376, 831, 2604, 18, 18, 4229, 9801, 2075, 2203, 2182, 4243, 3], [2, 1963, 18, 18, 18, 11811, 2178, 2088, 28883, 16516, 2776, 18, 18, 18, 18, 10737, 2156, 2015, 2446, 2232, 6758, 2118, 1380, 6074, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [30]:
# Same two reviews with truncation and padding enabled, returned as TensorFlow tensors.
tokenizer(X_train_list[:2],  truncation=True, padding=True, return_tensors = 'tf')

{'input_ids': <tf.Tensor: shape=(2, 25), dtype=int32, numpy=
array([[    2,  1376,   831,  2604,    18,    18,  4229,  9801,  2075,
         2203,  2182,  4243,     3,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0],
       [    2,  1963,    18,    18,    18, 11811,  2178,  2088, 28883,
        16516,  2776,    18,    18,    18,    18, 10737,  2156,  2015,
         2446,  2232,  6758,  2118,  1380,  6074,     3]])>, 'token_type_ids': <tf.Tensor: shape=(2, 25), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(2, 25), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1]])>}

In [31]:
# Encode both splits as TensorFlow tensors with the same tokenizer options.
# padding=True pads to the longest sequence of each call, so the two splits
# are padded independently of each other.
encode_options = dict(truncation=True, padding=True, return_tensors='tf')

X_train = tokenizer(X_train_list, **encode_options)
X_test = tokenizer(X_test_list, **encode_options)

In [32]:
# Encoding object of the first training example.
X_train[0]

Encoding(num_tokens=127, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [33]:
# Tokens of the first training example, including special and padding tokens.
print(X_train[0].tokens)

['[CLS]', '아', '더', '##빙', '.', '.', '진짜', '짜증', '##나', '##네', '##요', '목소리', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

In [34]:
# Token ids of the first training example.
print(X_train[0].ids)

[2, 1376, 831, 2604, 18, 18, 4229, 9801, 2075, 2203, 2182, 4243, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [35]:
# Padded sequence length of the first training example.
len(X_train[0].ids)

127

### 데이터셋 생성

In [36]:
import tensorflow as tf

# Slice the encoded tensors along their first axis and expose them as an iterable
# tf.data dataset of (features, label) pairs. Grouping into batches is done later
# with .batch().
train_dataset = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train))

# Same construction for the test split, which serves as validation data during training.
val_dataset = tf.data.Dataset.from_tensor_slices((dict(X_test), y_test))

In [37]:
# Inspect the first (features, label) element of the training dataset.
list(train_dataset.take(1))

[({'input_ids': <tf.Tensor: shape=(127,), dtype=int32, numpy=
   array([   2, 1376,  831, 2604,   18,   18, 4229, 9801, 2075, 2203, 2182,
          4243,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0])>,
   'token_type_ids': <tf.Tensor: shape=(127,), dtype=

In [38]:
# Take one validation element, print the dataset description (element spec),
# then materialize that element as NumPy values for inspection.
example = val_dataset.take(1)
print(example)
a = list(example.as_numpy_iterator())
a

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(107,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(107,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(107,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>


[({'input_ids': array([  2, 618, 191,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0]),
   'token_type_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0

#### tf.data 데이터셋 생성하기
- https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices

In [39]:
# Example of Dataset.from_tensor_slices: each row of the nested list becomes one element.
dataset = tf.data.Dataset.from_tensor_slices([[1, 2], [3, 4]])
list(dataset.as_numpy_iterator())

[array([1, 2]), array([3, 4])]

In [40]:
# range(8) grouped into batches of 3; the last batch holds the leftover elements.
dataset = tf.data.Dataset.range(8).batch(3)
list(dataset.as_numpy_iterator())

[array([0, 1, 2], dtype=int64),
 array([3, 4, 5], dtype=int64),
 array([6, 7], dtype=int64)]

- 자주 쓰는 tf.data.dataset 하위 메서드

    -  repeat(n) - Dataset을 n번 반복해서 하나로 붙임

    - skip(n) - Dataset 반복자로 돌릴 때 n만큼 skip. n이 7이라면 1~10까지 돌리면 7까지 생략 후 8,9,10 나옴

    - batch(n) - Dataset의 원소를 n개씩 묶어 batch로 만듦 (마지막 batch는 n개보다 적을 수 있음)

    - shuffle(buffer_size, seed=None, reshuffle_each_iteration=None) : dataset을 섞기

### 모델 학습

- model source : https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/models/bert/modeling_tf_bert.py#L1605

### SubClass

In [41]:
import tensorflow as tf
class MyModel(tf.keras.Model):
    """Binary classifier: a pretrained BERT encoder followed by one sigmoid unit.

    Args:
        model_name: Checkpoint name or path handed to
            ``TFBertModel.from_pretrained``; the weights are loaded with
            ``from_pt=True``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        # TruncatedNormal's first positional parameter is `mean`, so the 0.02 has to
        # be passed as `stddev` to act as the initializer range rather than shifting
        # the mean of the initial weights.
        self.classifier = tf.keras.layers.Dense(
            1,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            activation='sigmoid',
            name='classifier',
        )

    # MyModel(name) runs the constructor; calling the instance, model(x), runs call().
    def call(self, input_ids=None, attention_mask=None, token_type_ids=None, training=None):
        """Run the encoder and return the sigmoid output of the classifier.

        Args:
            input_ids: Token ids, or a mapping (such as the feature dict yielded by
                the tf.data pipeline) holding ``input_ids`` and optionally
                ``attention_mask`` / ``token_type_ids``.
            attention_mask: Attention mask; used when `input_ids` is not a mapping
                or the mapping has no ``attention_mask`` entry.
            token_type_ids: Token type ids; same fallback rule as `attention_mask`.
            training: Training flag forwarded to the encoder.

        Returns:
            Output of the single-unit sigmoid classifier applied to the encoder's
            pooled output.
        """
        # Keras passes the whole feature dict as the first positional argument,
        # so unpack it here instead of relying on the encoder to do it implicitly.
        if hasattr(input_ids, "keys"):
            features = input_ids
            input_ids = features["input_ids"]
            attention_mask = features.get("attention_mask", attention_mask)
            token_type_ids = features.get("token_type_ids", token_type_ids)

        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            training=training,
        )
        # Named access replaces the positional outputs[1].
        pooled = outputs.pooler_output
        return self.classifier(pooled)

In [42]:
# Instantiate the custom classifier on the klue/bert-base checkpoint (rebinds `model`).
model = MyModel("klue/bert-base")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['bert.embeddings.position_ids', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [43]:
# Binary cross-entropy pairs with the classifier's sigmoid output;
# Adam runs with a learning rate of 5e-5.
loss = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

In [44]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping on validation accuracy. With patience=2 it cannot trigger
# during the single epoch trained here; it only matters if `epochs` is raised.
callback_earlystop = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=2)

# The datasets are already batched with .batch(32), so `batch_size` is not passed
# to fit(): Keras asks for it to be omitted when the input is a tf.data dataset.
# Only the training data is shuffled; the order of validation examples does not
# affect the reported metrics.
model.fit(
    train_dataset.shuffle(num_train).batch(32),
    epochs=1,
    validation_data=val_dataset.batch(32),
    callbacks=[callback_earlystop]
)



<keras.callbacks.History at 0x215811250c0>

In [45]:
# Layer and parameter overview of the custom model.
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tf_bert_model_1 (TFBertMode  multiple                 110617344 
 l)                                                              
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
Total params: 110,618,113
Trainable params: 110,618,113
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Save the trained weights under the path 'bert_weight'.
model.save_weights('bert_weight')
# model.load_weights('bert_weight')

#### HuggingFace Model 활용하기

In [45]:
from transformers import TFBertForSequenceClassification
# Rebind `model` to Hugging Face's ready-made sequence classifier with two labels,
# loaded from the klue/bert-base checkpoint with from_pt=True.
model = TFBertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=2, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# Layer and parameter overview of the Hugging Face classifier.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_111 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 110,618,882
Trainable params: 110,618,882
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Adam with a learning rate of 5e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True: the loss is computed on raw logits with integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)

In [48]:
# Compile the classifier with the optimizer and loss defined above; track accuracy.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [49]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping on validation accuracy. With patience=2 it cannot trigger
# during the single epoch trained here; it only matters if `epochs` is raised.
callback_earlystop = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=2)

# The datasets are already batched with .batch(32), so `batch_size` is not passed
# to fit(): Keras asks for it to be omitted when the input is a tf.data dataset.
# Only the training data is shuffled; the order of validation examples does not
# affect the reported metrics.
model.fit(
    train_dataset.shuffle(num_train).batch(32),
    epochs=1,
    validation_data=val_dataset.batch(32),
    callbacks=[callback_earlystop]
)



<keras.callbacks.History at 0x7e2b0eb6d300>

In [50]:
# Layer and parameter overview after training.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_111 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 110,618,882
Trainable params: 110,618,882
Non-trainable params: 0
_________________________________________________________________


### 모델 테스트

In [58]:
# Tokenize one review as TensorFlow tensors and show the resulting encoding.
text = '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아'
inputs = tokenizer(text, return_tensors="tf")
inputs

{'input_ids': <tf.Tensor: shape=(1, 23), dtype=int32, numpy=
array([[    2,  1097,  2275,  1504, 20609,  2031,  2073,    18,    18,
           18,    18,  8170,  2043,  1380,  3683,  3633,  2532,  5708,
         2259, 14236,  3614,  9958,     3]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 23), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 23), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]], dtype=int32)>}

In [52]:
# Direct call with every field of the encoding; the output carries one logit per label.
model(**inputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.48946533, -0.68699247]], dtype=float32)>, hidden_states=None, attentions=None)

In [60]:
# predict() takes the same inputs as a direct call. Pass the whole encoding
# (input_ids, token_type_ids, attention_mask) instead of input_ids alone, so the
# attention mask is not silently dropped when the input contains padding.
model.predict(dict(inputs))



TFSequenceClassifierOutput(loss=None, logits=array([[ 0.48946485, -0.68699205]], dtype=float32), hidden_states=None, attentions=None)

In [56]:
# Score another review; the larger logit marks the predicted label index (0 or 1).
text = '노잼 영화'
inputs = tokenizer(text, return_tensors="tf")
model(**inputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 2.1713727, -2.0818331]], dtype=float32)>, hidden_states=None, attentions=None)

In [54]:
# Score a review that starts with an explicit opinion word followed by a neutral phrase.
text = '재밌음. 킬링 타임용으로 볼 만한 영화'
inputs = tokenizer(text, return_tensors="tf")
model(**inputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-1.8879249,  1.5185672]], dtype=float32)>, hidden_states=None, attentions=None)

In [55]:
# Same review as the previous cell without its first sentence, to compare the logits.
text = '킬링 타임용으로 볼 만한 영화'
inputs = tokenizer(text, return_tensors="tf")
model(**inputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.01443638, -0.70625836]], dtype=float32)>, hidden_states=None, attentions=None)