In [1]:
# hugging face transformers 라이브러리 설치
!pip install transformers



In [1]:
import torch


In [2]:
from transformers import AutoTokenizer, TFBertModel
import tensorflow as tf
# bert-base-uncased : version, tokenizer와 model은 version을 맞춰야 한다.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFBertModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(inputs)

last_hidden_states = outputs.last_hidden_state
# dense layer로 head 붙여서 쓰면 된다.

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [3]:

txt = "Hello, my dog is cute"

In [4]:
inputs = tokenizer(txt, return_tensors="tf")

In [5]:
outputs = model(inputs)

In [6]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

# 네이버 리뷰 데이터를 활용한 한국어 감성 분석
네이버 영화 리뷰데이터(Naver Sentiment Movie Corpus,NSMC)를 활용해서 감정분석을 수행했습니다.

In [3]:
import os

import numpy as np
import pandas as pd

from datetime import datetime
import json
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## 데이터 불러오기

In [64]:
# train = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt', header=0, delimiter='\t' ,quoting=3)
# test = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt', header=0, delimiter='\t' ,quoting=3)

train = pd.read_csv(r'data/train_aug2.csv', index_col = None)
train = train.dropna()
train = train.drop('Unnamed: 0', axis=1)
train.columns = ['document', 'label']
test = pd.read_csv(r'data/test_final.csv')
test.columns = ['document', 'label']

In [25]:
display(train.head())
display(test.head())

Unnamed: 0,document,label
0,부랴부랴 왔는데 아무도 안왔네. 시간개념들이 없네,0
1,맞아. 사람들이 진짜 개념이없다,0
2,저렇게는 살지 말아야지,0
3,그러게 게으르고 멍청한 사람들은 맞아야해,0
4,인방 보는 남자는 거르는게 맞다,0


Unnamed: 0,document,label
0,명절이면 좀 일찍 마쳐주지,0
1,원래 틀딱들은 눈치가 없어서 ㅋㅋㅋ,0
2,저 나이 먹고 저렇게 눈치 없는 것도 재주다,0
3,요즘은 왜 미세먼지가 별로 없지?,0
4,호주랑 중국이랑 싸우고 있어서 중국에서 호주산석탄을 수입 안해서 그렇대,0


In [65]:
train.shape, test.shape

((778841, 2), (46830, 2))

## 데이터 탐색

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 778841 entries, 0 to 778840
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   document  778841 non-null  object
 1   label     778841 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 11.9+ MB


In [28]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46830 entries, 0 to 46829
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  46830 non-null  object
 1   label     46830 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 731.8+ KB


In [66]:
# 중복확인
train['document'].nunique(), train['label'].nunique()

(770406, 2)

In [67]:
# 중복제거
train.drop_duplicates(subset=['document'], inplace=True)

In [68]:
# 데이터 분포 확인
train.label.value_counts()

1    445178
0    325228
Name: label, dtype: int64

In [69]:
# 결측치 확인
train.isnull().sum()

document    0
label       0
dtype: int64

In [70]:
# 결측치 제거
train.dropna(inplace=True)

### Tokenizer를 이용한 인코딩

- 한국어 BERT :  https://huggingface.co/snunlp/KR-BERT-char16424

- Fast Tokenizer : https://huggingface.co/docs/transformers/main_classes/tokenizer

In [34]:
# hugging face transformers 설치
# !pip install transformers

In [71]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained("klue/bert-base")
# config.save_pretrained("bert-base")

In [36]:
config

BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

In [37]:
from transformers import BertTokenizerFast, BertModel, AutoTokenizer

# tokenizer = BertTokenizerFast.from_pretrained("klue/bert-base")
# bert를 위한 tokenkzer : 서브워드 토크나이저. 더 작은 단위로 쪼개준다.
# working > work, ##ing
# out of vocabulary 문제가 완화된다.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

In [38]:
tokenizer

BertTokenizerFast(name_or_path='klue/bert-base', vocab_size=32000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [72]:
# 인코딩 전 test data의 결측치 제거
test = test.dropna()
print(len(test))

46830


In [73]:
# 데이터셋 크기 줄이기
num_train = 20000
num_test = 5000
train = train.sample(frac=1).reset_index(drop=True)

train = train[:num_train]
test = test[:num_test]

In [74]:
num_train = len(train)
num_test = len(test)

In [75]:
train.shape, test.shape

((20000, 2), (5000, 2))

In [76]:
train = train.sample(frac=1).reset_index(drop=True)
X_train_list = train['document'].tolist()
X_test_list = test['document'].tolist()
y_train = train['label'].tolist()
y_test = test['label'].tolist()

In [77]:
X_train_list[:2]

['이 개그튼 련이? 내가 시간관리를 못했다 이거냐? 춘 줜나 이때까지 과제만 했는데 니가 뭘 알아', '교통사고, 출생의비밀 개 잣같음']

In [78]:
tokenizer(X_train_list[:2])

{'input_ids': [[2, 1504, 9143, 2677, 983, 2052, 35, 732, 2116, 3641, 7604, 2138, 4047, 2062, 4647, 2529, 35, 1674, 1, 5107, 2299, 2118, 4897, 2154, 1902, 13964, 801, 2116, 1099, 4860, 3], [2, 8687, 16, 9119, 2079, 2151, 2531, 558, 1524, 2246, 2053, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [79]:
tokenizer(X_train_list[:2],  truncation=True, padding=True, return_tensors = 'tf')

{'input_ids': <tf.Tensor: shape=(2, 31), dtype=int32, numpy=
array([[    2,  1504,  9143,  2677,   983,  2052,    35,   732,  2116,
         3641,  7604,  2138,  4047,  2062,  4647,  2529,    35,  1674,
            1,  5107,  2299,  2118,  4897,  2154,  1902, 13964,   801,
         2116,  1099,  4860,     3],
       [    2,  8687,    16,  9119,  2079,  2151,  2531,   558,  1524,
         2246,  2053,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(2, 31), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(2, 31), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1

In [80]:
X_train = tokenizer(X_train_list, truncation=True, padding=True, return_tensors = 'tf')
X_test = tokenizer(X_test_list, truncation=True, padding=True, return_tensors = 'tf')

In [48]:
X_train[0]

Encoding(num_tokens=150, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [49]:
print(X_train[0].tokens)

['[CLS]', 'ㅈ', '##ㄴ', '##게', '미개', '##하', '##다', 'ㅋㅋㅋ', '귀여워', '##서', '머', '##가리', '총', '맞', '##음', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

In [50]:
print(X_train[0].ids)

[2, 190, 2755, 2318, 25394, 2205, 2062, 3901, 16612, 2112, 1058, 10809, 1668, 1047, 2053, 35, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [51]:
len(X_train[0].ids)

150

### 데이터셋 생성

In [81]:
import tensorflow as tf

# 주어진 데이터소스를 여러 Tensor로 자른 후 iterator(반복가능 객체)로 만들기
# 데이터를 배치단위로 묶어주는것. 큰 멘토스에서 멘토스 8개씩 묶어준다.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train),
    y_train
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_test),
    y_test
))

In [53]:
list(train_dataset.take(1))

[({'input_ids': <tf.Tensor: shape=(150,), dtype=int32, numpy=
   array([    2,   190,  2755,  2318, 25394,  2205,  2062,  3901, 16612,
           2112,  1058, 10809,  1668,  1047,  2053,    35,     3,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,    

In [82]:
example = val_dataset.take(1)
print(val_dataset.take(1))
a = list(example.as_numpy_iterator())
a

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(57,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(57,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(57,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>


[({'input_ids': array([    2,  7930, 24094,  1556,  5947, 30896,  2223,  2118,     3,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0]),
   'token_type_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
   'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
  0)]

#### tf.data 데이터셋 생성하기
- https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices

In [83]:
# Dataset.from_tensor_slice 예시
dataset = tf.data.Dataset.from_tensor_slices([[1, 2], [3, 4]])
list(dataset.as_numpy_iterator())

[array([1, 2]), array([3, 4])]

In [84]:
dataset = tf.data.Dataset.range(8)
dataset = dataset.batch(3)
list(dataset.as_numpy_iterator())

[array([0, 1, 2], dtype=int64),
 array([3, 4, 5], dtype=int64),
 array([6, 7], dtype=int64)]

- 자주 쓰는 tf.data.dataset 하위 메서드

    -  repeat(n) - Dataset n번 반복해서 하나로 붙힘

    - skip(n) - Dataset 반복자로 돌릴 때 n만큼 skip. n이 7이라면 1~10까지 돌리면 7까지 생략 후 8,9,10 나옴

    - batch(n) - Dataset을 n개의 batch로 쪼갬

    - shuffle(buffer_size, seed=None, reshuffle_each_iteration=None) : dataset을 섞기

### 모델 학습

- model source : https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/models/bert/modeling_tf_bert.py#L1605

### SubClass

In [85]:
import tensorflow as tf
class MyModel(tf.keras.Model):
    def __init__(self, model_name):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(1,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='sigmoid',
                                                name='classifier')
    # call : 선언되어 있는 객체를 call하는 함수
    # model = MyModel(a) --> 생성자 호출
    # model(b) --> call 호출
    def call(self, input_ids = None, attention_mask=None, token_type_ids=None):
        # input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_token = outputs[1]
        prediction = self.classifier(cls_token)

        return prediction

In [86]:
model = MyModel("klue/bert-base")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [87]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.BinaryCrossentropy()

model.compile(optimizer=optimizer, loss=loss, metrics = ['accuracy',
                                                                            tf.keras.metrics.TrueNegatives(name='true_negatives'),
                                                                            tf.keras.metrics.TruePositives(name='true_positives'),
                                                                            tf.keras.metrics.FalseNegatives(name='false_negatives'),
                                                                            tf.keras.metrics.FalsePositives(name='false_positives')])

In [88]:
from tensorflow.keras.callbacks import EarlyStopping

callback_earlystop = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=2)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", 
    factor=0.2, # new_lr = lr * factor.
    patience=2,
    cooldown=2, # number of epochs to wait before resuming normal operation after lr has been reduced.  
    min_lr=0
)
model.fit(
    train_dataset.shuffle(num_train).batch(32),
    epochs=100, batch_size=32,
    validation_data = val_dataset.shuffle(num_test).batch(32),
    callbacks = [callback_earlystop, reduce_lr]
)
model.save_weights('bert_weight')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [45]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tf_bert_model_1 (TFBertMode  multiple                 110617344 
 l)                                                              
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
Total params: 110,618,113
Trainable params: 110,618,113
Non-trainable params: 0
_________________________________________________________________


In [47]:
model.save_weights('bert_weight')
# model.load_weights('bert_weight')

#### HuggingFace Model 활용하기

In [45]:
from transformers import TFBertForSequenceClassification
model = TFBertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=2, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_111 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 110,618,882
Trainable params: 110,618,882
Non-trainable params: 0
_________________________________________________________________


In [47]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)

In [48]:
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [49]:
from tensorflow.keras.callbacks import EarlyStopping

callback_earlystop = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=2)

model.fit(
    train_dataset.shuffle(num_train).batch(32), epochs=1, batch_size=32,
    validation_data = val_dataset.shuffle(num_test).batch(32),
    callbacks = [callback_earlystop]
)



<keras.callbacks.History at 0x7e2b0eb6d300>

In [50]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_111 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 110,618,882
Trainable params: 110,618,882
Non-trainable params: 0
_________________________________________________________________


### 모델 테스트

In [58]:
text = '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아'
inputs = tokenizer(text, return_tensors="tf")
inputs

{'input_ids': <tf.Tensor: shape=(1, 23), dtype=int32, numpy=
array([[    2,  1097,  2275,  1504, 20609,  2031,  2073,    18,    18,
           18,    18,  8170,  2043,  1380,  3683,  3633,  2532,  5708,
         2259, 14236,  3614,  9958,     3]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 23), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 23), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]], dtype=int32)>}

In [52]:
model(**inputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.48946533, -0.68699247]], dtype=float32)>, hidden_states=None, attentions=None)

In [60]:
model.predict(inputs.input_ids)
# predict에는 특정 데이터만 넣는다?



TFSequenceClassifierOutput(loss=None, logits=array([[ 0.48946485, -0.68699205]], dtype=float32), hidden_states=None, attentions=None)

In [56]:
text = '노잼 영화'
inputs = tokenizer(text, return_tensors="tf")
model(**inputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 2.1713727, -2.0818331]], dtype=float32)>, hidden_states=None, attentions=None)

In [54]:
text = '재밌음. 킬링 타임용으로 볼 만한 영화'
inputs = tokenizer(text, return_tensors="tf")
model(**inputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-1.8879249,  1.5185672]], dtype=float32)>, hidden_states=None, attentions=None)

In [55]:
text = '킬링 타임용으로 볼 만한 영화'
inputs = tokenizer(text, return_tensors="tf")
model(**inputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.01443638, -0.70625836]], dtype=float32)>, hidden_states=None, attentions=None)

## seoyeon96

In [2]:
from transformers import TFBertForSequenceClassification

test_model = TFBertForSequenceClassification.from_pretrained("seoyeon96/KcELECTRA-MLM", from_pt=True)
test_model.summary()

RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
[WinError 182] 운영 체제가 %1을(를) 실행할 수 없습니다. Error loading "c:\Users\llljw\anaconda3\envs\mzpark\lib\site-packages\torch\lib\shm.dll" or one of its dependencies.

In [46]:
from transformers import TFBertForMaskedLM
from transformers import AutoTokenizer

masked_model = TFBertForMaskedLM.from_pretrained("seoyeon96/KcELECTRA-MLM", from_pt=True)
tokenizer = AutoTokenizer.from_pretrained("seoyeon96/KcELECTRA-MLM")

You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForMaskedLM: ['electra.encoder.layer.1.attention.self.value.weight', 'electra.encoder.layer.6.attention.self.query.weight', 'electra.encoder.layer.3.attention.output.dense.weight', 'electra.encoder.layer.0.intermediate.dense.bias', 'electra.encoder.layer.3.attention.self.query.bias', 'electra.encoder.layer.5.attention.self.query.weight', 'electra.encoder.layer.4.attention.self.key.bias', 'electra.encoder.layer.6.intermediate.dense.bias', 'electra.encoder.layer.4.attention.output.dense.bias', 'electra.encoder.layer.5.output.LayerNorm.bias', 'electra.encoder.layer.1.output.LayerNorm.bias', 'electra.encoder.layer.5.attention.self.value.bias', 'electra.encoder.layer.7.attention.output.dense.bias', 'electra.encoder.layer.7.attention.self.value.weight', '

Downloading (…)okenizer_config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/396k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [47]:
masked_model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  123954432 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  39542231  
                                                                 
Total params: 124,596,695
Trainable params: 124,596,695
Non-trainable params: 0
_________________________________________________________________


In [1]:
tr_bad = train[train['label'] == 1]

NameError: name 'train' is not defined

In [50]:
# Text -> Tokenization
inputs = tokenizer(X_train_list, return_tensors = 'tf', padding=True, truncation=True) #tensorflow tensor로 불러오라는 옵션

In [52]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [53]:
masked_model(inputs)

: 

In [None]:
# [MASK] 토큰 예측하기
from transformers import FillMaskPipeline

mask_pip = FillMaskPipeline(model=masked_model, tokenizer = tokenizer)