# tf-bert

## 1. Dataset

In [1]:
batch_size = 2

# **
# Create Dataset (Does the statement contains "cat" ?)
# *
texts_train = ['I like cat',
               'I do not like cat',
               'You are like a cat',
               'A cat lover never gives up',
               'He walks like a cat',
               'My favorite animal is cat',
               'I like dog',
               'I do not like dog',
               'You are like a dog',
               'Dog lovers always gives up',
               'She walks like a dog',
               'My favorite animal is dog',]
labels_train = [1,
                1,
                1,
                1,
                1,
                1,
                0,
                0,
                0,
                0,
                0,
                0]
texts_valid = ['I love cat',
               'I wish I had a cat',
               'I am cat',
               'I love dog',
               'I wish I had a dog',
               'I am dog',]
labels_valid = [1,
                1,
                1,
                0,
                0,
                0]
texts_test = ['cat walked away from me',
              'I miss my dog']
labels_test = [1,
               0]

## 2. Model

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from transformers import AutoTokenizer, TFBertForSequenceClassification
CHECKPOINT = 'bert-base-uncased'

2023-03-05 23:58:00.668471: I tensorflow/core/util/util.cc:175] Experimental oneDNN custom operations are on. If you experience issues, please turn them off by setting the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Notebook の構成
* セルごとに Stand-alone で動作するように記述

仕様
* 出力はラベルではなく logits とする(loss の算出のため)

TODO

* 学習パイプラインのバリデーション（重み更新できてるのか、学習前後で比較し確認）
* データ、数件では無理があるのでちゃんとしたものに変更
* input・output を見直し End-to-end のネットワークに変更
  * input: Tensor\[str\] を受け取り tokenize・encoding する層に変更（WARNING 解消のため; ベストプラクティスなのかは要確認）
  * output: bert の logits を受け取り dense 等任意の層で受け取る実装に変更（テーブルデータとの concat 実装等、拡張する際必要）

### 2.1 素のモデル

学習できている様子を示すためにはちゃんとしたデータの用意・タスクの設定が必要。

In [3]:
# Data
def get_dataset(tokenizer, texts, labels=None, batch_size=10):
    encodings = tokenizer(texts, truncation=True, padding=True)
    if labels:
        return tf.data.Dataset.from_tensor_slices((dict(encodings), labels)).batch(batch_size)
    else:
        return tf.data.Dataset.from_tensor_slices((dict(encodings))).batch(batch_size)
        
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
ds_train = get_dataset(tokenizer, texts_train, labels=labels_train)
ds_valid = get_dataset(tokenizer, texts_valid, labels=labels_valid)
ds_test  = get_dataset(tokenizer, texts_test)

# Model
pretrained_model = TFBertForSequenceClassification.from_pretrained(CHECKPOINT)
pretrained_model.trainable = False                       # 全層重みを一時的に freeze
pretrained_model.bert.encoder.layer[-1].trainable = True # BERT 最終層は trainable
pretrained_model.layers[-1].trainable = True             # 全結合層は trainable
pretrained_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00000001),
              loss='binary_crossentropy',
              metrics='accuracy')

# Train
early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=30, verbose=0)
history = pretrained_model.fit(ds_train,
                               epochs=100,
                               batch_size=batch_size,
                               validation_data=ds_valid,
                               callbacks=[early_stopping])

# Predict label
get_pred = lambda x: np.argmax(pretrained_model.predict(x)['logits'], axis=1)
get_pred(ds_test)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


array([0, 0])

## 2.2 Sequential

WIP

In [21]:
# Data
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
train_encodings = tokenizer(texts_train, truncation=True, padding=True, return_tensors='tf')
ds_train = tf.data.Dataset \
                        .from_tensor_slices((dict(train_encodings), labels_train)) \
                        .batch(2)
test_encodings = tokenizer(texts_test, truncation=True, padding=True)
ds_test = tf.data.Dataset \
                        .from_tensor_slices((dict(test_encodings))) \
                        .batch(2)
### TODO: Data validation should be implemented

# Model
CHECKPOINT = 'bert-base-uncased'
pretrained_model = TFBertForSequenceClassification.from_pretrained(CHECKPOINT)
model = tf.keras.models.Sequential()
model.add(pretrained_model)
model.add(tf.keras.layers.Dense(2, activation='softmax'))
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics='accuracy')


# Train
model.fit(ds_train, epochs=2)

# Predict
model.predict(ds_test)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2


TypeError: in user code:

    File "/home/ubuntu/python3-venv/lib/python3.8/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/home/ubuntu/python3-venv/lib/python3.8/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/ubuntu/python3-venv/lib/python3.8/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/home/ubuntu/python3-venv/lib/python3.8/site-packages/keras/engine/training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "/home/ubuntu/python3-venv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "<string>", line 3, in raise_from
        

    TypeError: Exception encountered when calling layer "sequential_10" "                 f"(type Sequential).
    
    Failed to convert 'TFSequenceClassifierOutput(loss=None, logits=TensorShape([None, 2]), hidden_states=None, attentions=None)' to a shape: ''logits''could not be converted to a dimension. A shape should either be single dimension (e.g. 10), or an iterable of dimensions (e.g. [1, 10, None]).
    
    Call arguments received by layer "sequential_10" "                 f"(type Sequential):
      • inputs={'input_ids': 'tf.Tensor(shape=(None, 7), dtype=int32)', 'token_type_ids': 'tf.Tensor(shape=(None, 7), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, 7), dtype=int32)'}
      • training=True
      • mask=None


## Bak

In [None]:
# Construct
model = tf.keras.models.Sequential()
model.add(pretrained_model)
model.add(tf.keras.layers.Dense(2, activation='softmax'))

# BERT は最終層のみ Fine tuning
model.layers[0].layers[0].trainable = False
model.layers[0].layers[1].trainable = False
model.layers[0].layers[2].trainable = True

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics='accuracy')
# Train
early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=4, verbose=0)
history = model.fit(ds_train,
                    epochs=2,
                    batch_size=batch_size,
                    validation_data=ds_valid,
                    callbacks=[early_stopping])

# # Learning curve
# pd.DataFrame({'train': history.history['loss'],
#               'valid': history.history['val_loss']}).plot()

# **
# Predict
# *
# for X, _ in ds_train:
#     print(model.predict_on_batch(X))