## References

- https://github.com/huggingface/transformers/blob/5f2a3d721c514cb160c74d2f2df6b729c2f99b2d/examples/text-classification/run_tf_text_classification.py

- [Jigsaw TPU: DistilBERT with Huggingface and Keras](https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras)

- https://huggingface.co/transformers/master/training.html

In [1]:
import os
import sys

In [2]:
if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/kaggle/kaggle-quora-question-pairs/notebook

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/kaggle/kaggle-quora-question-pairs/notebook


In [3]:
!pip install transformers -q
!pip install datasets -q
!pip install wandb -q

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold
import transformers
import datasets
from sklearn.metrics import accuracy_score
from transformers import (
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
)


In [5]:
import wandb
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mnamakemono[0m (use `wandb login --relogin` to force relogin)


In [6]:
class Config:
    def __init__(self):
        self.debug = False
        self.output_directory = "../output"
        self.train_filepath = "../input/quora-question-pairs/train.csv"
        self.test_filepath = "../input/quora-question-pairs/test.csv"
        self.model_name = "distilroberta-base"
        self.max_seq_length = 128
        self.batch_size = 32
        self.epochs = 3
        self.num_splits = 3
        self.learning_rate=3e-5
        self.project_name = "quora-question-pairs"
        self.name = "_".join([
            self.model_name,
            "seqlen-%d" % self.max_seq_length,
            "debug" if self.debug else "prod"
        ])
        self.weights_filepath_list = [f"{self.output_directory}/{self.name}-{kfold_index}.h5" for kfold_index in range(self.num_splits)]
        
config = Config()

In [7]:
wandb.config.learning_rate = config.learning_rate
wandb.config.batch_size = config.batch_size
wandb.config.epochs = config.epochs

In [8]:
os.makedirs(config.output_directory, exist_ok=True)

In [9]:
train_df = pd.read_csv(config.train_filepath)

In [10]:
def preprocess(df):
    df.fillna("", inplace=True)
preprocess(train_df)

In [11]:
if config.debug:
    train_df = train_df.sample(300)

In [12]:
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [13]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name, use_fast=True)

In [14]:
# cf. https://www.kaggle.com/nandanapoorv/entity-recognition-with-tf-keras-and-huggingface
# cf. https://www.kaggle.com/cdeotte/tensorflow-roberta-0-705
def build_model():
    x_input_ids = tf.keras.Input(shape=(config.max_seq_length, ), dtype=tf.int32)
    x_attention_mask = tf.keras.Input(shape=(config.max_seq_length, ), dtype=tf.int32)
    x_token_type_ids = tf.keras.Input(shape=(config.max_seq_length, ), dtype=tf.int32)
    bert_model = transformers.TFAutoModel.from_pretrained(config.model_name)
    outputs = bert_model(
        input_ids=x_input_ids,
        attention_mask=x_attention_mask,
        token_type_ids=x_token_type_ids
    )
    prediction = tf.keras.layers.Dense(1, activation="sigmoid")(outputs.pooler_output)
    model = tf.keras.Model(
        inputs=[x_input_ids, x_attention_mask, x_token_type_ids],
        outputs=[prediction]
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=config.learning_rate),
        loss="binary_crossentropy",
        metrics=["acc"]
    )
    return model

In [15]:
def to_dataset(df):
    input_shape = (len(df), config.max_seq_length)
    input_ids = np.zeros(input_shape, dtype=np.int32)
    attention_mask = np.zeros(input_shape, dtype=np.int32)
    token_type_ids = np.zeros(input_shape, dtype=np.int32)
    tokens = df.apply(lambda example: tokenizer(
        example["question1"],
        example["question2"],
        truncation=True,
        max_length=config.max_seq_length,
        padding="max_length"
    ), axis=1)
    for i, token in enumerate(tokens):
        input_ids[i] = token["input_ids"]
        attention_mask[i] = token["attention_mask"]
        if "token_type_ids" in token:
            token_type_ids[i] = token["token_type_ids"]
    x = (input_ids, attention_mask, token_type_ids)
    y = df["is_duplicate"].values
    return x, y

In [16]:
kf = KFold(n_splits=config.num_splits, shuffle=True, random_state=777)
oof = np.zeros(len(train_df))
for kfold_index, (train_index, valid_index) in enumerate(kf.split(train_df)):
    print(f"Create W&B project: {config.name}")
    with wandb.init(project=config.project_name, name=config.name) as wb:
        weights_filepath = f"../output/{config.model_name}-{kfold_index}.h5"
        callbacks = [
            tf.keras.callbacks.ModelCheckpoint(
                weights_filepath,
                monitor="val_loss",
                verbose=1,
                save_best_only=True,
                save_weights_only=True
            ),
            wandb.keras.WandbCallback()
        ]
        print("Convert to train dataset.")
        x_train, y_train = to_dataset(train_df.iloc[train_index])
        x_valid, y_valid = to_dataset(train_df.iloc[valid_index])
        print("Buiild model")
        model = build_model()
        if kfold_index == 0:
            print(model.summary())
        print("Start train")
        model.fit(
            x_train, y_train,
            validation_data=(x_valid, y_valid),
            callbacks=callbacks,
            epochs=config.epochs,
            batch_size=config.batch_size
        )
        print("Start predict")
        model.load_weights(weights_filepath)
        oof[valid_index] = model.predict(x_valid).flatten()

Create W&B project: distilroberta-base_seqlen-128_prod


[34m[1mwandb[0m: Currently logged in as: [33mnamakemono[0m (use `wandb login --relogin` to force relogin)


Convert to train dataset.
Buiild model


Some layers from the model checkpoint at distilroberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
_________________________________________

[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: 


Epoch 2/3

Epoch 00002: val_loss improved from 0.26881 to 0.25724, saving model to ../output/distilroberta-base-0.h5
Epoch 3/3

Epoch 00003: val_loss did not improve from 0.25724
Start predict


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,2.0
loss,0.1874
acc,0.92413
val_loss,0.26401
val_acc,0.89899
_runtime,7233.0
_timestamp,1616906691.0
_step,2.0
best_val_loss,0.25724
best_epoch,1.0


0,1
epoch,▁▅█
loss,█▄▁
acc,▁▅█
val_loss,█▁▅
val_acc,▁▃█
_runtime,▁▅█
_timestamp,▁▅█
_step,▁▅█


Create W&B project: distilroberta-base_seqlen-128_prod


Convert to train dataset.
Buiild model


Some layers from the model checkpoint at distilroberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Start train
Epoch 1/3

Epoch 00001: val_loss improved from inf to 0.26811, saving model to ../output/distilroberta-base-1.h5


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: 


Epoch 2/3

Epoch 00002: val_loss did not improve from 0.26811
Epoch 3/3

Epoch 00003: val_loss improved from 0.26811 to 0.25563, saving model to ../output/distilroberta-base-1.h5
Start predict


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,2.0
loss,0.18653
acc,0.9241
val_loss,0.25563
val_acc,0.90169
_runtime,7197.0
_timestamp,1616914230.0
_step,2.0
best_val_loss,0.25563
best_epoch,2.0


0,1
epoch,▁▅█
loss,█▄▁
acc,▁▆█
val_loss,▇█▁
val_acc,▁▅█
_runtime,▁▄█
_timestamp,▁▄█
_step,▁▅█


Create W&B project: distilroberta-base_seqlen-128_prod


Convert to train dataset.
Buiild model


Some layers from the model checkpoint at distilroberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Start train
Epoch 1/3

Epoch 00001: val_loss improved from inf to 0.26322, saving model to ../output/distilroberta-base-2.h5


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: 


Epoch 2/3

Epoch 00002: val_loss improved from 0.26322 to 0.25852, saving model to ../output/distilroberta-base-2.h5
Epoch 3/3

Epoch 00003: val_loss improved from 0.25852 to 0.24979, saving model to ../output/distilroberta-base-2.h5
Start predict


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,2.0
loss,0.18728
acc,0.92368
val_loss,0.24979
val_acc,0.89956
_runtime,7219.0
_timestamp,1616921785.0
_step,2.0
best_val_loss,0.24979
best_epoch,2.0


0,1
epoch,▁▅█
loss,█▄▁
acc,▁▆█
val_loss,█▆▁
val_acc,▁▄█
_runtime,▁▅█
_timestamp,▁▅█
_step,▁▅█


In [17]:
y = train_df["is_duplicate"].values
print("positive: %.4f" % np.mean(y))
print("acc: %.4f" % accuracy_score(y, oof > 0.5))

positive: 0.3692
acc: 0.8968
