In [1]:
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# Hyperparameters shared by the preprocessing and training cells.
SEQ_LEN: int = 128     # length every encoded (question, text) pair is capped/padded to
BATCH_SIZE: int = 10   # batch size handed to model.fit
EPOCHS: int = 10       # maximum number of epochs handed to model.fit

# Load data

In [3]:
# Read the training records from disk and keep only the three fields the
# rest of the notebook works with.
with open('data/train.json', 'r', encoding='utf8') as f:
    data = json.load(f)

rows = [
    {'question': obj['question'], 'text': obj['text'], 'label': obj['label']}
    for obj in tqdm(data)
]
df = pd.DataFrame(rows)

HBox(children=(IntProgress(value=0, max=18108), HTML(value='')))




In [4]:
# Split the frame into the pieces used downstream: two Python lists of
# inputs for the tokenizer and a float NumPy vector of labels.
questions, texts = (list(df[column].values) for column in ('question', 'text'))
label_column = df['label']
labels = label_column.values.astype(float)

# Preprocess

In [5]:
# Load the pretrained tokenizer published under this checkpoint name.
tokenizer_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [6]:
# Encode every (question, text) pair into three parallel rows of length SEQ_LEN:
#   tokens          - input ids, right-padded with the tokenizer's pad id
#   sentence_masks  - token_type_ids from the tokenizer, right-padded with 0
#   attention_masks - 1 for real tokens, 0 for padding
tokens = []
sentence_masks = []
attention_masks = []

# Ask the tokenizer for its padding id instead of assuming it is 0.
pad_id = tokenizer.pad_token_id

# zip() has no len(), so tqdm needs an explicit total to draw a real progress bar.
for question, text in tqdm(zip(questions, texts), total=len(questions)):
    input_dict = tokenizer.encode_plus(question, text, add_special_tokens=True, max_length=SEQ_LEN)
    token, sentence_mask = input_dict['input_ids'], input_dict['token_type_ids']

    # A negative padding length would silently yield rows of different sizes
    # (list * negative == []), so fail loudly if the tokenizer did not cap the length.
    padding_length = SEQ_LEN - len(token)
    if padding_length < 0:
        raise ValueError(
            f'Encoded pair has {len(token)} tokens, more than SEQ_LEN={SEQ_LEN}; '
            'the tokenizer did not truncate it.'
        )

    # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
    attention_mask = [1] * len(token)

    # Pad all three rows on the right up to the sequence length.
    token = token + ([pad_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    sentence_mask = sentence_mask + ([0] * padding_length)

    tokens.append(token)
    sentence_masks.append(sentence_mask)
    attention_masks.append(attention_mask)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (602 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (630 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi




In [7]:
# Convert the three per-example lists built above into NumPy arrays in one pass.
tokens, sentence_masks, attention_masks = (
    np.array(sequences)
    for sequences in (tokens, sentence_masks, attention_masks)
)

In [8]:
# Hold out 10% of the examples for validation. All four arrays are split
# together so rows stay aligned, and the seed is fixed for repeatability.
split_parts = train_test_split(
    tokens, sentence_masks, attention_masks, labels,
    test_size=0.1,
    random_state=42,
)
(
    tokens_train, tokens_val,
    sentence_masks_train, sentence_masks_val,
    attention_masks_train, attention_masks_val,
    labels_train, labels_val,
) = split_parts

In [9]:
# Bundle each split as (inputs, labels), where inputs is a dict keyed by the
# argument names passed on to the model.
train_inputs = {
    'input_ids': tokens_train,
    'attention_mask': attention_masks_train,
    'token_type_ids': sentence_masks_train,
}
val_inputs = {
    'input_ids': tokens_val,
    'attention_mask': attention_masks_val,
    'token_type_ids': sentence_masks_val,
}
train_data = (train_inputs, labels_train)
val_data = (val_inputs, labels_val)

# Model

In [10]:
# Start from the published multilingual checkpoint. To continue from weights
# written earlier with save_pretrained, point this at './model/' instead.
model_checkpoint = 'bert-base-multilingual-cased'
model = TFBertForSequenceClassification.from_pretrained(model_checkpoint)

In [11]:
# Training setup: the model emits raw logits, so the loss is told to apply
# softmax itself (from_logits=True); labels are class indices, hence the
# "sparse" loss and accuracy.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Stop once the monitored quantity has not improved for 3 epochs and roll the
# weights back to the best epoch seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=3,
    verbose=1,
    restore_best_weights=True,
)

In [12]:
# Fine-tune with Keras' fit loop, validating on the held-out split after
# every epoch so the early-stopping callback can act.
train_features, train_targets = train_data
history = model.fit(
    x=train_features,
    y=train_targets,
    validation_data=val_data,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
)

Train on 16297 samples, validate on 1811 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 00006: early stopping


# Find optimal threshold

In [18]:
# Score the validation inputs, turn the logits into per-class probabilities,
# and lay out the decision thresholds to try (0.50 up to, not including, 1.0).
val_features = val_data[0]
logits = model.predict(val_features)
probs = tf.math.softmax(logits, axis=-1).numpy()
threshold_range = np.arange(0.5, 1.0, 0.05)

In [19]:
# Sweep the decision threshold on the class-1 probability and report
# validation accuracy and F1 for each setting.
val_labels = val_data[1]

# Widen to float64 up front so each probability is compared against the
# float64 threshold at full precision rather than after rounding the threshold.
positive_probs = probs[:, 1].astype(np.float64)

for threshold in threshold_range:
    # Predict class 1 only when its probability strictly exceeds the threshold.
    preds = (positive_probs > threshold).astype(int)
    accuracy = np.mean(preds == val_labels)
    f1 = f1_score(val_labels, preds)
    print(f'Prediction threshold = {threshold:.2f} - Validation accuracy: {accuracy*100:.4f}% - Validation f1 score: {f1:.4f}')

Prediction threshold = 0.50 - Validation accuracy: 87.1894% - Validation f1 score: 0.7895
Prediction threshold = 0.55 - Validation accuracy: 86.9133% - Validation f1 score: 0.7787
Prediction threshold = 0.60 - Validation accuracy: 87.0790% - Validation f1 score: 0.7780
Prediction threshold = 0.65 - Validation accuracy: 86.9133% - Validation f1 score: 0.7710
Prediction threshold = 0.70 - Validation accuracy: 86.2507% - Validation f1 score: 0.7542
Prediction threshold = 0.75 - Validation accuracy: 85.6985% - Validation f1 score: 0.7371
Prediction threshold = 0.80 - Validation accuracy: 84.8150% - Validation f1 score: 0.7090
Prediction threshold = 0.85 - Validation accuracy: 83.6002% - Validation f1 score: 0.6747
Prediction threshold = 0.90 - Validation accuracy: 82.8824% - Validation f1 score: 0.6501
Prediction threshold = 0.95 - Validation accuracy: 81.2258% - Validation f1 score: 0.6000


# Save model

In [45]:
# Saving is disabled by default; uncomment the next line to write the
# fine-tuned model to ./model/.
# model.save_pretrained('./model/')