In [1]:
!pip install datasets



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

from transformers import AdamW
from datasets import load_dataset
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

In [3]:
data = load_dataset("mteb/banking77")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 3080
    })
})

In [5]:
train_data = data['train']
test_data = data['test']

train_labels = train_data['label']
test_labels = test_data['label']

In [6]:
print(train_data)
print(test_data)

print(test_labels)
print(train_labels)

Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 10003
})
Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 3080
})
[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
def tokenize_function(examples):
  return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128
        )

encoded_train = train_data.map(tokenize_function, batched = True)
encoded_test = test_data.map(tokenize_function, batched = True)

In [9]:
print(encoded_train)
print(encoded_test)

Dataset({
    features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10003
})
Dataset({
    features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3080
})


In [10]:
def create_tf_dataset(features, labels, batch_size=16):
    def gen():
        for feature, label in zip(features, labels):
            input_ids = feature['input_ids']
            attention_mask = feature['attention_mask']

            # Ensure the input_ids and attention_mask are of length 128
            if len(input_ids) != 128:
                input_ids = input_ids[:128] + [0] * (128 - len(input_ids))
            if len(attention_mask) != 128:
                attention_mask = attention_mask[:128] + [0] * (128 - len(attention_mask))

            yield {'input_ids': input_ids,
                   'attention_mask': attention_mask}, label

    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                'input_ids': tf.TensorSpec(shape=(128,), dtype=tf.int32),
                'attention_mask': tf.TensorSpec(shape=(128,), dtype=tf.int32)
            },
            tf.TensorSpec(shape=(), dtype=tf.int32)
        )
    ).batch(batch_size)

In [11]:
train_tf_dataset = create_tf_dataset(encoded_train, train_labels, batch_size=16)
test_tf_dataset = create_tf_dataset(encoded_test, test_labels, batch_size=16)

In [12]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 75)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-5),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = ['accuracy']
)

In [18]:
import tensorflow as tf
import numpy as np

def check_for_nans(encoded_dataset):
    for feature in encoded_dataset:
        for key in feature:
            tensor = tf.convert_to_tensor(feature[key])
            if tensor.dtype.is_floating:
                if tf.reduce_any(tf.math.is_nan(tensor)):
                    print(f"NaN found in {key}")
            elif tensor.dtype.is_integer:
                if np.any(np.isnan(tensor.numpy())):
                    print(f"NaN found in {key}")
            else:
                print(f"Skipping non-numeric data in {key}")

# Check for NaNs in the training data
check_for_nans(encoded_train)

# Check for NaNs in the test data
check_for_nans(encoded_test)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-numeric data in label_text
Skipping non-numeric data in text
Skipping non-