In [1]:
# dependencies
import os
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

# load data
from datasets import load_dataset

import warnings
import re
warnings.filterwarnings('ignore')

2023-11-24 18:39:18.186626: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


----
# Keras Training Approach

### Using the Keras API to fine-tune Hugging Face Transformers models

In [2]:
# Load CoLA (Corpus of Linguistic Acceptability), one of the tasks in the
# GLUE (General Language Understanding Evaluation) benchmark.
# Dataset card: https://huggingface.co/datasets/glue

dataset = load_dataset("glue", "cola")
type(dataset)

datasets.dataset_dict.DatasetDict

In [3]:
# Peek at the first 20 training examples; slicing a split returns a dict
# mapping each column name to a list of values.

dataset["train"][:20]

{'sentence': ["Our friends won't buy this analysis, let alone the next one we propose.",
  "One more pseudo generalization and I'm giving up.",
  "One more pseudo generalization or I'm giving up.",
  'The more we study verbs, the crazier they get.',
  'Day by day the facts are getting murkier.',
  "I'll fix you a drink.",
  'Fred watered the plants flat.',
  'Bill coughed his way out of the restaurant.',
  "We're dancing the night away.",
  'Herman hammered the metal flat.',
  'The critics laughed the play off the stage.',
  'The pond froze solid.',
  'Bill rolled out of the room.',
  'The gardener watered the flowers flat.',
  'The gardener watered the flowers.',
  'Bill broke the bathtub into pieces.',
  'Bill broke the bathtub.',
  'They drank the pub dry.',
  'They drank the pub.',
  'The professor talked us into a stupor.'],
 'label': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1],
 'idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}

In [4]:
# Keep only the training split for the rest of the notebook.
#
# load_dataset() returned a DatasetDict (a dict of splits). After this cell has
# run once, `dataset` is already a single split, so an unguarded
# dataset["train"] on a re-run would index the split itself instead of the
# DatasetDict. The isinstance guard makes the cell safe to execute repeatedly.

if isinstance(dataset, dict):
    dataset = dataset["train"]

In [5]:
# Loading and tokenizing the data as NumPy arrays
#   Notes:
#     - the labels are a list of 1s and 0s
#     - they can be converted to a NumPy array without tokenizing

In [6]:
# initialize the tokenizer
#
# TOKENIZERS_PARALLELISM is set explicitly before the tokenizer is first used:
# when the process forks later on (see the warning emitted while the model is
# loaded), the tokenizers library otherwise warns and disables parallelism on
# its own. setdefault() leaves any value already chosen by the user untouched.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [7]:
# Tokenize every sentence in a single call.
#   dataset["sentence"] : the text to tokenize, one string per element
#   padding=True        : pad the outputs so they all share the same length
#   return_tensors="np" : return the result as NumPy arrays

tokenized_data = tokenizer(
    dataset["sentence"],
    padding=True,
    return_tensors="np",
)

In [8]:
# The tokenizer hands back a BatchEncoding; it is converted to a plain dict
# for Keras.
# A BatchEncoding can be pictured as a table: each row is one piece of text
# from the batch, and each column holds one kind of information about it
# (token IDs, attention mask, ...).

tokenized_data = dict(tokenized_data)

In [9]:
# The "label" column already contains 0s and 1s, so no tokenizing is needed;
# it only has to be turned into a NumPy array.

labels = np.array(dataset["label"])

In [10]:
# Load the pretrained BERT checkpoint with a sequence-classification head.
# Compiling and fitting happen in the cells that follow.

model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tuning transformers tends to work better with a small learning rate.

model.compile(optimizer=Adam(learning_rate=3e-5))

In [12]:
# Train on the tokenized sentences (inputs) against their labels (targets).

model.fit(x=tokenized_data, y=labels)



<keras.src.callbacks.History at 0x1679d9810>