In [1]:
# dependencies
import os
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# load data
from datasets import load_dataset

import warnings
import re
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this blanket filter also hides deprecation notices from the imported libraries — confirm that is intended.
warnings.filterwarnings('ignore')

----
# KerasNLP Training Approach

### Using the KerasNLP API to train transformers

In [2]:
# GLUE = General Language Understanding Evaluation benchmark
# CoLA = Corpus of Linguistic Acceptability, selected here as the GLUE configuration
# dataset card: https://huggingface.co/datasets/glue

dataset = load_dataset(path='glue', name='cola')
type(dataset)

datasets.dataset_dict.DatasetDict

In [4]:
# peek at the first 20 rows of the training split

dataset["train"][:20]

{'sentence': ["Our friends won't buy this analysis, let alone the next one we propose.",
  "One more pseudo generalization and I'm giving up.",
  "One more pseudo generalization or I'm giving up.",
  'The more we study verbs, the crazier they get.',
  'Day by day the facts are getting murkier.',
  "I'll fix you a drink.",
  'Fred watered the plants flat.',
  'Bill coughed his way out of the restaurant.',
  "We're dancing the night away.",
  'Herman hammered the metal flat.',
  'The critics laughed the play off the stage.',
  'The pond froze solid.',
  'Bill rolled out of the room.',
  'The gardener watered the flowers flat.',
  'The gardener watered the flowers.',
  'Bill broke the bathtub into pieces.',
  'Bill broke the bathtub.',
  'They drank the pub dry.',
  'They drank the pub.',
  'The professor talked us into a stupor.'],
 'label': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1],
 'idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}

In [5]:
# Keep only the training split from here on.
# This cell rebinds `dataset` from the split mapping (a dict subclass, see the
# type shown above) to a single split. The guard makes the cell safe to re-run:
# once `dataset` is already the split, it is left untouched instead of being
# indexed with "train" a second time.

if isinstance(dataset, dict):
    dataset = dataset["train"]

In [None]:
'''loading and tokenizing the data as NumPy arrays
    Notes: 
        - the labels are a list of 1s and 0s
        - they can be converted to a NumPy array without tokenizing'''

In [6]:
# build the tokenizer from the "bert-base-cased" pretrained checkpoint

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased"
)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize every training sentence in one batched call.
#   - list(...) hands the tokenizer a plain Python list of strings; the
#     tokenizer validates its text input as str / list / tuple, and newer
#     `datasets` releases may return a lazy column object here instead of a
#     list (NOTE(review): confirm against the installed `datasets` version).
#   - return_tensors="np" asks for NumPy arrays.
#   - padding=True pads each sequence to the longest one in the batch, so the
#     resulting arrays are rectangular.

tokenized_data = tokenizer(
    list(dataset["sentence"]),
    return_tensors="np",
    padding=True,
)

In [None]:
# Keras expects a plain dict, so unpack the tokenizer's BatchEncoding into one

tokenized_data = {field: array for field, array in tokenized_data.items()}

In [None]:
# the label column already holds 0/1 values, so no tokenizing is needed:
# pull the column out and wrap it in a NumPy array

label_column = dataset["label"]
labels = np.array(label_column)