In [1]:
!pip install torch transformers huggingface_hub datasets evaluate accelerate bitsandbytes > /dev/null

In [2]:
from huggingface_hub import notebook_login
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset, DataLoader
import torch
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [3]:
mnli_dataset = load_dataset('glue', 'mnli')
# The dataset contains whether the sentence entails, contradicts or is
# unrelated to a given hypothesis

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [4]:
mnli_dataset['validation_matched'][:5]

{'premise': ['The new rights are nice enough',
  'This site includes a list of all award winners and a searchable database of Government Executive articles.',
  "uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him",
  "yeah i i think my favorite restaurant is always been the one closest  you know the closest as long as it's it meets the minimum criteria you know of good food",
  "i don't know um do you do a lot of camping"],
 'hypothesis': ['Everyone really likes the newest benefits ',
  'The Government Executive articles housed on the website are not able to be searched.',
  'I like him for the most part, but would still enjoy seeing someone beat him.',
  'My favorite restaurants are always at least a hundred miles away from my house. ',
  'I know exactly.'],
 'label': [1, 2, 0, 2, 2],
 'idx': [0, 1, 2, 3, 4]}

In [5]:
mnli_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [6]:
label_set = set(mnli_dataset['train']['label'])
label_set

{0, 1, 2}

In [7]:
# Check each of the dataset parts and find the unique labels
test_set = set(mnli_dataset['test_mismatched']['label'])
test_set

{-1}

In [8]:
mnli_dataset['test_matched'][:5]

{'premise': ['Hierbas, ans seco, ans dulce, and frigola are just a few names worth keeping a look-out for.',
  'The extent of the behavioral effects would depend in part on the structure of the individual account program and any limits on accessing the funds.',
  'Timely access to information is in the best interests of both GAO and the agencies.',
  'Based in the Auvergnat spa town of Vichy, the French government often proved more zealous than its masters in suppressing civil liberties and drawing up anti-Jewish legislation.',
  'Built in 1870, its canopy of stained glass and cast iron is the oldest in Dublin; its enthusiastic interior decoration is also typical of the era.'],
 'hypothesis': ['Hierbas is a name worth looking out for.',
  'Many people would be very unhappy to loose control over their own money.',
  "It is in everyone's best interest to have access to information in a timely manner.",
  'The French government passed anti-Jewish laws aimed at helping the Nazi.',
  'It wa

In [9]:
mnli_dataset['test_mismatched'][:5]

{'premise': ['What have you decided, what are you going to do?',
  "Women's clothing is characterized by great diversity in styles and short production runs.",
  'Reports from two flight attendants in the coach cabin, Betty Ong and Madeline Amy Sweeney, tell us most of what we know about how the hijacking happened.',
  'At about 9:20, security personnel at FAA headquarters set up a hijacking teleconference with several agencies, including the Defense Department.',
  " So we've got a couple of aircraft up there that have those instructions at this present time?"],
 'hypothesis': ["So what's your decision?",
  "Men's clothing typically has the most stylistic diversity unlike the blandness of women's fashion.",
  'The report on the hijacking was over five hundred pages long.',
  'The teleconference lasted for 13 straight hours.',
  "At the present time, there weren't any aircraft in the air, right?"],
 'label': [-1, -1, -1, -1, -1],
 'idx': [0, 1, 2, 3, 4]}

In [10]:
mnli_dataset['validation_mismatched'][:5]

{'premise': ['Your contribution helped make it possible for us to provide our students with a quality education.',
  "The answer has nothing to do with their cause, however, but with the simple fact that dictionaries are not exercises in bi-unique substitutability; in other words, if one of the senses of run is `operate' (as in She runs an engine factory ), that does not make it valid to assume that one can substitute operate for run in We run in the marathon every year .  Although recognizing this as a shortcoming of dictionaries and assigning it arbitrarily to what, for lack of a better term, we might call the  genius  of the language, might seem trivial to the casual observer, it is a valid matter for concern in the realm of lexicology.",
  ' We serve a classic Tuscan meal that includes a Florentine terrine made with dick and chicken livers.',
  'A few months ago, Carl Newton and I wrote a letter asking you to consider a financial contribution to graduate Endodontics at Indiana Univ

In [11]:
# review the mrpc dataset
mrpc_dataset = load_dataset('glue', 'mrpc')
mrpc_dataset

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [12]:
mrpc_dataset.column_names

{'train': ['sentence1', 'sentence2', 'label', 'idx'],
 'validation': ['sentence1', 'sentence2', 'label', 'idx'],
 'test': ['sentence1', 'sentence2', 'label', 'idx']}

In [13]:
mrpc_dataset['train'][:5]

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
  'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
  'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .'],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
  'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at 

In [14]:
model_cp = "nghuyong/ernie-2.0-large-en"

tokenizer = AutoTokenizer.from_pretrained(model_cp)

tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [15]:
from transformers.utils import logging

logging.set_verbosity_debug()

In [16]:
torch.cuda.is_available()

True

In [17]:
model_ernie = AutoModelForSequenceClassification.from_pretrained(model_cp,
                                                                 num_labels=2,)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--nghuyong--ernie-2.0-large-en/snapshots/b8a2e493b0640891b1d99fe2e7c1011db62d1afb/config.json
Model config ErnieConfig {
  "_name_or_path": "nghuyong/ernie-2.0-large-en",
  "architectures": [
    "ErnieModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 512,
  "model_type": "ernie",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "task_type_vocab_size": 3,
  "transformers_version": "4.40.2",
  "type_vocab_size": 4,
  "use_cache": true,
  "use_task_id": false,
  "vocab_size": 30522
}



pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--nghuyong--ernie-2.0-large-en/snapshots/b8a2e493b0640891b1d99fe2e7c1011db62d1afb/pytorch_model.bin
All model checkpoint weights were used when initializing ErnieForSequenceClassification.

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-large-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
model_ernie.to('cuda')

ErnieForSequenceClassification(
  (ernie): ErnieModel(
    (embeddings): ErnieEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(4, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ErnieEncoder(
      (layer): ModuleList(
        (0-23): 24 x ErnieLayer(
          (attention): ErnieAttention(
            (self): ErnieSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ErnieSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNo

In [19]:
token_sentence = tokenizer(mrpc_dataset['test'][0]['sentence1'] + mrpc_dataset['test'][0]['sentence2'], return_tensors='pt')

In [20]:
pred_class = model_ernie(**token_sentence)
pred_class

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [21]:
def concat_tokenize(example):
  sentence = example['sentence1'] + example['sentence2']
  tokened_stmt = tokenizer(sentence, return_tensors='pt').to('cuda')
  example['input_ids'] = tokened_stmt['input_ids'][0]
  # example['token_type_id'] = tokened_stmt['token_type_ids']
  example['attention_mask'] = tokened_stmt['attention_mask'][0]
  return example

In [22]:
mrpc_dataset = mrpc_dataset.map(concat_tokenize)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [23]:
mrpc_dataset = mrpc_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])

In [24]:
type(mrpc_dataset['train'][0]['input_ids'])

list

In [25]:
metric = load_metric('glue', 'mrpc')

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [26]:
single_eval_test = metric.compute(predictions=[1],
               references=[1])

In [27]:
single_eval_test

{'accuracy': 1.0, 'f1': 1.0}

In [28]:
# instantiating trainer
# ensure to type each of the args, and try to ascertain the significance of it
# check the errors, if there is any mistakes in the arguments
train_args = TrainingArguments("/content/ernie_model",
                               evaluation_strategy="epoch",
                               num_train_epochs=2,
                               # save_strategy='epoch',
                               learning_rate=2e-5,
                               per_device_train_batch_size=8,
                               per_device_eval_batch_size=8,
                               weight_decay=0.01,
                               load_best_model_at_end=False,
                               # metric_for_best_model=metric,
                               push_to_hub=False,
                               report_to='none',
                               skip_memory_metrics=True)

PyTorch: setting up devices


In [29]:
import numpy as np

def compute_metric(eval_pred):
  pred, refs = eval_pred
  predictions = np.argmax(pred, axis=1)
  return metric.compute(predictions=predictions,
                        references=refs)

In [30]:
mrpc_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [31]:
trainer = Trainer(
    model_ernie,
    train_args,
    train_dataset=mrpc_dataset['train'],
    eval_dataset=mrpc_dataset['test'],
    compute_metrics=compute_metric,
    tokenizer=tokenizer
)
# https://github.com/huggingface/transformers/issues/22980

In [None]:
trainer.train()

Currently training with a batch size of: 8
***** Running training *****
  Num examples = 3,668
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 918
  Number of trainable parameters = 335,145,986


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.324693,0.870725,0.903505


***** Running Evaluation *****
  Num examples = 1725
  Batch size = 8
Saving model checkpoint to /content/ernie_model/checkpoint-500
Configuration saved in /content/ernie_model/checkpoint-500/config.json
Model weights saved in /content/ernie_model/checkpoint-500/model.safetensors
tokenizer config file saved in /content/ernie_model/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/ernie_model/checkpoint-500/special_tokens_map.json


In [None]:
from evaluate import evaluator

task_evaluator = evaluator("text-classification")

In [None]:
results = task_evaluator.compute(
    model_or_pipeline=model_ernie,
    data=mrpc_dataset['validation'],
    tokenizer=tokenizer,
    metric="accuracy",
    label_mapping={"LABEL_0": 0.0, "LABEL_1": 1.0},
    strategy="bootstrap",
    n_resamples=10,
    random_state=0
)