In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
# Load the pretrained BERT encoder and attach a sequence-classification head.
# The head's weights are not part of the checkpoint (see the warning printed
# below), so they only become useful after fine-tuning.
# NOTE(review): the number of labels is left at the library default --
# presumably 2, matching the binary IMDB labels; confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
)
model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [3]:
# Tokenizer matching the model checkpoint; the fast implementation is
# requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    use_fast=True,
)
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
def tokenize(batch):
  """Encode the "text" column of a batch with the notebook-level tokenizer.

  Padding is enabled and sequences are truncated to at most 512 tokens, so
  all rows of one batch come back with a common length.

  Args:
    batch: Mapping with a "text" entry holding the raw strings to encode.

  Returns:
    Whatever the tokenizer call returns for those strings (the mapped dataset
    gains 'input_ids', 'token_type_ids' and 'attention_mask' columns).
  """
  texts = batch["text"]
  encoded = tokenizer(texts, truncation=True, max_length=512, padding=True)
  return encoded

# Request both IMDB splits in one call; the returned pair follows the order
# of the `split` list.
train_dataset, test_dataset = load_dataset(
    "imdb",
    split=["train", "test"],
)

Reusing dataset imdb (/home/jupyter/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [5]:
# Tokenize the training split. Using the split's own length as the batch size
# hands every row to `tokenize` in a single batch, so all rows share one
# padded length.
train_dataset = train_dataset.map(
    tokenize,
    batched=True,
    batch_size=len(train_dataset),
)
train_dataset

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-b6f1c1e206082448.arrow


Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
    num_rows: 25000
})

In [6]:
# Sanity check: number of rows in the tokenized training split.
len(train_dataset)

25000

In [7]:
# Inspect the column schema after tokenization (original columns plus the
# ones added by the tokenizer).
train_dataset.features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None),
 'text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [8]:
# Peek at the first training example: show the first and last five token ids
# so both the start of the sequence and the trailing padding are visible.
print(f"👉Dataset len(dataset): {len(train_dataset)}")
print("\n👉First item 'dataset[0]':")
first_input_ids = train_dataset[0]["input_ids"]
print(first_input_ids[:5], "...", first_input_ids[-5:])

👉Dataset len(dataset): 25000

👉First item 'dataset[0]':
[101, 22953, 2213, 4381, 2152] ... [0, 0, 0, 0, 0]


In [9]:
# Labels are stored as class indices; per the dataset features shown earlier
# the class names are ['neg', 'pos'], so index 1 corresponds to 'pos'.
print("\n👉First label 'dataset[0]':", train_dataset[0]["label"])


👉First label 'dataset[0]': 1


In [10]:
# Tokenize the held-out split the same way as the training split. The batch
# size must come from the test split itself: sizing it from the training split
# only works when both splits happen to have the same number of rows, and
# otherwise the test split would no longer be processed (and padded) as one
# single batch.
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# Expose only the columns the model consumes, returned as torch tensors; the
# raw 'text' and 'token_type_ids' columns stay in the dataset but are hidden
# from indexing.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-db408e4f49022fc0.arrow


In [11]:
def compute_metrics(pred):
  """Turn raw model outputs into accuracy / F1 / precision / recall.

  Args:
    pred: Object exposing `predictions` (per-class scores; the predicted
      class is the argmax over the last axis) and `label_ids` (the
      reference labels).

  Returns:
    Dict with the keys "accuracy", "f1", "precision" and "recall".
    Precision, recall and F1 use binary averaging.
  """
  y_true, y_pred = pred.label_ids, pred.predictions.argmax(-1)

  # The helper returns (precision, recall, f1, support); support is unused.
  prf_scores = precision_recall_fscore_support(y_true, y_pred, average="binary")

  metrics = {"accuracy": accuracy_score(y_true, y_pred)}
  metrics["f1"] = prf_scores[2]
  metrics["precision"] = prf_scores[0]
  metrics["recall"] = prf_scores[1]
  return metrics

In [12]:
# Hyperparameters for a single fine-tuning pass over the training split.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Schedule: one epoch, with 500 warmup steps for the learning rate.
    num_train_epochs=1,
    warmup_steps=500,
    # Regularization.
    weight_decay=0.01,
    # Per-device batch sizes; evaluation uses a larger batch than training.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

# Wire model, data and metrics together. The test split doubles as the
# evaluation set, so the reported eval metrics are test-set metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Metrics computed on every evaluation run
    compute_metrics=compute_metrics,
)

In [13]:
# Run fine-tuning with the arguments configured above; the returned training
# summary (global step, mean training loss, runtime metrics) is the cell output.
trainer.train()

Step,Training Loss
500,0.3795
1000,0.2579
1500,0.1968


TrainOutput(global_step=1563, training_loss=0.2758947715344371, metrics={'train_runtime': 831.0715, 'train_samples_per_second': 1.881, 'total_flos': 8408354150400000.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 62843, 'init_mem_gpu_alloc_delta': 439072256, 'init_mem_cpu_peaked_delta': 18306, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 473032, 'train_mem_gpu_alloc_delta': 1320721920, 'train_mem_cpu_peaked_delta': 95877369, 'train_mem_gpu_peaked_delta': 12996674560})

In [14]:
# Evaluate on the eval_dataset given to the Trainer (the test split); the
# values from compute_metrics appear in the result with an "eval_" prefix.
trainer.evaluate()

{'eval_loss': 0.1758674681186676,
 'eval_accuracy': 0.93704,
 'eval_f1': 0.9374254591715036,
 'eval_precision': 0.9317211948790897,
 'eval_recall': 0.9432,
 'eval_runtime': 254.4079,
 'eval_samples_per_second': 98.267,
 'epoch': 1.0,
 'eval_mem_cpu_alloc_delta': 511303,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 1557492,
 'eval_mem_gpu_peaked_delta': 2316674048}