[script example](https://github.com/huggingface/transformers/blob/9a12b9696fca52c71601b59a73c8e18426519027/examples/text-classification/run_glue.py#L295)

In [1]:
%%capture
!pip install transformers datasets sklearn

In [2]:
# imports

from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# logging

import random
import logging

logger = logging.getLogger(__name__)

# Setup logging
logging.basicConfig(
  format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
  datefmt="%m/%d/%Y %H:%M:%S",
  level=logging.INFO,
)

In [4]:
# load dataset


dataset = load_dataset('imdb')

label_list = dataset["train"].features["label"].names
num_labels = len(label_list)

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [5]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [6]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

#helper tokenizer function
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

# load dataset
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])
test_dataset = test_dataset.shuffle().select(range(10000)) # smaller the size for test dataset to 10k 

# test
train_dataset = train_dataset.shuffle().select(range(1000)) # smaller the size for test dataset to 10k 
test_dataset = test_dataset.shuffle().select(range(500)) # smaller the size for test dataset to 10k 



# tokenize dataset
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# train.rename_column_("label", "labels")


# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

# set format for pytorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

12/21/2020 21:01:01 - INFO - __main__ -   Sample 848 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 




In [7]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [8]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [9]:
trainer.train()

  return torch.tensor(x, **format_kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.67392,0.628,0.715596,0.574939,0.947368
2,No log,0.324889,0.876,0.878906,0.849057,0.910931


TrainOutput(global_step=126, training_loss=0.601897466750372)

In [15]:
import os 

eval_result = trainer.evaluate(eval_dataset=test_dataset)

output_eval_file = os.path.join(".", f"eval_results.txt")


# writes eval result to file which can be accessed later
with open(output_eval_file, "w") as writer:
    logger.info(f"***** Eval results *****")
    for key, value in sorted(eval_result.items()):
        logger.info(f"  {key} = {value}")
        writer.write(f"{key} = {value}\n")

12/21/2020 21:04:35 - INFO - __main__ -   ***** Eval results *****
12/21/2020 21:04:35 - INFO - __main__ -     epoch = 2.0
12/21/2020 21:04:35 - INFO - __main__ -     eval_accuracy = 0.876
12/21/2020 21:04:35 - INFO - __main__ -     eval_f1 = 0.87890625
12/21/2020 21:04:35 - INFO - __main__ -     eval_loss = 0.32488879561424255
12/21/2020 21:04:35 - INFO - __main__ -     eval_precision = 0.8490566037735849
12/21/2020 21:04:35 - INFO - __main__ -     eval_recall = 0.9109311740890689


In [16]:
trainer.save_model()  # Saves the model


# Custom Training Script

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

In [36]:
from tqdm import tqdm
import numpy as np
 # Eval!

def compute_metrics(preds, labels):
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
      'f1': f1,
      'precision': precision,
      'recall': recall
  }

logger.info("***** Running evaluation *****")
logger.info("  Num examples = %d", len(test_dataset))

eval_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    model.eval()

    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        tmp_eval_loss, logits = outputs[:2]

        eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    if preds is None:
        preds = logits.detach().cpu().numpy()
        out_label_ids = labels.detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)

eval_loss = eval_loss / nb_eval_steps
preds = np.argmax(preds, axis=1)
result = compute_metrics(preds, out_label_ids)

output_eval_file = os.path.join(".", "eval_man_results.txt")
with open(output_eval_file, "w") as writer:
    logger.info("***** Eval results  *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))


12/21/2020 21:23:44 - INFO - __main__ -   ***** Running evaluation *****
12/21/2020 21:23:44 - INFO - __main__ -     Num examples = 500
Evaluating: 100%|██████████| 16/16 [00:09<00:00,  1.74it/s]
12/21/2020 21:23:53 - INFO - __main__ -   ***** Eval results  *****
12/21/2020 21:23:53 - INFO - __main__ -     accuracy = 0.878
12/21/2020 21:23:53 - INFO - __main__ -     f1 = 0.8721174004192871
12/21/2020 21:23:53 - INFO - __main__ -     precision = 0.9043478260869565
12/21/2020 21:23:53 - INFO - __main__ -     recall = 0.8421052631578947
