In [None]:
import wandb
# Authenticate with Weights & Biases; the TrainingArguments below use report_to='wandb'.
wandb.login()

Load the dataset from the Hugging Face Hub

In [30]:
from datasets import load_dataset
# Fetch the "yahoo_answers_topics" dataset; later cells use its 'train' and 'test' splits.
dataset = load_dataset("yahoo_answers_topics")

In [31]:
# Display the loaded dataset object to inspect what it contains.
dataset

In [32]:
# Example: convert the integer topic id 4 to its string name.
dataset['train'].features['topic'].int2str(4)

In [33]:
# Distinct values of the 'topic' column in the training split, in ascending order.
label_list = sorted(dataset['train'].unique('topic'))
label_list

In [34]:
# Number of distinct topics; passed as num_labels when the classification model is created.
num_labels = len(label_list)
num_labels

In [37]:
# The Trainer expects the target column to be called 'labels', and the inference
# helper further down looks up class names under that column name, so the
# 'topic' column has to be renamed for the notebook to run top to bottom.
# The guard keeps the cell idempotent: re-running it after the rename is a no-op
# instead of an error. Note that cells above which read 'topic' must run first.
if 'topic' in dataset['train'].column_names:
    dataset = dataset.rename_column('topic', 'labels')

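The tokenizer splits raw text into tokens and converts them into the integer IDs the model expects. The tokenizer object also holds the vocabulary.
Each pretrained model has its own tokenizer, so we load the one that matches the model we are going to fine-tune.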

In [36]:
from transformers import AutoTokenizer
# Load the tokenizer for the same checkpoint ('distilbert-base-uncased') that the model cell uses.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [38]:
# Take the question title of the first training example as a sample input to look at.
sample_input = dataset['train'][0]['question_title']
sample_input

In [39]:
def _tokenize_batch(batch):
    """Tokenize the question titles of one batch, truncating over-long inputs."""
    return tokenizer(batch['question_title'], truncation=True)


# Apply the tokenizer to every split in batches; the tokenizer outputs are added as new columns.
dataset = dataset.map(_tokenize_batch, batched=True)

In [40]:
# Inspect the first training example after tokenization.
dataset['train'][0]

In [41]:
from transformers import AutoModelForSequenceClassification
# Load the DistilBERT checkpoint for sequence classification, with num_labels output classes
# (one per topic counted above).
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

In [13]:
import torch

def get_topic(sentence, tokenize=tokenizer, model=model):
    """Predict the topic name for a single sentence.

    Args:
        sentence: Text to classify.
        tokenize: Tokenizer callable used to encode ``sentence``. Defaults to
            the notebook-level ``tokenizer``.
        model: Sequence-classification model whose first output holds the
            per-class scores. Defaults to the notebook-level ``model``.

    Returns:
        The string name of the highest-scoring class.

    Side effects:
        Moves ``model`` to the selected device and puts it in eval mode.
    """
    # Use the GPU when one is available, otherwise fall back to the CPU
    # instead of crashing on an unconditional .cuda() call.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # Disable dropout so predictions are deterministic, even right after training.
    model.eval()
    # Encode with the tokenizer that was passed in (not the global one), and
    # truncate over-long inputs the same way the training data is truncated.
    inputs = tokenize(sentence, truncation=True, return_tensors='pt')
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    # Raw class scores (logits); they are not normalized probabilities, but
    # argmax over them still selects the most likely class.
    with torch.no_grad():
        logits = model(**inputs)[0].cpu().numpy()
    top_prediction = logits.argmax().item()
    # The label column is called 'labels' once renamed and 'topic' before that;
    # accept either so the helper works in both states.
    features = dataset['train'].features
    label_feature = features['labels'] if 'labels' in features else features['topic']
    return label_feature.int2str(top_prediction)

In [42]:
# Try the helper before fine-tuning, as a baseline to compare with the call after training.
get_topic('Why is cheese so much better with wine?')

We fine-tune the model with the `Trainer` API from `transformers`.

In [15]:
from transformers import Trainer, TrainingArguments

# Training configuration: a fixed number of steps, with periodic logging,
# evaluation and checkpointing, reported to W&B.
args = TrainingArguments(
    report_to = 'wandb',                     # enable logging to W&B
    output_dir = 'topic_classification',    # output directory
    overwrite_output_dir = True,            # reuse the output directory if it already exists
    evaluation_strategy = 'steps',          # evaluate every `eval_steps` steps (not once per epoch)
    learning_rate = 5e-5,                   # we can customize learning rate
    max_steps = 3000,                       # total number of training steps to run
    logging_steps = 100,                    # we will log every 100 steps
    eval_steps = 500,                      # we will perform evaluation every 500 steps
    save_steps = 1000,                      # save a checkpoint every 1000 steps (a multiple of eval_steps)
    load_best_model_at_end = True,          # reload the best checkpoint when training finishes
    metric_for_best_model = 'accuracy',     # "best" is judged by the accuracy metric from compute_metrics
    run_name = 'custom_training'            # name of the W&B run
)

In [19]:
import evaluate
import numpy as np

# Load the accuracy metric from the `evaluate` library; compute_metrics below calls its `compute` method.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (model outputs, reference labels).

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the predicted classes
        and the reference labels.
    """
    scores, references = eval_pred
    # The predicted class of each row is the index of its highest score.
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

In [20]:
# Wire the model, training configuration, data splits and metric function into a Trainer.
trainer = Trainer(
    model = model,                  # model to be trained
    args = args,                    # training args
    train_dataset=dataset['train'], # split used for fine-tuning
    eval_dataset=dataset['test'],   # split used for the periodic evaluations
    tokenizer=tokenizer,            # for padding batched data
    compute_metrics=compute_metrics # for custom metrics
)

In [43]:
# Baseline: evaluate on the test split before any fine-tuning has happened.
trainer.evaluate()

In [44]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [45]:
# Same query as before training, to compare the prediction after fine-tuning.
get_topic('Why is cheese so much better with wine?')

In [46]:
# Close the W&B run now that training and the final check are done.
wandb.finish()