In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Experiment scale and shuffle seed, named so they can be tuned in one place
# instead of being repeated as magic numbers.
SHUFFLE_SEED = 42
N_TRAIN_EXAMPLES = 1000
N_TEST_EXAMPLES = 300


def _random_subset(split, size, seed=SHUFFLE_SEED):
    """Return up to `size` rows of `split`, taken after a seeded shuffle.

    Args:
        split: a dataset split supporting `len()`, `.shuffle(seed=...)` and `.select(...)`.
        size: number of rows requested.
        seed: shuffle seed; a fixed value keeps the subset reproducible across runs.

    Returns:
        The shuffled split restricted to its first `min(size, len(split))` rows.
    """
    # Clamp so that asking for more rows than the split holds does not index out of range.
    n_rows = min(size, len(split))
    return split.shuffle(seed=seed).select(range(n_rows))


dataset = load_dataset("yelp_review_full")
small_train = _random_subset(dataset["train"], N_TRAIN_EXAMPLES)
small_test = _random_subset(dataset["test"], N_TEST_EXAMPLES)

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One checkpoint identifier shared by the tokenizer and the classifier.
model_name = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification wrapper around the checkpoint, configured for 5 output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def preprocess(example):
    """Tokenize the "text" field of `example` with the module-level `tokenizer`.

    Truncation is enabled and padding is set to "max_length"; no explicit
    `max_length` is passed, so the tokenizer's own default limit applies
    (TODO confirm the actual value for this checkpoint).

    Args:
        example: mapping with a "text" entry. This function is applied via
            `Dataset.map(..., batched=True)`, so the entry is presumably a
            batch of strings rather than a single string.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    text = example["text"]
    encoded = tokenizer(text, truncation=True, padding="max_length")
    return encoded

# Tokenize both subsets the same way (train first, then test); batched=True
# passes groups of rows to `preprocess` instead of one row at a time.
encoded_train, encoded_test = [
    split.map(preprocess, batched=True) for split in (small_train, small_test)
]

Map: 100%|██████████| 300/300 [00:00<00:00, 2712.69 examples/s]


In [5]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction pair.

    Args:
        eval_pred: a two-item iterable unpacked as `(logits, labels)`.
            `logits` is assumed to be 2-D (examples x classes), since the
            predicted class is the argmax along axis 1.

    Returns:
        dict with a single key, "accuracy", holding the score returned by
        `accuracy_score(labels, predictions)`.
    """
    scores, gold = eval_pred
    # Highest-scoring class per row becomes the prediction.
    predicted = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, predicted)
    return {"accuracy": accuracy}

# Training configuration. learning_rate is not set here, so the library default is used.
# NOTE(review): the training loss logged in this notebook's output stays around 1.6
# (about ln 5, i.e. chance level for 5 classes) for the whole epoch — consider tuning
# the learning rate and/or number of epochs.
training_args = TrainingArguments(
    # Schedule and batch sizes.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Output and logging locations.
    output_dir="./results",
    logging_dir="./logs",
    # Report the training loss every 10 steps and keep the progress bar visible.
    logging_steps=10,
    disable_tqdm=False,
)

# Wire the model, configuration, tokenized subsets and metric callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    # Data: tokenized training subset and held-out test subset.
    train_dataset=encoded_train,
    eval_dataset=encoded_test,
    # Callback that turns (logits, labels) into an accuracy dict.
    compute_metrics=compute_metrics,
)


In [7]:
# Run fine-tuning; keep the returned summary in a name and display it as the cell output.
train_result = trainer.train()
train_result



Step,Training Loss
10,1.6516
20,1.627
30,1.6516
40,1.6096
50,1.6105
60,1.6185
70,1.6274
80,1.6242
90,1.5987
100,1.5996


TrainOutput(global_step=125, training_loss=1.591982208251953, metrics={'train_runtime': 183.0107, 'train_samples_per_second': 5.464, 'train_steps_per_second': 0.683, 'total_flos': 263118142464000.0, 'train_loss': 1.591982208251953, 'epoch': 1.0})