In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

raw = pd.read_csv("spam.csv", encoding="latin1")

# Keep only the label (v1) and message (v2) columns, renamed to the names the
# later cells rely on: the tokenization cell reads row["text"] and the
# training cell selects the "label" column.
# raw[[...]] raises KeyError when a column is missing, whereas
# pd.DataFrame(raw, columns=[...]) would silently create an all-NaN column.
messages = raw[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})

# Encode the target as integers. Series.map returns NaN for every value that
# is not a key of the mapping, which would silently turn the column into
# floats -- fail here with a clear message instead of deep inside training.
label_ids = messages["label"].map({"ham": 0, "spam": 1})
unmapped = label_ids.isna()
if unmapped.any():
    unexpected = sorted(messages.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(f"Unexpected label values in column 'v1': {unexpected}")
dataset = messages.assign(label=label_ids.astype("int64"))

# Hold out 25% for evaluation. Stratifying on the label keeps the ham/spam
# proportions the same in both splits; random_state makes the split repeatable.
train, test = train_test_split(
    dataset,
    test_size=.25,
    random_state=42,
    stratify=dataset["label"],
)

In [5]:
# import torch
from datasets import Dataset
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments, set_seed
)

# Tokenizer that belongs to the roberta-base checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def _encode_batch(batch):
    """Tokenize the "text" column of a batch of examples.

    Every message is truncated or padded to exactly 128 tokens, so all
    encoded examples have the same length.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


# Wrap each pandas split as a `datasets.Dataset` and add the tokenizer outputs
# as new columns; batched=True hands the helper many rows at once.
train_set = Dataset.from_pandas(train).map(_encode_batch, batched=True)
test_set = Dataset.from_pandas(test).map(_encode_batch, batched=True)

Map: 100%|██████████| 4179/4179 [00:01<00:00, 3559.63 examples/s]
Map: 100%|██████████| 1393/1393 [00:00<00:00, 3562.72 examples/s]


In [6]:
SEED = 42

# Return only the columns the model consumes, as torch tensors.
train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_set.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# The classification head is newly (randomly) initialised on top of the
# pretrained encoder, so seed the RNGs *before* building the model; otherwise
# every run starts from different head weights.
set_seed(SEED)
roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)


def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into classification metrics.

    Parameters
    ----------
    eval_pred : transformers.EvalPrediction
        Carries `predictions` (the model logits) and `label_ids` (the true
        labels) for the whole evaluation set.

    Returns
    -------
    dict[str, float]
        Accuracy, plus precision / recall / F1 for class 1 (the positive class).
    """
    logits = eval_pred.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = logits.argmax(axis=-1)
    actual = eval_pred.label_ids

    true_pos = int(((predicted == 1) & (actual == 1)).sum())
    false_pos = int(((predicted == 1) & (actual == 0)).sum())
    false_neg = int(((predicted == 0) & (actual == 1)).sum())

    # Guard the ratios so a split without positive predictions/labels gives 0.0
    # instead of a ZeroDivisionError.
    precision = true_pos / (true_pos + false_pos) if true_pos + false_pos else 0.0
    recall = true_pos / (true_pos + false_neg) if true_pos + false_neg else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return {
        "accuracy": float((predicted == actual).mean()),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; switch when upgrading the library.
    evaluation_strategy="epoch",
    logging_dir="./log",
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    seed=SEED,
)

trainer = Trainer(
    model=roberta,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    # Without this the per-epoch evaluation reports only the validation loss.
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 