In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw SMS export; only its "v1" (class) and "v2" (message) columns are kept.
raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spam.csv", encoding="latin1")

# Later cells look the columns up by name: "text" is read when tokenizing and
# "label" when the datasets are formatted as torch tensors, so rename them here
# and encode ham/spam as 0/1.
dataset = (
    pd.DataFrame(raw, columns=["v1", "v2"])
    .rename(columns={"v1": "label", "v2": "text"})
    .assign(label=lambda frame: frame["label"].map({"ham": 0, "spam": 1}))
)

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
train, test = train_test_split(dataset, random_state=42, test_size=.25)

In [None]:
# import torch
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments
)
from datasets import Dataset

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def _encode_batch(batch):
    """Tokenize the "text" column of a batch, truncating/padding each entry to 128 tokens."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)


# Wrap both pandas splits as Hugging Face datasets and tokenize them batch-wise
# with the same settings.
train_set = Dataset.from_pandas(train).map(_encode_batch, batched=True)
test_set = Dataset.from_pandas(test).map(_encode_batch, batched=True)

In [None]:
# import torch
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments
)
from datasets import Dataset
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support)
import numpy as np

def compute_metrics(pred):
  """Compute accuracy, precision, recall and F1 for one evaluation pass.

  Args:
    pred: Object exposing ``predictions`` (per-class scores; the predicted
      class is the argmax over axis 1) and ``label_ids`` (the true labels).
      It is supplied by the ``Trainer`` this function is registered with.

  Returns:
    dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
    ``"f1"``; the last three use binary averaging (positive class = 1).
  """
  labels = pred.label_ids
  preds = np.argmax(pred.predictions, axis=1)

  accuracy = accuracy_score(labels, preds)
  # zero_division=0 makes the "no positive predictions / no positive labels"
  # case explicit: the affected metric is reported as 0 without emitting an
  # UndefinedMetricWarning on every evaluation.
  precision, recall, f1, _ = precision_recall_fscore_support(
      labels, preds, average="binary", zero_division=0
  )

  return {
      "accuracy": accuracy,
      "precision": precision,
      "recall": recall,
      "f1": f1
  }

# Expose only the model inputs and the target column, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
for split in (train_set, test_set):
    split.set_format(type="torch", columns=tensor_columns)

# Pretrained encoder with a two-class classification head (ham vs. spam).
roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args = TrainingArguments(
    # Where checkpoints and logs are written (Google Drive).
    output_dir="/content/drive/MyDrive/Colab Notebooks/results",
    logging_dir="/content/drive/MyDrive/Colab Notebooks/log",
    save_total_limit=1,
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Run an evaluation pass once per epoch.
    eval_strategy="epoch",
)

trainer = Trainer(
    model=roberta,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Run a final evaluation pass with the trainer built above and print the
# metrics dictionary it returns.
results = trainer.evaluate()
print(results)

# Evaluation phase

In [None]:
from transformers import (RobertaTokenizer, RobertaForSequenceClassification)
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
# from datasets import Dataset
# from torch.utils.data import DataLoader, TensorDataset

# Reload the fine-tuned weights written by the training run and switch the
# model to inference mode.
checkpoint = "/content/drive/MyDrive/Colab Notebooks/results/checkpoint-1046"
trained_roberta = RobertaForSequenceClassification.from_pretrained(checkpoint)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
trained_roberta.eval()

# Alternative input: the CSV that was used for fine-tuning (columns "v1"/"v2").
# raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spam.csv", encoding="latin1")
# dataset = pd.DataFrame(raw, columns=["v1","v2"]).rename(columns={"v1": "label", "v2": "text"})
raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SPAM text message 20170820 - Data.csv", encoding="latin1")
dataset = pd.DataFrame(raw, columns=["Category","Message"]).rename(columns={"Category": "label", "Message": "text"})
dataset["label"] = dataset["label"].map({"ham": 0, "spam": 1})
# NOTE: this rebinds `train`/`test` from the training cell; the confusion-matrix
# cell below reads `test` and `predictions` produced here.
train, test = train_test_split(dataset, test_size=.16, random_state=42)

# Score the held-out rows in mini-batches instead of a single forward pass over
# the whole split, so memory use is bounded by the batch size rather than by
# the number of messages.
EVAL_BATCH_SIZE = 32
texts = test["text"].tolist()
batch_predictions = []
with torch.no_grad():
    for start in range(0, len(texts), EVAL_BATCH_SIZE):
        inputs = tokenizer(
            texts[start:start + EVAL_BATCH_SIZE],
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        outputs = trained_roberta(**inputs)
        logits = outputs.logits
        batch_predictions.append(torch.argmax(logits, dim=-1))

# One predicted class index per row of `test`, in the same order.
predictions = (
    torch.cat(batch_predictions)
    if batch_predictions
    else torch.empty(0, dtype=torch.long)
)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Rows are the true classes and columns the predicted ones. Passing labels=[0, 1]
# pins the matrix to 2x2 even if one class never occurs, so the ham/spam tick
# labels below always line up with the 0/1 encoding used for the label column.
conf_mat = confusion_matrix(test["label"], predictions.numpy(), labels=[0, 1])

disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=["ham", "spam"])
disp.plot(values_format="d", cmap="Blues")
disp.ax_.set_title("RoBERTa - confusion matrix")
plt.show()

# Prediction

In [None]:
from transformers import (RobertaTokenizer, RobertaForSequenceClassification)
import torch

# Load the base tokenizer and the fine-tuned weights saved under the training
# output directory, then put the model in inference mode.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
checkpoint = "/content/drive/MyDrive/Colab Notebooks/results/checkpoint-1046"
trained_roberta = RobertaForSequenceClassification.from_pretrained(checkpoint)
trained_roberta.eval()

def predict(msg:str):
  """Classify a single message with the fine-tuned model.

  Uses the module-level ``tokenizer`` and ``trained_roberta`` objects.

  Args:
    msg: Text of the message to classify.

  Returns:
    A ``(prediction, probabilities)`` tuple. ``prediction`` is the index of
    the most probable class (this notebook encodes ham as 0 and spam as 1);
    ``probabilities`` is the list of softmax scores, one per class.
  """
  # `encoded` rather than `input`, so the builtin of that name is not shadowed.
  encoded = tokenizer(msg, truncation=True, padding=True, return_tensors="pt")
  with torch.no_grad():
    output = trained_roberta(**encoded)
  proba = torch.softmax(output.logits, dim=1)
  # argmax over the class axis; the batch holds exactly one message.
  prediction = torch.argmax(proba, dim=1).item()
  return prediction, proba.squeeze(0).tolist()

In [None]:
# One hand-written example per class, used as a quick sanity check of predict().
ham_test = "K I'll call you when I'm close"
spam_test = "Want to funk up ur fone with a weekly new tone reply TONES2U 2 this text. www.ringtones.co.uk, the original n best. Tones 3GBP network operator rates apply"

# Score both samples so each class is exercised; after the loop `pred`/`prob`
# hold the result for spam_test, the last message scored.
for sample in (ham_test, spam_test):
  pred, prob = predict(sample)
  print({"Prediction": pred, "Probability": prob})

# ROC Curve

In [None]:
# Mount Google Drive so the "/content/drive/..." paths used in this notebook resolve.
# NOTE(review): cells earlier in the notebook already read from /content/drive,
# so on a fresh runtime this mount has to run before them — consider moving it
# to the first cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
from torch.nn.functional import softmax

def get_probabilities(model, tokenizer, texts, batch_size=32):
    """Return the class-1 probability the model assigns to each text.

    Texts are tokenized and scored in mini-batches (padded so every batch forms
    a rectangular tensor) rather than one forward pass per text.

    Args:
        model: Callable classifier with an ``eval()`` method; calling it with
            the tokenizer output must return an object with a ``logits``
            attribute indexed as ``(example, class)``.
        tokenizer: Callable that accepts a list of strings together with
            ``return_tensors``, ``truncation`` and ``padding`` keyword
            arguments and returns a mapping of model inputs.
        texts: Iterable of strings to score.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        list of floats, one per input text and in the same order: the softmax
        probability of class index 1 (spam in this notebook's 0/1 encoding).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)
    model.eval()
    all_probs = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True)
            logits = model(**inputs).logits
            probs = softmax(logits, dim=1)
            # Column 1 holds the positive-class probability for every row.
            all_probs.extend(probs[:, 1].tolist())
    return all_probs


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pandas as pd
from transformers import (RobertaTokenizer, RobertaForSequenceClassification)

# Reload the base tokenizer and the fine-tuned checkpoint for scoring.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
checkpoint = "/content/drive/MyDrive/Colab Notebooks/results/checkpoint-1046"
trained_roberta = RobertaForSequenceClassification.from_pretrained(checkpoint)

# Alternative input: the second SMS spam CSV (columns "Category"/"Message").
# raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SPAM text message 20170820 - Data.csv", encoding="latin1")
# dataset = pd.DataFrame(raw, columns=["Category","Message"]).rename(columns={"Category": "label", "Message": "text"})
raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spam.csv", encoding="latin1")
dataset = (
    pd.DataFrame(raw, columns=["v1", "v2"])
    .rename(columns={"v1": "label", "v2": "text"})
    .assign(label=lambda frame: frame["label"].map({"ham": 0, "spam": 1}))
)

# NOTE(review): every row of spam.csv is scored here, including the rows the
# training cell used for fine-tuning, so this curve is not a held-out estimate.
y_true = dataset["label"].tolist()
roberta_probs = get_probabilities(trained_roberta, tokenizer, dataset["text"].tolist())

# ROC points and the area under them, computed from the class-1 probabilities.
fpr_r, tpr_r, _ = roc_curve(y_true, roberta_probs)
roc_auc_r = auc(fpr_r, tpr_r)

In [None]:
# Draw the ROC curve on an explicit Axes, with the chance diagonal for reference.
fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(fpr_r, tpr_r, label=f"RoBERTa (AUC = {roc_auc_r:.4f})")
ax.plot([0, 1], [0, 1], linestyle="--")

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("RoBERTa - Same Dataset")
ax.legend(loc="lower right")

plt.show()