In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the SMS spam corpus from Google Drive.
raw = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/SPAM text message 20170820 - Data.csv",
    encoding="latin1",
)

# Keep the two relevant columns under the names the rest of the notebook uses:
# the tokenize step reads "text"; the Trainer presumably takes the target from
# "label" (confirm against the transformers documentation).
dataset = pd.DataFrame(raw, columns=["Category", "Message"])
dataset = dataset.rename(columns={"Category": "label", "Message": "text"})
# Encode the class names as integers: ham -> 0, spam -> 1.
dataset["label"] = dataset["label"].map({"ham": 0, "spam": 1})

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
train, test = train_test_split(dataset, test_size=0.25, random_state=42)

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from datasets import Dataset

# Fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
  """Tokenize the "text" entries of a batch with the module-level tokenizer.

  Sequences are truncated and padded with ``max_length=256`` so every
  example in the mapped dataset gets the same length.

  Args:
    batch: Mapping with a "text" entry holding the messages to encode.

  Returns:
    Whatever the tokenizer call returns for those messages.
  """
  texts = batch["text"]
  return tokenizer(texts, max_length=256, truncation=True, padding="max_length")

# Turn each pandas split into a Hugging Face Dataset, tokenize it in batches,
# and have it hand out torch tensors.
train_set = (
    Dataset.from_pandas(train)
    .map(tokenize, batched=True)
    .with_format("torch")
)
test_set = (
    Dataset.from_pandas(test)
    .map(tokenize, batched=True)
    .with_format("torch")
)

In [None]:
from transformers import (
    Trainer, TrainingArguments
)
from datasets import Dataset
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support)
import numpy as np

# Pretrained DistilBERT with a sequence-classification head configured for two labels.
distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

def compute_metrics(pred):
  """Score one evaluation pass for the Trainer.

  Args:
    pred: Pair that unpacks into ``(logits, labels)``.

  Returns:
    Dict with "accuracy", "precision", "recall" and "f1"; the last three
    are computed with ``average="binary"``.
  """
  scores, gold = pred
  # The predicted class is the index of the largest logit along the last axis.
  guessed = np.argmax(scores, axis=-1)

  prec, rec, f_score, _support = precision_recall_fscore_support(gold, guessed, average="binary")

  return {
      "accuracy": accuracy_score(gold, guessed),
      "precision": prec,
      "recall": rec,
      "f1": f_score,
  }

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations on Google Drive.
    output_dir="/content/drive/MyDrive/Colab Notebooks/results-distilBERT",
    logging_dir="/content/drive/MyDrive/Colab Notebooks/log",
    save_total_limit=1,
    report_to="none",
    # Optimisation schedule.
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate once per epoch.
    eval_strategy="epoch",
)

# The held-out split doubles as the evaluation set during training.
trainer = Trainer(
    model=distilbert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_set,
    eval_dataset=test_set,
)

trainer.train()

# Eval Phase

In [None]:
from transformers import (DistilBertTokenizerFast, DistilBertForSequenceClassification)
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload the fine-tuned weights from the saved checkpoint and switch to inference mode.
checkpoint = "/content/drive/MyDrive/Colab Notebooks/results-distilBERT/checkpoint-1046"
trained_distilbert = DistilBertForSequenceClassification.from_pretrained(checkpoint)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
trained_distilbert.eval()

# raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spam.csv", encoding="latin1")
# dataset = pd.DataFrame(raw, columns=["v1","v2"]).rename(columns={"v1": "label", "v2": "text"})
raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SPAM text message 20170820 - Data.csv", encoding="latin1")
dataset = pd.DataFrame(raw, columns=["Category","Message"]).rename(columns={"Category": "label", "Message": "text"})
dataset["label"] = dataset["label"].map({"ham": 0, "spam": 1})
# Use the same test_size and random_state as the training cell so that `test`
# is exactly the split that was held out during fine-tuning.
train, test = train_test_split(dataset, test_size=.25, random_state=42)

# Run inference in mini-batches: pushing the whole split through the model in
# a single forward pass needs memory proportional to the split size.
EVAL_BATCH_SIZE = 32
test_texts = test["text"].tolist()
batch_logits = []
with torch.no_grad():
    for start in range(0, len(test_texts), EVAL_BATCH_SIZE):
        inputs = tokenizer(
            test_texts[start:start + EVAL_BATCH_SIZE],
            truncation=True,
            padding=True,  # pad only to the longest message in this batch
            return_tensors="pt",
        )
        outputs = trained_distilbert(**inputs)
        batch_logits.append(outputs.logits)

# One row of logits per test message, in the same order as `test`.
logits = torch.cat(batch_logits)
predictions = torch.argmax(logits, dim=-1)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the true labels against the model's predicted classes.
conf_mat = confusion_matrix(y_true=test["label"], y_pred=predictions.numpy())

# Show integer counts on a blue colour map.
ConfusionMatrixDisplay(conf_mat).plot(cmap="Blues", values_format="d")
plt.show()

# Prediction

In [None]:
from transformers import (DistilBertTokenizerFast, DistilBertForSequenceClassification)
import torch

# Reload the tokenizer and the fine-tuned checkpoint so this cell does not
# depend on objects created by earlier cells.
checkpoint = "/content/drive/MyDrive/Colab Notebooks/results-distilBERT/checkpoint-1046"
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# .eval() returns the module itself, so it can be chained onto the load.
trained_distilbert = DistilBertForSequenceClassification.from_pretrained(checkpoint).eval()

def predict(msg:str):
  """Classify a single message with the reloaded checkpoint.

  Args:
    msg: The message text to classify.

  Returns:
    A ``(prediction, probabilities)`` pair: the index of the most probable
    class and the softmax probabilities as a plain list.
  """
  encoded = tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
  with torch.no_grad():
    logits = trained_distilbert(**encoded).logits
  proba = torch.softmax(logits, dim=1)
  return torch.argmax(proba).item(), proba.squeeze().tolist()

In [None]:
# Hand-picked sample messages for a quick sanity check of predict().
# Variables prefixed ham_ are intended as non-spam examples, spam_ as spam examples.
# NOTE(review): the replacement characters inside some literals look like
# mis-decoded bytes from the source data - confirm against the CSV encoding.
ham_test = "I'm in a meeting, call me later at"
ham_2 = "Do �_ noe if ben is going?"
# Surprising wording for a ham example; presumably labelled ham in the corpus - verify.
ham_3 = "My slave! I want you to take 2 or 3 pictures of yourself today in bright light on your cell phone! Bright light!"


spam_test = "Want to funk up ur fone with a weekly new tone reply TONES2U 2 this text. www.ringtones.co.uk, the original n best. Tones 3GBP network operator rates apply"
spam_2 = "Urgent Please call 09066612661 from landline. �5000 cash or a luxury 4* Canary Islands Holiday await collection. T&Cs SAE award. 20M12AQ. 150ppm. 16+ ���"
# Only spam_2 is scored here; swap the argument to try one of the other samples.
pred, prob = predict(spam_2)
print({"Prediction": pred, "Probability": prob})

# ROC Curve

In [None]:
import torch
from torch.nn.functional import softmax

def get_probabilities(model, tokenizer, texts, batch_size=32):
    """Return the class-1 probability for every text.

    Texts are scored in padded mini-batches rather than one forward pass per
    message, which is considerably faster on large inputs.

    Args:
        model: Callable model exposing ``eval()`` whose output has a
            ``logits`` attribute with one row per input text.
        tokenizer: Callable that encodes a list of strings into keyword
            arguments for ``model`` (called with ``return_tensors="pt"``,
            ``truncation=True`` and ``padding=True``).
        texts: Iterable of message strings.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        List of floats, one per text and in input order, holding the softmax
        probability of class index 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise once so generators and pandas Series can be sliced uniformly.
    texts = list(texts)

    model.eval()
    all_probs = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            # Pad to the longest text of the chunk so it stacks into one tensor.
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
            logits = model(**inputs).logits
            # Column 1 of the softmax is the probability of class index 1.
            all_probs.extend(softmax(logits, dim=1)[:, 1].tolist())
    return all_probs

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pandas as pd
from transformers import (DistilBertTokenizerFast, DistilBertForSequenceClassification)

# Reload the tokenizer and the fine-tuned checkpoint for this section.
checkpoint = "/content/drive/MyDrive/Colab Notebooks/results-distilBERT/checkpoint-1046"
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
trained_distilbert = DistilBertForSequenceClassification.from_pretrained(checkpoint)

# This section reads spam.csv (columns v1/v2) instead of the
# "SPAM text message 20170820 - Data.csv" file used for training.
# To switch back to the training corpus:
# raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SPAM text message 20170820 - Data.csv", encoding="latin1")
# dataset = pd.DataFrame(raw, columns=["Category","Message"]).rename(columns={"Category": "label", "Message": "text"})
raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spam.csv", encoding="latin1")
dataset = pd.DataFrame(raw, columns=["v1","v2"])
dataset = dataset.rename(columns={"v1": "label", "v2": "text"})
# Encode the class names as integers: ham -> 0, spam -> 1.
dataset["label"] = dataset["label"].map({"ham": 0, "spam": 1})

# Ground-truth labels and the model's class-1 probability for every message.
y_true = dataset["label"].tolist()
distilbert_probs = get_probabilities(
    trained_distilbert,
    tokenizer,
    dataset["text"].tolist(),
)

# ROC operating points and the area under the curve.
fpr_r, tpr_r, _ = roc_curve(y_true, distilbert_probs)
roc_auc_r = auc(fpr_r, tpr_r)

In [None]:
# Draw the ROC curve on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8,6))

# The model curve is plotted first so it takes the first colour of the cycle.
ax.plot(fpr_r, tpr_r, label=f"DistilBERT (AUC = {roc_auc_r:.4f})")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
ax.plot([0,1], [0,1], linestyle="--")

ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="DistilBERT - Different Dataset",
)
ax.legend(loc="lower right")

plt.show()