In [1]:
!pip install evaluate
!pip install datasets
!pip install accelerate>=0.20.1
!pip install transformers[torch]



In [2]:
from datasets import load_dataset

# Fetch the scam/spam text corpus by its dataset identifier.
DATASET_ID = "FredZhang7/all-scam-spam"
dataset = load_dataset(DATASET_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Inspect the loaded object: this cell's output lists its splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'is_spam'],
        num_rows: 42619
    })
})

In [4]:
# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
# dataset['train'] = dataset['train'].add_column('label', encoder.fit_transform(dataset['train']['text']))
# dataset['train'] = dataset['train'].remove_columns('text')

In [5]:
# Rename the target column from 'is_spam' to 'label'.
# Guarded so the cell can be re-run: once the column has been renamed, a second
# unguarded rename would raise because 'is_spam' no longer exists.
if 'is_spam' in dataset['train'].column_names:
    dataset['train'] = dataset['train'].rename_column('is_spam', 'label')

In [6]:
# Hold out 20% of the rows for evaluation. The shuffle is seeded so the same rows
# land in the test split on every run; without a seed the reported metrics are not
# comparable between runs.
# NOTE: this rebinds `dataset` to the split result, so re-running this cell splits
# the already-reduced train split again -- restart from the load cell instead.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

In [7]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    The tokenizer is asked for ``padding="max_length"`` and truncation with
    ``max_length=128``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever ``tokenizer`` returns for that batch of texts.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer published with the checkpoint, then encode the dataset in batches.
TOKENIZER_CHECKPOINT = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/34095 [00:00<?, ? examples/s]

Map:   0%|          | 0/8524 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model loaded from the spam-detection checkpoint,
# configured for two output labels.
MODEL_CHECKPOINT = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
NUM_LABELS = 2
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
)

In [10]:
from transformers import TrainingArguments

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the best
# F1 when training finishes. `metric_for_best_model` is meant to be used together
# with `load_best_model_at_end=True`; on its own it does not select a model. Loading
# the best model in turn requires the save strategy to match the evaluation strategy.
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=5e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    per_device_train_batch_size=64,  # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    metric_for_best_model="f1",
)

In [11]:
import numpy as np
import evaluate

# The F1 metric object used by `compute_metrics` during evaluation.
METRIC_NAME = "f1"
metric = evaluate.load(METRIC_NAME)

In [12]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A two-item ``(logits, labels)`` pair.

    Returns:
        The result of ``metric.compute`` for the arg-max class ids versus ``labels``.
    """
    raw_scores, gold_labels = eval_pred
    # The highest-scoring entry along the last axis is taken as the predicted class id.
    predicted_ids = np.argmax(raw_scores, axis=-1)
    scoring_inputs = {"predictions": predicted_ids, "references": gold_labels}
    return metric.compute(**scoring_inputs)

In [13]:
from transformers import Trainer

# Name the two splits up front so the Trainer call reads at a glance.
train_split = tokenized_datasets['train']
eval_split = tokenized_datasets['test']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning; this cell's output reports training loss, validation loss and F1 per epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.2699,0.140763,0.94024
2,0.1261,0.113504,0.952608
3,0.0956,0.097696,0.957607


TrainOutput(global_step=1599, training_loss=0.15976688726161553, metrics={'train_runtime': 1520.9074, 'train_samples_per_second': 67.253, 'train_steps_per_second': 1.051, 'total_flos': 32487941721600.0, 'train_loss': 0.15976688726161553, 'epoch': 3.0})

In [15]:
# Evaluate with the Trainer's configured eval dataset; the cell output is a dict of `eval_*` metrics.
trainer.evaluate()

{'eval_loss': 0.09769579023122787,
 'eval_f1': 0.9576066921877215,
 'eval_runtime': 36.9745,
 'eval_samples_per_second': 230.537,
 'eval_steps_per_second': 3.624,
 'epoch': 3.0}

In [16]:
# Run inference on the test split and turn the raw scores into class ids.
predictions = trainer.predict(tokenized_datasets['test'])
logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1)

In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the test-split predictions.
report = classification_report(labels, preds)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5016
           1       0.95      0.96      0.96      3508

    accuracy                           0.96      8524
   macro avg       0.96      0.96      0.96      8524
weighted avg       0.97      0.96      0.96      8524

