In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments
from transformers import Trainer
from transformers import get_scheduler
from tqdm.auto import tqdm
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [3]:
train_file_path = "data/train-data.tsv"
test_file_path = "data/valid-data.tsv"

with open(train_file_path) as f:
    train_data = pd.read_csv(f, sep='\t', header=None)

with open(test_file_path) as f:
    test_data = pd.read_csv(f, sep='\t', header=None)

In [4]:
class_map = {'ham':0, 'spam':1}

train_dataset = [tokenizer(a) for a in train_data[1]]
for a,b in zip(train_dataset, train_data[0].map(class_map)):
     a['label'] = b

eval_dataset = [tokenizer(a) for a in test_data[1]]
for a,b in zip(eval_dataset, test_data[0].map(class_map)):
     a['label'] = b

In [5]:
train_dataset = train_dataset[0:100]
eval_dataset = eval_dataset[0:100]

In [6]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cpu')

In [7]:
BATCH_SIZE = 1
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn = data_collator
)

eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=BATCH_SIZE,
    collate_fn = data_collator
)

In [8]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

100


In [11]:
accuracy_metric = evaluate.load("accuracy")
metric = accuracy_metric
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
metric.compute()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'accuracy': 0.97}

In [12]:
progress_bar = tqdm(range(num_training_steps))

if True:
    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

 99%|█████████▉| 99/100 [00:07<00:00, 14.62it/s]

In [13]:
metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
metric.compute()

{'accuracy': 0.99}

100%|██████████| 100/100 [00:20<00:00, 14.62it/s]

In [16]:
class_unmap = {0:'ham', 1:'spam'}

model.eval()
def predict(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    proba = torch.sigmoid(outputs.logits)[0][1].item()
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return [proba, class_unmap[prediction]]

In [17]:
predict('sale today! to stop texts call 98912460324')

SequenceClassifierOutput(loss=None, logits=tensor([[-1.3256,  1.4295]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


[0.8068269491195679, 'spam']

In [18]:
predict('yo lets go party today')

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4692, -1.7337]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


[0.15011438727378845, 'ham']

In [None]:
inputs = tokenizer('This is spam send me money card 2522 0333 0555 5555', return_tensors="pt")
output = model(**inputs)
prediction.item()

0.7810627222061157

In [None]:
inputs = tokenizer('SPAM SPAM SPAM', return_tensors="pt")
output = model(**inputs)
prediction = torch.sigmoid(output.logits)[0][0]
prediction.item()

0.7843726277351379

In [None]:
 test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]
                  
[predict(a) for a in test_messages]

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4304, -1.7011]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[-1.2804,  1.3891]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4414, -1.7043]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.3678, -1.6031]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.3611, -1.6001]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4392, -1.7121]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4386, -1.7096]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


[[0.15432465076446533, 'ham'],
 [0.8004430532455444, 'spam'],
 [0.15390124917030334, 'ham'],
 [0.1675494760274887, 'ham'],
 [0.16796663403511047, 'ham'],
 [0.15289618074893951, 'ham'],
 [0.15321694314479828, 'ham']]

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4304, -1.7011]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[-1.2804,  1.3891]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4414, -1.7043]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.3678, -1.6031]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.3611, -1.6001]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4392, -1.7121]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4386, -1.7096]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
You haven't passed yet. Keep trying.
