In [46]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments
from transformers import Trainer

In [47]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [48]:
train_file_path = "data/train-data.tsv"
test_file_path = "data/valid-data.tsv"

with open(train_file_path) as f:
    train_data = pd.read_csv(f, sep='\t', header=None)

with open(test_file_path) as f:
    test_data = pd.read_csv(f, sep='\t', header=None)

In [64]:
class_map = {'ham':0, 'spam':1}

train_dataset = [tokenizer(a) for a in train_data[1]]
for a,b in zip(train_dataset, train_data[0].map(class_map)):
     a['label'] = b

eval_dataset = [tokenizer(a) for a in test_data[1]]
for a,b in zip(train_dataset, test_data[0].map(class_map)):
     a['label'] = b

In [50]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cpu')

In [71]:
train_dataset[0]

{'input_ids': [101, 6289, 23644, 2232, 1012, 1012, 1012, 2074, 22795, 2039, 999, 2018, 1037, 2919, 3959, 2055, 1057, 27793, 1010, 2061, 1045, 2123, 2102, 2066, 1057, 2157, 2085, 1024, 1007, 1045, 2134, 2102, 2113, 2505, 2055, 4038, 2305, 2021, 1045, 3984, 10047, 2039, 2005, 2009, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0}

In [80]:
BATCH_SIZE = 1
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn = data_collator
)

eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=BATCH_SIZE,
    collate_fn = data_collator
)

In [81]:
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

4179


In [82]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/4179 [01:53<?, ?it/s]
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  6289, 23644,  2232,  1012,  1012,  1012,  2074, 22795,  2039,
           999,  2018,  1037,  2919,  3959,  2055,  1057, 27793,  1010,  2061,
          1045,  2123,  2102,  2066,  1057,  2157,  2085,  1024,  1007,  1045,
          2134,  2102,  2113,  2505,  2055,  4038,  2305,  2021,  1045,  3984,
         10047,  2039,  2005,  2009,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([0])}
SequenceClassifierOutput(loss=tensor(0.6656, grad_fn=<NllLossBackward0>), logits=tensor([[-0.1136, -0.1694]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


  0%|          | 1/4179 [00:06<7:18:25,  6.30s/it]

{'input_ids': tensor([[ 101, 2017, 2064, 2196, 2079, 2498,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([0])}
SequenceClassifierOutput(loss=tensor(0.5469, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1609, -0.1567]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


  0%|          | 2/4179 [00:08<4:15:23,  3.67s/it]

{'input_ids': tensor([[  101,  2085,  1057,  2614,  2066,  2158,  4801,  8040, 15441,  2879,
          3889,  1010,  2066,   999,  1045,  2003,  8932,  2006,  4830,  3902,
          2188,  1012, 24185,  2102,  2038,  1057,  1999, 23356,  1018,  8640,
          4487,  2015,  6574,  1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([0])}
SequenceClassifierOutput(loss=tensor(0.2974, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4250, -0.6355]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


  0%|          | 3/4179 [00:13<4:58:31,  4.29s/it]

{'input_ids': tensor([[  101, 12954,  2360,  2057, 14071,  2000,  2175,  2059,  2175,  1012,
          1012,  1012,  2059,  2016,  2064, 18454,  2078, 12170,  2319,  3422,
          4830,  3221,  4538,  1012,  1012,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]]), 'labels': tensor([0])}
SequenceClassifierOutput(loss=tensor(0.1886, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3444, -1.2278]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


  0%|          | 4/4179 [00:17<5:00:34,  4.32s/it]

{'input_ids': tensor([[  101,  2196,  1061, 26947,  1012,  1012,  1012,  1045,  1058, 13971,
          1012,  1012,  1012,  2288, 28194,  1029, 23755,  2154,  1037, 29664,
          4604,  2033,  4830, 24471,  2140,  2064,  2102,  2147,  2028,  1012,
          1012,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([0])}


KeyboardInterrupt: 

In [None]:
metric = torch.nn.BCELoss()
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

AttributeError: 'BCELoss' object has no attribute 'add_batch'