In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split


In [2]:
# One seed drives the shuffle, the train/validation split and torch's global
# RNG, so a fresh "Restart & Run All" starts from the same random state.
SEED = 42
torch.manual_seed(SEED)

# Download the 20 Newsgroups corpus with headers, footers and quoted replies
# stripped, then pair every document with its integer class id.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
data = pd.DataFrame({'text_data': newsgroups.data, 'label': newsgroups.target})

# Show one raw example together with its label as a sanity check.
entry_index = 0
print(f"Text:\n{newsgroups['data'][entry_index]}\n\n")
print(f"Label index: {newsgroups['target'][entry_index]}")
print(f"Label name: {newsgroups['target_names'][newsgroups['target'][entry_index]]}")

# Shuffle the rows. Without random_state the row order (and therefore the
# split below) would change on every run despite the seeded split.
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Hold out 20% of the rows for validation.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=SEED)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Every document is padded or truncated to this many tokens by tokenize_data.
max_seq_len = 128

def tokenize_data(data, tokenizer, max_seq_len):
    """Encode every row of ``data`` into fixed-length model inputs.

    Args:
        data: DataFrame with a ``text_data`` column holding the raw text and
            a ``label`` column holding the integer class id.
        tokenizer: Callable Hugging Face-style tokenizer; it is invoked once
            per text and must return ``input_ids`` and ``attention_mask``.
        max_seq_len: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of ``torch.long``
        tensors with one entry per row of ``data``, in row order.
    """
    # Pull the two columns out once; ``iterrows`` would build a Series for
    # every row, which is slow and upcasts mixed dtypes.
    texts = data["text_data"].tolist()
    labels = data["label"].tolist()

    input_ids, attention_masks = [], []

    for text in tqdm(texts, total=len(texts)):
        # Calling the tokenizer directly replaces the deprecated
        # ``encode_plus`` and accepts the same keyword arguments.
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_seq_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])

    # Pin the dtype so the result is int64 even for an empty frame, where
    # dtype inference would otherwise fall back to float32.
    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(attention_masks, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long),
    )

# Encode the training split first, then the validation split, each into an
# (input ids, attention masks, labels) tensor triple.
train_tensors = tokenize_data(train_data, tokenizer, max_seq_len)
train_input_ids, train_attention_masks, train_labels = train_tensors

val_tensors = tokenize_data(val_data, tokenizer, max_seq_len)
val_input_ids, val_attention_masks, val_labels = val_tensors

Text:


I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




Label index: 10
Label name: rec.sport.hockey


100%|██████████| 15076/15076 [00:54<00:00, 278.72it/s]
100%|██████████| 3770/3770 [00:15<00:00, 238.76it/s]


In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16


def _build_loader(input_ids, attention_masks, labels, sampler_cls, batch_size):
    """Wrap three aligned tensors in a dataset, a sampler and a DataLoader.

    Returns the ``(dataset, sampler, dataloader)`` triple so that each piece
    stays available under its own name.
    """
    dataset = TensorDataset(input_ids, attention_masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training batches are drawn in random order.
train_dataset, train_sampler, train_dataloader = _build_loader(
    train_input_ids, train_attention_masks, train_labels, RandomSampler, batch_size
)

# Validation batches are visited in their stored order.
val_dataset, val_sampler, val_dataloader = _build_loader(
    val_input_ids, val_attention_masks, val_labels, SequentialSampler, batch_size
)

In [7]:
from transformers import BertForSequenceClassification, BertConfig

# `transformers.AdamW` is deprecated in favour of `torch.optim.AdamW` and is
# not importable from newer transformers releases. Keep the `AdamW` name
# available either way so this cell does not fail on import.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW

# One output unit per newsgroup category, taken from the loaded dataset
# rather than hard-coded.
num_labels = len(newsgroups.target_names)

# Load the pre-trained BERT encoder with a freshly initialised
# sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report

num_epochs = 3
# The scheduler is stepped once per training batch, so the schedule has to
# cover every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs

# Create the optimizer and scheduler for fine-tuning the model.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are pinned to the values the old optimizer used by default so
# the fine-tuning hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader``.

    Every batch is an (input ids, attention masks, labels) triple that is
    moved to ``device`` before the forward pass. The first element of the
    model output is used as the loss. The optimizer and the scheduler are
    each stepped once per batch.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    # Progress bar whose description shows the most recent batch loss.
    bar = tqdm(dataloader, desc="Training", position=0, leave=True)

    for step_batch in bar:
        input_ids, attention_masks, labels = (tensor.to(device) for tensor in step_batch)

        optimizer.zero_grad()

        loss = model(input_ids, attention_mask=attention_masks, labels=labels)[0]
        loss_value = loss.item()
        batch_losses.append(loss_value)

        loss.backward()
        optimizer.step()
        scheduler.step()

        bar.set_description(f"Training - Loss: {loss_value:.4f}")

    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, device):
    """Return the fraction of samples in ``dataloader`` the model classifies correctly.

    Accuracy is computed over all samples (correct predictions divided by
    samples seen) rather than as a mean of per-batch accuracies, which would
    give a smaller final batch the same weight as a full one.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            first output element holds the class logits.
        dataloader: Iterable of (input ids, attention masks, labels) batches.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The accuracy as a float, or ``nan`` when the dataloader yields no
        samples.
    """
    model.eval()
    correct_total = 0
    sample_total = 0

    # Use a progress bar during evaluation
    progress_bar = tqdm(dataloader, desc="Evaluation", position=0, leave=True)

    # No gradients are needed anywhere in the evaluation loop.
    with torch.no_grad():
        for batch in progress_bar:
            input_ids, attention_masks, labels = [t.to(device) for t in batch]

            outputs = model(input_ids, attention_mask=attention_masks)

            # The predicted class is the index of the largest logit.
            predicted = outputs[0].argmax(dim=-1)
            batch_correct = (predicted == labels).sum().item()
            batch_count = labels.numel()

            correct_total += batch_correct
            sample_total += batch_count

            # Per-batch accuracy is shown for progress feedback only.
            batch_accuracy = batch_correct / max(batch_count, 1)
            progress_bar.set_description(f"Evaluation - Batch Accuracy: {batch_accuracy:.4f}")

    if sample_total == 0:
        # Accuracy is undefined without samples; avoid a ZeroDivisionError.
        return float("nan")
    return correct_total / sample_total

# Train for the configured number of epochs, checking validation accuracy
# after each pass over the training data.
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    val_accuracy = evaluate(model, val_dataloader, device)

    # Single print call; the text emitted is the same two-line summary.
    print(
        f"\nEpoch {epoch + 1}/{num_epochs}\n"
        f"Loss: {train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}"
    )

Training - Loss: 0.7496: 100%|██████████| 943/943 [56:46<00:00,  3.61s/it]
Evaluation - Batch Accuracy: 0.8000: 100%|██████████| 236/236 [05:13<00:00,  1.33s/it]



Epoch 1/3
Loss: 1.4973 - Validation Accuracy: 0.6888


Training - Loss: 0.1821: 100%|██████████| 943/943 [57:08<00:00,  3.64s/it]
Evaluation - Batch Accuracy: 0.7000: 100%|██████████| 236/236 [05:18<00:00,  1.35s/it]



Epoch 2/3
Loss: 0.7804 - Validation Accuracy: 0.7201


Training - Loss: 0.5603: 100%|██████████| 943/943 [56:33<00:00,  3.60s/it]
Evaluation - Batch Accuracy: 0.7000: 100%|██████████| 236/236 [05:09<00:00,  1.31s/it]


Epoch 3/3
Loss: 0.5644 - Validation Accuracy: 0.7291





In [10]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def get_predictions(model, dataloader, device):
    """Collect predicted and true class ids for every sample in ``dataloader``.

    Every batch is an (input ids, attention masks, labels) triple moved to
    ``device``. The prediction for a sample is the argmax over the last axis
    of the first model output.

    Returns:
        Tuple ``(predictions, true_labels)`` of NumPy arrays in dataloader
        order.
    """
    model.eval()
    predicted_ids = []
    gold_ids = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids, attention_masks, labels = (tensor.to(device) for tensor in batch)

        # Forward pass only; gradients are not tracked.
        with torch.no_grad():
            model_output = model(input_ids, attention_mask=attention_masks)

        batch_logits = model_output[0].detach().cpu().numpy()
        predicted_ids.extend(np.argmax(batch_logits, axis=-1))
        gold_ids.extend(labels.cpu().numpy())

    return np.array(predicted_ids), np.array(gold_ids)

# Score the fine-tuned model on the whole validation set.
predictions, true_labels = get_predictions(model, val_dataloader, device)

# Overall accuracy plus per-class precision / recall / F1 (four decimals).
accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions, digits=4)

# One print call with a newline separator emits the same three lines.
print(
    f"Validation Accuracy: {accuracy:.4f}",
    "Classification Report:",
    report,
    sep="\n",
)

Evaluating: 100%|██████████| 236/236 [05:06<00:00,  1.30s/it]

Validation Accuracy: 0.7292
Classification Report:
              precision    recall  f1-score   support

           0     0.5126    0.6182    0.5604       165
           1     0.6869    0.7158    0.7010       190
           2     0.7225    0.6281    0.6720       199
           3     0.6262    0.6825    0.6532       189
           4     0.7485    0.6893    0.7176       177
           5     0.8315    0.8043    0.8177       184
           6     0.8408    0.8492    0.8450       199
           7     0.5464    0.7933    0.6471       208
           8     0.7202    0.7092    0.7147       196
           9     0.9382    0.8029    0.8653       208
          10     0.9505    0.9058    0.9276       191
          11     0.7811    0.8010    0.7909       196
          12     0.7333    0.6842    0.7079       209
          13     0.8558    0.8683    0.8620       205
          14     0.7861    0.7778    0.7819       189
          15     0.7238    0.7677    0.7451       198
          16     0.6552    0.6


