In [1]:
# (Re)load the autoreload extension and use mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

## Imports

In [2]:
import transformers
# Record the installed transformers version next to the results below.
print(transformers.__version__)

4.51.2


In [3]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

In [4]:
# Checkpoint identifier passed to from_pretrained for both the config and the tokenizer.
model_name = "google-bert/bert-base-uncased"

In [5]:
# The tokenizer (vocabulary) and the architecture config both come from the
# pretrained checkpoint; the config is set up for a 5-way classification head.
tokenizer = BertTokenizer.from_pretrained(model_name)
config = BertConfig.from_pretrained(model_name, num_labels=5)

# The model is constructed from the config alone, so its weights are randomly
# initialized rather than loaded from the checkpoint.
model = BertForSequenceClassification(config)

In [6]:
# Display the module tree of the freshly built model.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Load Dataset

In [7]:
# Hyperparameters and dataset constants used by the cells below.
MAX_LENGTH = 128  # max_length handed to the tokenizer via TextDataset
BATCH_SIZE = 16  # batch size of both DataLoaders
EPOCHS = 10  # number of passes over train_loader
LEARNING_RATE = 2e-5  # learning rate given to AdamW
# NOTE(review): NUM_CLASSES is not referenced by the visible cells; the model
# config hard-codes num_labels=5 instead.
NUM_CLASSES = 5  # 5 classes: Politics 0, Sport 1, Technology 2, Entertainment 3, Business 4

In [8]:
# Read the CSV and give its two columns fixed names.
df = pd.read_csv("../dataset/df_file.csv")
df.columns = ["text", "label"]

# Hold out 20% of the rows for testing; the split is stratified on the label
# column and seeded so it is reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
print(train_df.shape, test_df.shape)

(1780, 2) (445, 2)


In [9]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Each item is a dict with 1-D ``input_ids`` and ``attention_mask`` tensors
    (flattened tokenizer output) and a scalar ``label`` tensor of dtype
    ``torch.long``.

    Args:
        texts: Sequence of raw texts (list, NumPy array or pandas Series).
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        # Normalize to arrays so that ``[idx]`` in __getitem__ is always
        # positional. A pandas Series with a shuffled index (e.g. after
        # train_test_split) would otherwise be indexed by label. dtype=object
        # keeps the texts as Python strings instead of a fixed-width array.
        self.texts = np.asarray(texts, dtype=object)
        self.labels = np.asarray(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return it with its label."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer output carries a leading batch dimension of size one
        # (return_tensors='pt' on a single text); flatten it away so the
        # DataLoader can stack samples into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Wrap each split in a TextDataset; the raw NumPy arrays are passed so that
# the dataset indexes samples by position.
train_dataset = TextDataset(
    texts=train_df['text'].values,
    labels=train_df['label'].values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
test_dataset = TextDataset(
    texts=test_df['text'].values,
    labels=test_df['label'].values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

## Training

In [10]:
# AdamW over all model parameters with the configured learning rate.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# NOTE(review): loss_fn is not used by the training loop below, which reads
# the loss from outputs.loss instead.
loss_fn = nn.CrossEntropyLoss()

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [12]:
for epoch in range(EPOCHS):
    # Switch to training mode and restart the running loss for this epoch.
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        # Move the batch tensors onto the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients left over from the previous step, then run the
        # forward pass; passing labels makes the output carry the loss.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        # Backpropagate and update the weights.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}")

  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 1/10, Train Loss: 1.6352


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 2/10, Train Loss: 1.6353


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 3/10, Train Loss: 1.6101


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 4/10, Train Loss: 1.1541


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 5/10, Train Loss: 0.4204


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 6/10, Train Loss: 0.1539


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 7/10, Train Loss: 0.0789


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 8/10, Train Loss: 0.0424


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 9/10, Train Loss: 0.0410


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 10/10, Train Loss: 0.0899


In [13]:
# Class names listed by label id (0..4); the order must match the label
# encoding of the dataset.
class_names = ['Politics', 'Sport', 'Technology', 'Entertainment', 'Business']

# Evaluation mode, with gradient tracking disabled for the whole pass.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        logits = outputs.logits
        # argmax yields the predicted class id directly; unlike
        # `_, preds = torch.max(...)` it does not bind `_`, which IPython
        # uses for the last cell output.
        preds = logits.argmax(dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Passing `labels` explicitly pins each name to its class id, so the report
# stays correctly labelled even if some class never occurs in the test data.
print(classification_report(
    true_labels,
    predictions,
    labels=list(range(len(class_names))),
    target_names=class_names
))

  0%|          | 0/28 [00:00<?, ?it/s]

               precision    recall  f1-score   support

     Politics       0.78      0.99      0.87        84
        Sport       0.94      0.99      0.97       102
   Technology       0.99      0.86      0.92        80
Entertainment       0.86      0.77      0.81        77
     Business       0.96      0.86      0.91       102

     accuracy                           0.90       445
    macro avg       0.90      0.89      0.89       445
 weighted avg       0.91      0.90      0.90       445



               precision    recall  f1-score   support

     Politics       0.78      0.99      0.87        84
     Sport          0.94      0.99      0.97       102
     Technology     0.99      0.86      0.92        80
     Entertainment  0.86      0.77      0.81        77
     Business       0.96      0.86      0.91       102

     accuracy                           0.90       445
     macro avg      0.90      0.89      0.89       445
     weighted avg   0.91      0.90      0.90       445