## Imports

In [17]:
import transformers
print(transformers.__version__)

4.51.2


In [32]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [19]:
MAX_LENGTH = 128
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
NUM_CLASSES = 5  # 5 классов: Politics 0, Sport 1, Technology 2, Entertainment 3, Business 4

## Dataset

In [20]:
df = pd.read_csv("./dataset/df_file.csv")
df.columns = ["text", "label"]

In [21]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
print(train_df.shape, test_df.shape)

(1780, 2) (445, 2)


In [22]:
model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [24]:
train_dataset = TextDataset(
    train_df['text'].values,
    train_df['label'].values,
    tokenizer,
    MAX_LENGTH
)

test_dataset = TextDataset(
    test_df['text'].values,
    test_df['label'].values,
    tokenizer,
    MAX_LENGTH
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

## Model

In [25]:
base_model = AutoModel.from_pretrained(model_name)

In [26]:
class BertForClassification(nn.Module):
    def __init__(self, base_model, num_classes):
        super().__init__()
        self.bert = base_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

model = BertForClassification(base_model, NUM_CLASSES)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [27]:
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)
        
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

print(classification_report(true_labels, predictions, target_names=['Politics', 'Sport', 'Technology', 'Entertainment', 'Business']))

  0%|          | 0/28 [00:00<?, ?it/s]

               precision    recall  f1-score   support

     Politics       0.00      0.00      0.00        84
        Sport       0.08      0.14      0.10       102
   Technology       0.00      0.00      0.00        80
Entertainment       0.21      0.60      0.32        77
     Business       0.14      0.07      0.09       102

     accuracy                           0.15       445
    macro avg       0.09      0.16      0.10       445
 weighted avg       0.09      0.15      0.10       445



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Training

In [28]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

In [29]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}")

  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 1/3, Train Loss: 0.4192


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 2/3, Train Loss: 0.0463


  0%|          | 0/112 [00:00<?, ?it/s]

Epoch 3/3, Train Loss: 0.0237


In [31]:
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)
        
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

print(classification_report(true_labels, predictions, target_names=['Politics', 'Sport', 'Technology', 'Entertainment', 'Business']))

  0%|          | 0/28 [00:00<?, ?it/s]

               precision    recall  f1-score   support

     Politics       0.99      0.99      0.99        84
        Sport       0.99      0.99      0.99       102
   Technology       0.92      0.97      0.95        80
Entertainment       0.97      0.97      0.97        77
     Business       0.97      0.92      0.94       102

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445



               precision    recall  f1-score   support

    Politics       0.99      0.99      0.99        84
    Sport          0.99      0.99      0.99       102
    Technology     0.92      0.97      0.95        80
    Entertainment  0.97      0.97      0.97        77
    Business       0.97      0.92      0.94       102

    accuracy                           0.97       445
    macro avg      0.97      0.97      0.97       445
    weighted avg   0.97      0.97      0.97       445