In [1]:
import os
import re

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [7]:
# Text-cleaning helper
def clean_text(text):
    """Normalize raw text before tokenization.

    Every character that is not a Cyrillic letter, a Latin letter or a digit
    is replaced with a space (one space per removed character; runs of
    spaces are not collapsed), and the result is lower-cased.

    Args:
        text: Input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # `ё`/`Ё` lie outside the contiguous `а-я`/`А-Я` code-point ranges, so
    # they are listed explicitly; otherwise they would be turned into spaces
    # and split the words that contain them.
    cleaned_text = re.sub(r'[^а-яА-ЯёЁa-zA-Z0-9]', ' ', text)
    # Lower-case the text
    return cleaned_text.lower()

In [8]:
# Load the data
data = pd.read_csv('sample.csv')
labels_dict = {'proxy': 0, 'contract': 1, 'act': 2, 'application': 3, 'order': 4, 
               'invoice': 5, 'bill': 6, 'arrangement': 7, 'contract offer': 8, 
               'statute': 9, 'determination': 10}
data['labels'] = data['class'].map(labels_dict)

# `Series.map` yields NaN for any class missing from `labels_dict`, which
# silently turns the whole label column into floats. Fail fast here instead
# of at loss computation time.
unknown_classes = data.loc[data['labels'].isna(), 'class'].unique()
if len(unknown_classes) > 0:
    raise ValueError(
        f"Values in 'class' column missing from labels_dict: {unknown_classes.tolist()}")
data['labels'] = data['labels'].astype(int)

# Split the data; a fixed seed makes the train/test partition reproducible
# across kernel restarts.
RANDOM_SEED = 42
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'].tolist(), data['labels'].tolist(), test_size=0.1,
    random_state=RANDOM_SEED)

In [9]:
# Tokenization and input preparation
# Checkpoint identifier; the same name is later passed to the model's
# `from_pretrained`, so tokenizer and model come from one checkpoint.
model_name = "DeepPavlov/rubert-base-cased"
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def prepare_data(texts, max_length=512, padding="max_length"):
    """Clean and tokenize a batch of texts.

    Args:
        texts: Iterable of raw strings; each one is passed through
            `clean_text` before tokenization.
        max_length: Maximum sequence length handed to the tokenizer; longer
            inputs are truncated (`truncation=True`). Defaults to 512, the
            previously hard-coded value.
        padding: Padding strategy forwarded to the tokenizer. The default
            `"max_length"` pads every sequence to `max_length` (the previous
            hard-coded behavior).

    Returns:
        The tokenizer output with PyTorch tensors (`return_tensors="pt"`).
    """
    # Clean the texts
    cleaned_texts = [clean_text(text) for text in texts]
    # Tokenize and build the model inputs
    return tokenizer(
        cleaned_texts,
        padding=padding,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )


class TextDataset(Dataset):
    """Dataset pairing texts with their labels.

    Items are returned unmodified as ``(text, label)`` tuples; nothing is
    cleaned or tokenized inside this class.
    """

    def __init__(self, texts, labels):
        """Store the two parallel containers as given (no copy, no checks)."""
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of items, defined by the texts container."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair at position ``idx``."""
        return self.texts[idx], self.labels[idx]

# Wrap each split in a TextDataset.
train_dataset = TextDataset(texts=train_texts, labels=train_labels)
test_dataset = TextDataset(texts=test_texts, labels=test_labels)

def collate_batch(batch):
    """Collate ``(text, label)`` pairs into model inputs and a label tensor.

    Args:
        batch: Sequence of ``(text, label)`` tuples.

    Returns:
        Tuple ``(inputs, labels)``: the output of `prepare_data` for the
        batch texts, and a 1-D int64 tensor holding the labels.
    """
    texts, labels = zip(*batch)
    # Pin the dtype: without it, labels that arrive as floats (e.g. 1.0)
    # produce a float tensor instead of integer class indices.
    labels = torch.tensor(labels, dtype=torch.long)
    # `zip` yields a tuple; hand the texts over as a list.
    inputs = prepare_data(list(texts))
    return inputs, labels

BATCH_SIZE = 8

# Training batches are reshuffled; the test loader keeps a fixed order.
# Both loaders must use `collate_batch`, otherwise batches contain raw
# strings instead of tokenized tensors. The test loader is therefore defined
# exactly once, here, with `collate_fn` set.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)


In [12]:
# Model
# One output unit per entry of `labels_dict`, so the head always matches the
# label mapping.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(labels_dict))
# NOTE(review): `transformers.AdamW` is deprecated upstream in favor of
# `torch.optim.AdamW`; defaults differ (e.g. weight decay), so verify them
# before migrating.
optimizer = AdamW(model.parameters(), lr=2e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
NUM_EPOCHS = 10
CHECKPOINT_DIR = 'models'
# Create the checkpoint directory up front so the first save cannot fail
# on a missing directory after a full epoch of training.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for epoch in range(NUM_EPOCHS):
    model.train()
    # Loss accumulator is reset at the start of every epoch.
    total_loss = 0
    for inputs, labels in train_loader:
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)
        model.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")
    # Save the full model object (not just the state dict) after each epoch.
    torch.save(model, f'{CHECKPOINT_DIR}/full_model_epoch_{epoch+1}.pt')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not SequenceClassifierOutput