In [2]:
import pandas as pd

In [3]:
from tqdm import tqdm
tqdm.pandas()

In [20]:
df = pd.read_csv('res.csv')

In [24]:
df = df.dropna(subset=['description'])

In [25]:
df['description'][9380]

KeyError: 9380

In [26]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

data = df  
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

def summarize_text(text, num_sentences=4):
    if pd.isna(text):
        return np.nan
    
    sentences = sent_tokenize(text, language='russian')
    
    if len(sentences) < num_sentences:
        return text
    
    sentence_embeddings = model.encode(sentences)
    
    similarity_matrix = cosine_similarity(sentence_embeddings)
    sentence_scores = similarity_matrix.sum(axis=1)
    
    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
    top_sentence_indices = sorted(top_sentence_indices)
    
    summary_sentences = [sentences[i].strip() for i in top_sentence_indices if sentences[i].strip()]
    summary = " ".join(summary_sentences)
    
    return summary

tqdm.pandas()

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to
[nltk_data]     /home/maulen_auth0_auth0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
df['extr'] = df['description'].progress_apply(summarize_text)

100%|██████████| 63747/63747 [02:06<00:00, 504.85it/s] 


In [28]:
df['class'].value_counts().sort_index()

1      200
2      198
3      200
4      200
5      198
      ... 
396    200
397    200
398    200
399    200
400    200
Name: class, Length: 324, dtype: int64

In [29]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup as bs

def preprocess(sentence):
    soup = bs(sentence, features="html.parser")
    sentence = soup.get_text()
    soup = bs(sentence, features="html.parser")
    sentence = soup.get_text()

    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace('{html}',"")
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url = re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    filtered_words = [w for w in tokens if not w in stopwords.words('russian')]

    return " ".join(filtered_words)

In [30]:
df['extr'] = df['extr'].apply(lambda x: preprocess(x))

  soup = bs(sentence, features="html.parser")
  soup = bs(sentence, features="html.parser")


In [3]:
import pandas as pd
df = pd.read_csv('train_extr.csv')

In [4]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['class'] = encoder.fit_transform(df['class'])

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [9]:
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=327, output_attentions=True, output_hidden_states=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [8]:
def tokenize_function(description, tokenizer, max_length=512):
    return tokenizer(description.tolist(), padding='max_length', truncation=True, max_length=max_length)

In [9]:
from sklearn.model_selection import train_test_split

X = df['extr']
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [10]:
train_encodings = tokenize_function(X_train, tokenizer)
test_encodings = tokenize_function(X_test, tokenizer)

In [11]:
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

In [12]:
from torch.utils.data import Dataset, DataLoader

class JobDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = JobDataset(train_encodings, y_train)
test_dataset = JobDataset(test_encodings, y_test)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [12]:
from torch.optim import AdamW

In [13]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [4]:
def save_checkpoint(model, optimizer, epoch, path="checkpoint.pth"):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch
    }
    torch.save(checkpoint, path)
    print(f"Сохранен snapshot на эпохе {epoch} в {path}")

In [5]:
def load_checkpoint(path, model, optimizer):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Загружен snapshot с {epoch}-й эпохи")
    return epoch

In [15]:
from tqdm import tqdm

def train_model(model, train_loader, test_loader, optimizer, device, num_epochs=10, accumulation_steps=4):
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        train_loader_tqdm = tqdm(train_loader, desc=f"Эпоха {epoch+1}/{num_epochs} - Тренировка", leave=False)

        optimizer.zero_grad()  

        for i, batch in enumerate(train_loader_tqdm):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss / accumulation_steps  
            loss.backward()  
            
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()  
                optimizer.zero_grad()  

            total_train_loss += loss.item() * accumulation_steps  

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        model.eval()
        total_val_loss = 0

        test_loader_tqdm = tqdm(test_loader, desc=f"Эпоха {epoch+1}/{num_epochs} - Валидация", leave=False)
        with torch.no_grad():
            for batch in test_loader_tqdm:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(test_loader)
        val_losses.append(avg_val_loss)

        print(f"Эпоха {epoch+1}/{num_epochs} | Тренировочная потеря: {avg_train_loss:.4f} | Валид потеря: {avg_val_loss:.4f}")

    return train_losses, val_losses

In [27]:
from tqdm import tqdm

def train_model(model, train_loader, test_loader, optimizer, device, num_epochs=327, gradient_accumulation_steps=4, save_path="checkpoint.pth"):
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        optimizer.zero_grad()  

        train_loader_tqdm = tqdm(train_loader, desc=f"Эпоха {epoch+1}/{num_epochs} - Тренировка", leave=False)

        for step, batch in enumerate(train_loader_tqdm):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps  
            total_train_loss += loss.item()

            loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()  

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        model.eval()
        total_val_loss = 0

        test_loader_tqdm = tqdm(test_loader, desc=f"Эпоха {epoch+1}/{num_epochs} - Валидация", leave=False)
        with torch.no_grad():
            for batch in test_loader_tqdm:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(test_loader)
        val_losses.append(avg_val_loss)

        print(f"Эпоха {epoch+1}/{num_epochs} | Тренировочная потеря: {avg_train_loss:.4f} | Валид потеря: {avg_val_loss:.4f}")
        
        save_checkpoint(model, optimizer, epoch+1, path=f"{save_path}_epoch_{epoch+1}.pth")

    return train_losses, val_losses

In [28]:
train_losses, val_losses = train_model(model, train_loader, test_loader, optimizer, device)

                                                                              

Эпоха 1/12 | Тренировочная потеря: 0.4377 | Валид потеря: 1.3875
Сохранен snapshot на эпохе 1 в checkpoint.pth_epoch_1.pth


                                                                              

Эпоха 2/12 | Тренировочная потеря: 0.2778 | Валид потеря: 1.1117
Сохранен snapshot на эпохе 2 в checkpoint.pth_epoch_2.pth


                                                                              

Эпоха 3/12 | Тренировочная потеря: 0.2049 | Валид потеря: 1.0285


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 1386434560 vs 1386434452

In [24]:
save_directory = "../../snapshoot/model.pth"

In [35]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

model_path = 'checkpoint.pth_epoch_2.pth'

model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=327)  
optimizer = AdamW(model.parameters(), lr=5e-5)  

def load_checkpoint(path, model, optimizer):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Загружен snapshot с {epoch}-й эпохи")
    return epoch

try:
    epoch = load_checkpoint(model_path, model, optimizer)
    model.eval()  
except Exception as e:
    print(f"Ошибка при загрузке контрольной точки: {e}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load(path)


Загружен snapshot с 2-й эпохи
