In [1]:
import pandas as pd

In [2]:
df = pd.read_excel('dataset/df_3_final_realy_really.xlsx')

In [3]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['class'] = encoder.fit_transform(df['class'])

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [5]:
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=349, output_attentions=True, output_hidden_states=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [12]:
def tokenize_function(description, tokenizer, max_length=512):
    return tokenizer(description.tolist(), padding='max_length', truncation=True, max_length=max_length)

In [13]:
from sklearn.model_selection import train_test_split

X = df['description']
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [14]:
df = df.dropna(subset=['description'])

In [15]:
train_encodings = tokenize_function(X_train, tokenizer)
test_encodings = tokenize_function(X_test, tokenizer)

In [16]:
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

In [17]:
from torch.utils.data import Dataset, DataLoader

class JobDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = JobDataset(train_encodings, y_train)
test_dataset = JobDataset(test_encodings, y_test)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [18]:
from torch.optim import AdamW

In [19]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [20]:
def save_checkpoint(model, optimizer, epoch, path="checkpoint.pth"):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch
    }
    torch.save(checkpoint, path)
    print(f"Сохранен snapshot на эпохе {epoch} в {path}")

In [21]:
def load_checkpoint(path, model, optimizer):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Загружен snapshot с {epoch}-й эпохи")
    return epoch

In [24]:
from tqdm import tqdm

def train_model(model, train_loader, test_loader, optimizer, device, num_epochs=10, gradient_accumulation_steps=4, save_path="checkpoint.pth"):
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        optimizer.zero_grad()  

        train_loader_tqdm = tqdm(train_loader, desc=f"Эпоха {epoch+1}/{num_epochs} - Тренировка", leave=False)

        for step, batch in enumerate(train_loader_tqdm):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps  
            total_train_loss += loss.item()

            loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()  

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        model.eval()
        total_val_loss = 0

        test_loader_tqdm = tqdm(test_loader, desc=f"Эпоха {epoch+1}/{num_epochs} - Валидация", leave=False)
        with torch.no_grad():
            for batch in test_loader_tqdm:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(test_loader)
        val_losses.append(avg_val_loss)

        print(f"Эпоха {epoch+1}/{num_epochs} | Тренировочная потеря: {avg_train_loss:.4f} | Валид потеря: {avg_val_loss:.4f}")
        
        save_checkpoint(model, optimizer, epoch+1, path=f"{save_path}_epoch_{epoch+1}.pth")

    return train_losses, val_losses

In [None]:
train_losses, val_losses = train_model(model, train_loader, test_loader, optimizer, device)

                                                                              

Эпоха 1/10 | Тренировочная потеря: 0.3550 | Валид потеря: 1.2152
Сохранен snapshot на эпохе 1 в checkpoint.pth_epoch_1.pth


                                                                              

Эпоха 2/10 | Тренировочная потеря: 0.2205 | Валид потеря: 0.8810
Сохранен snapshot на эпохе 2 в checkpoint.pth_epoch_2.pth


                                                                              

Эпоха 3/10 | Тренировочная потеря: 0.1517 | Валид потеря: 0.7754
Сохранен snapshot на эпохе 3 в checkpoint.pth_epoch_3.pth


                                                                              

Эпоха 4/10 | Тренировочная потеря: 0.1199 | Валид потеря: 0.7486
Сохранен snapshot на эпохе 4 в checkpoint.pth_epoch_4.pth


                                                                              

Эпоха 5/10 | Тренировочная потеря: 0.1016 | Валид потеря: 0.7253
Сохранен snapshot на эпохе 5 в checkpoint.pth_epoch_5.pth


                                                                              

Эпоха 6/10 | Тренировочная потеря: 0.0893 | Валид потеря: 0.7220
Сохранен snapshot на эпохе 6 в checkpoint.pth_epoch_6.pth


                                                                              

Эпоха 7/10 | Тренировочная потеря: 0.0812 | Валид потеря: 0.7230
Сохранен snapshot на эпохе 7 в checkpoint.pth_epoch_7.pth


                                                                              

Эпоха 8/10 | Тренировочная потеря: 0.0753 | Валид потеря: 0.7127
Сохранен snapshot на эпохе 8 в checkpoint.pth_epoch_8.pth


Эпоха 9/10 - Тренировка:  86%|████████▋ | 6009/6948 [1:05:59<10:28,  1.50it/s]

In [None]:
start_epoch = load_checkpoint(model_path, model, optimizer)

additional_epochs = 2
total_epochs = start_epoch + additional_epochs

train_losses, val_losses = train_model(
    model,
    train_loader,
    test_loader,
    optimizer,
    device,
    num_epochs=total_epochs,
    save_path="checkpoint10.pth"
)


  checkpoint = torch.load(path)


Загружен snapshot с 9-й эпохи


Эпоха 1/11 - Тренировка:  92%|█████████▏| 6390/6948 [1:10:18<06:09,  1.51it/s]

In [132]:
model_path = 'checkpoint.pth_epoch_9.pth'

In [133]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader, device):
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, average='weighted')  
    recall = recall_score(all_labels, all_predictions, average='weighted')       
    f1 = f1_score(all_labels, all_predictions, average='weighted')                

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')



In [134]:
evaluate_model(model, test_loader, device)

Accuracy: 0.7996
Precision: 0.8039
Recall: 0.7996
F1-score: 0.7942


  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
df_test = pd.read_excel('dataset/45k_test.xlsx')

In [80]:
df_test

Unnamed: 0,id,name,description,date,to_be_classified,nkz_id,profession_id
0,12308,Преподаватель-учитель Турецкого языка,<p><strong>О нас: Образовательный центр ZEYN Q...,2023-11-09,True,,
1,12443,Ассистент Декана (Высшая школа образования),<ol> <li>содействие в планировании рабочего дн...,2023-10-24,True,,
2,12318,Администратор,"Обязанности: работа с пациентами, консультация...",2023-10-22,False,,
3,451830,Администратор торгового зала в автосалон,<p><strong>ASTER </strong>- первый автосуперма...,2023-05-24,False,,
4,12333,Администратор магазина,<p>Fix Price - международная сеть магазинов дл...,2023-11-22,False,,
...,...,...,...,...,...,...,...
45736,2717672,Преподаватель корейского языка,<p><strong>Обязанности:</strong></p><p>Обучени...,2024-03-22,True,,
45737,2717706,Оператор интернет-магазина,<p><b><b>Обязанности:</b></b></p><p><b>• Прием...,2024-03-21,True,,
45738,2717712,Администратор в Lounge bar,"<p><em>Приветствуем всех кандидатов , ищем в н...",2024-03-21,False,,
45739,2717729,Администратор учебного центра,<p>В PREP School мы помогаем подтянуть уровень...,2024-03-21,False,,


In [81]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup as bs

In [82]:
def preprocess(sentence):
    soup = bs(sentence, features="html.parser")
    sentence = soup.get_text()
    soup = bs(sentence, features="html.parser")
    sentence = soup.get_text()

    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace('{html}',"")
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url = re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    filtered_words = [w for w in tokens if not w in stopwords.words('russian')]

    return " ".join(filtered_words)

In [83]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [84]:
df_test['description'] = df_test['description'].progress_apply(lambda x: preprocess(x))

  0%|          | 0/45741 [00:00<?, ?it/s]

  soup = bs(sentence, features="html.parser")
  soup = bs(sentence, features="html.parser")


In [106]:
model_path = 'checkpoint.pth_epoch_9.pth'

In [107]:
from tqdm import tqdm
import torch
import torch.nn.functional as F
import pandas as pd


def tokenize_descriptions(descriptions, tokenizer, max_length=512):
    return tokenizer(descriptions.tolist(), padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")

encodings = tokenize_descriptions(df_test['description'], tokenizer)

class DescriptionDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

description_dataset = DescriptionDataset(encodings)
description_loader = DataLoader(description_dataset, batch_size=16, shuffle=False)

def predict(model, data_loader, device):
    model.eval()
    predictions = []
    probabilities = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Предсказания"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            probs = F.softmax(logits, dim=-1)
            
            predicted_classes = torch.argmax(probs, dim=-1)
            
            predictions.extend(predicted_classes.cpu().numpy())
            probabilities.extend(probs.cpu().numpy())
    
    return predictions, probabilities
epoch = load_checkpoint(model_path, model, optimizer)
predicted_classes, predicted_probabilities = predict(model, description_loader, device)

df_test['predicted_class'] = predicted_classes
df_test['predicted_probability'] = [max(prob) for prob in predicted_probabilities]

  checkpoint = torch.load(path)


Загружен snapshot с 9-й эпохи


Предсказания: 100%|██████████| 2859/2859 [21:06<00:00,  2.26it/s]


In [111]:
sample = df_test.groupby(['predicted_class']).mean('predicted_probability')

In [112]:
sample.to_excel('test.xlsx')

In [105]:
df_test[df_test['predicted_class']==332]

Unnamed: 0,id,name,description,date,to_be_classified,nkz_id,profession_id,predicted_class,predicted_probability


In [114]:
df_test[df_test['predicted_class']==311]

Unnamed: 0,id,name,description,date,to_be_classified,nkz_id,profession_id,predicted_class,predicted_probability
40094,1471811,Системный администратор (интернет картография),обязанности поддержка картографических служб б...,2019-01-15,False,,,311,0.913054


In [113]:
df_test.head(2)

Unnamed: 0,id,name,description,date,to_be_classified,nkz_id,profession_id,predicted_class,predicted_probability
0,12308,Преподаватель-учитель Турецкого языка,образовательный центр zeyn qoi это место учатс...,2023-11-09,True,,,16,0.171699
1,12443,Ассистент Декана (Высшая школа образования),содействие планировании рабочего дня декана ве...,2023-10-24,True,,,137,0.576031


In [99]:
dfa = pd.read_excel('dataset/df_3_final_realy_really.xlsx')

In [101]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
dfa['label'] = encoder.fit_transform(dfa['class'])

In [130]:
dfa[dfa['class']==4]

In [131]:
dfa[dfa['label']==247]

Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.2,Unnamed: 0,name,description,class,id,Unnamed: 0.1,Unnamed: 0.3,Unnamed: 0.4,label
37017,38594.0,43518.0,31699.0,32985.0,Юрист юридической службы группы компаний,группа компаний dolce это холдинговая структур...,297,100080,,,,247
37018,38595.0,43519.0,31700.0,32986.0,Ведущий юрист в Банк,обязанности юридическое курирование вопросов ф...,297,100151,,,,247
37019,38596.0,43520.0,31701.0,32987.0,Юрист по разработке проектов НПА,важный социально экологический проект временно...,297,100583,,,,247
37020,38597.0,43521.0,31702.0,32988.0,Помощник юриста со знанием английского языка,требуемый опыт работы опыта полная занятость п...,297,100670,,,,247
37021,38598.0,43522.0,31703.0,32989.0,Помощник старшего юриста,обязанности поиск юридической информации темат...,297,100678,,,,247
...,...,...,...,...,...,...,...,...,...,...,...,...
37209,38786.0,43710.0,31891.0,33180.0,Юрист в строительную компанию,обязанности заключение договоров подрядчиками ...,297,10500,,,,247
37210,38787.0,43711.0,31892.0,33181.0,Юрист в строительную компанию,обязанности работа местными государственными о...,297,10500,,,,247
37211,38788.0,43712.0,31893.0,33182.0,Юрист в строительную компанию,обязанности составлять договора аренды купли п...,297,10500,,,,247
37212,38789.0,43713.0,31894.0,33183.0,Юрист в строительную компанию,работа успешной турецкой компании цели должнос...,297,10500,,,,247


In [None]:
from huggingface_hub import upload_file

upload_file(
    "<path_to_file>/config.json",
    path_in_repo="config.json",
    repo_id="<namespace>/dummy-model",
)