In [1]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pymupdf, fitz

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [3]:
def get_text_with_links(document):
    result_text = ""
    
    for page_num in range(len(document)):
        page = document.load_page(page_num)

        # Получаем слова и их координаты
        words = page.get_text("words")
        links = page.get_links()

        # Временный словарь для отслеживания последнего индекса слова для каждого URL
        last_occurrence_with_url = {}

        wc_map = {tuple(word[:4]): word[4] for word in words}
        
        for link in links:
            link_rect = fitz.Rect(link["from"])
            uri = link.get('uri', 'Нет ссылки')
            
            for i, word in enumerate(words):
                text = word[4]   # текст слова
                word_rect = fitz.Rect(word[:4])

                # Проверка на пересечение координат
                if word_rect.intersects(link_rect):
                    last_occurrence_with_url[uri] = (i, tuple(word[:4]))

        # Заменяем текст в конечном словаре ссылками на последних вхождениях
        for uri, (index, rect) in last_occurrence_with_url.items():
            wc_map[rect] = f'{wc_map[rect]}:"{uri}"'

        # Сортировка слов по их координатам для правильного порядка
        sorted_words = sorted(wc_map.items(), key=lambda x: (x[0][1], x[0][0]))

        for item in sorted_words:
            result_text += item[1] + " "
    
    return result_text.strip()  # добавляем strip для удаления лишних пробелов в конце

In [4]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large', device_map="cuda")
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large', device_map="cuda")

In [5]:
df = pd.read_csv('./interns_preprocessed/interns_preprocessed.csv').dropna().reset_index(drop=True)

In [6]:
df['text']=None
df['embedding']=None

In [7]:
for i, pdf_file in enumerate(tqdm(df['Резюме'])):
    pdf_file = pdf_file.replace('\\', '\\\\')
    #print(pdf_file)
    try:
        with pymupdf.open(pdf_file) as doc:
            text = get_text_with_links(doc)
    except:
        print('битый файл')
        continue
    
    batch_dict = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt').to('cuda')
    embeddings = average_pool(model(**batch_dict).last_hidden_state, batch_dict['attention_mask'])
    df.loc[i, ['text']] = [text]
    df.loc[i, ['embedding']] = [embeddings.tolist()]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 311/311 [00:11<00:00, 26.71it/s]


In [8]:
df['label'] = df['Hire status']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import SVC
import optuna
import numpy as np

optuna.logging.set_verbosity(optuna.logging.WARNING)

TEST_SIZE = 0.10
SEED = 0
DEFAULT_PARAMS = {
    "max_iter": -1,
    "class_weight": "balanced",
    "random_state": SEED
}

def get_best_params_and_metrics(data: np.ndarray, target: np.ndarray) -> dict:
    class EarlyStoppingCallback:
        def __call__(self, study: optuna.Study, trial: optuna.Trial) -> None:
            if study.best_value == 1.0:
                study.stop()

    def objective(trial):
        params = {
            "C": trial.suggest_float("C", 0, 10000.0, log=False),#True
            "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"]),
            "degree": trial.suggest_int("degree", 1, 10),
            "tol": trial.suggest_float("tol", 1e-5, 0.1, log=False)#True
        }
        params.update(DEFAULT_PARAMS)
        model = SVC(**params)
        model.fit(train_data, train_target)
        predictions = model.predict(eval_data)
        score = f1_score(eval_target, predictions, average='weighted')
        return score

    train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=TEST_SIZE, stratify=target,
                                                                        random_state=SEED)
    eval_data, test_data, eval_target, test_target = train_test_split(test_data, test_target, test_size=0.5, stratify=test_target,
                                                                      random_state=SEED)
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, callbacks=[EarlyStoppingCallback()], n_trials=10000)  # change!
    best_params = study.best_params
    best_params.update(DEFAULT_PARAMS)

    model = SVC(**best_params)
    model.fit(train_data, train_target)
    train_pred = model.predict(train_data).reshape(-1)
    eval_pred = model.predict(eval_data).reshape(-1)
    test_pred = model.predict(test_data).reshape(-1)

    f1_train = f1_score(train_target, train_pred)
    f1_eval = f1_score(eval_target, eval_pred)
    f1_test = f1_score(test_target, test_pred)

    return {
        'best_params': best_params,
        'metrics': {
            'train': f1_train,
            'eval': f1_eval,
            'test': f1_test
        }
    }

In [None]:
data = df['embedding'].apply(lambda x: x[0]).to_list()
target = df['label'].to_numpy()
params_and_metrics = get_best_params_and_metrics(data, target)
params, metrics = params_and_metrics.get('best_params'), params_and_metrics.get('metrics')
model = SVC(**params)
model.fit(data, target)
print(metrics)

[I 2024-12-03 21:26:37,105] A new study created in memory with name: no-name-e59c94d8-8dec-4a4c-91b4-a0e11b8ed8ee
[I 2024-12-03 21:26:37,126] Trial 0 finished with value: 0.375 and parameters: {'C': 9860.897722380687, 'kernel': 'rbf', 'degree': 3, 'tol': 0.06017812414761777}. Best is trial 0 with value: 0.375.
[I 2024-12-03 21:26:37,160] Trial 1 finished with value: 0.43529411764705883 and parameters: {'C': 2685.1646808176656, 'kernel': 'linear', 'degree': 2, 'tol': 0.0014357035353252305}. Best is trial 1 with value: 0.43529411764705883.
[I 2024-12-03 21:26:37,178] Trial 2 finished with value: 0.375 and parameters: {'C': 5225.609575484172, 'kernel': 'rbf', 'degree': 8, 'tol': 0.0968039961085455}. Best is trial 1 with value: 0.43529411764705883.
[I 2024-12-03 21:26:37,208] Trial 3 finished with value: 0.30980392156862746 and parameters: {'C': 2713.8963840880383, 'kernel': 'poly', 'degree': 7, 'tol': 0.0015820424619017257}. Best is trial 1 with value: 0.43529411764705883.
[I 2024-12-03 2

In [16]:
print(metrics)

{'train': 0.7518796992481203, 'eval': 0.7142857142857143, 'test': 0.5}
