In [169]:
import pandas as pd
import os
import win32com.client as win32
import unicodedata
from pathlib import Path
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import fitz
from tqdm import tqdm
from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                             roc_auc_score)

In [170]:
# Directory that contains the preprocessed CSV for the new-intern cohort.
# The path is relative, so it resolves against the notebook's working directory.
DATA_ROOT_PATH = Path("../data/preprocessed/new_interns")

In [None]:
# Load the preprocessed table of new interns. Later cells read the resume file
# path from "Резюме" and the reference label from "Hire status".
new_interns_df = pd.read_csv(DATA_ROOT_PATH / "new_interns_preprocessed.csv")

# Fail fast, before the model is loaded and inference starts, if a column that
# the rest of the notebook depends on is missing.
missing_columns = {"Резюме", "Hire status"} - set(new_interns_df.columns)
assert not missing_columns, f"missing expected columns: {missing_columns}"

# Preview only: the frame contains personal data (surname / first name columns).
new_interns_df.head()

In [172]:
# Local checkpoint of the pretrained base model; the tokenizer is loaded from
# the same directory.
BASE_MODEL_PATH = Path("../models/prediction_model/base_model")
# Trained adapter checkpoint that PeftModel loads on top of the base model.
TRAINED_MODEL_PATH = Path("../models/prediction_model/trained_model")

In [173]:
# Load the base sequence-classification model (two output labels) on CPU.
# NOTE(review): the load warning printed by this cell reports that the
# classifier head is absent from the base checkpoint and is randomly
# initialised. Predictions are only meaningful if the adapter checkpoint
# restores the trained head — confirm against the training notebook.
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_PATH,
    num_labels=2,
    device_map="cpu",
)

# A tokenizer has no weights to load or place on a device, so the model-only
# loading options (`use_safetensors`, `device_map`) are not passed here.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

# Attach the trained adapter and explicitly switch to evaluation mode so that
# dropout is disabled during inference. `.eval()` returns the module itself.
prediction_model = PeftModel.from_pretrained(base_model, TRAINED_MODEL_PATH).eval()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ..\models\prediction_model\base_model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [174]:
def predict_interview_outcome(text: str,
                              tokenizer: AutoTokenizer,
                              prediction_model: PeftModel
                              ) -> int:
    """Classify a single text and return the index of the top-scoring class.

    Args:
        text: The text to classify.
        tokenizer: Callable that converts ``text`` into the keyword inputs of
            ``prediction_model``; it is invoked with PyTorch tensors requested
            and with padding and truncation enabled.
        prediction_model: Callable model whose output exposes ``logits``.

    Returns:
        The argmax over the last dimension of the logits, as a Python ``int``.
    """
    encoded = tokenizer(text, truncation=True, padding=True, return_tensors="pt")

    # Pure inference: gradients do not need to be tracked.
    with torch.no_grad():
        logits = prediction_model(**encoded).logits

    # `.item()` only works on a one-element tensor, i.e. one input text.
    return int(logits.argmax(dim=-1).item())

def extract_text_from_pdf(filepath: str | Path) -> str:
    pdf_document = fitz.open(filepath)
    text = ""

    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()

    return text

In [None]:
# Create the "Prediction" column up front, initialised to None (no predictions yet).
new_interns_df["Prediction"] = None
# Display the frame to confirm the new column is present.
new_interns_df

In [None]:
# Run the model on every resume and collect one predicted class per row.
# Iterating the column itself (instead of `iterrows()`, a generator without a
# length) lets tqdm infer the total, so the bar shows real progress and an ETA.
predictions = [
    predict_interview_outcome(
        # Resume paths stored in the table are relative to ../data/preprocessed.
        extract_text_from_pdf(f"../data/preprocessed/{resume_path}"),
        tokenizer,
        prediction_model,
    )
    for resume_path in tqdm(new_interns_df["Резюме"], desc="Predicting")
]

# Assign the whole column in one step; the list is in row order, so values line
# up with the rows they were computed from.
new_interns_df["Prediction"] = predictions

new_interns_df

In [177]:
# Make sure the predictions are stored with an integer dtype before scoring.
new_interns_df["Prediction"] = new_interns_df["Prediction"].astype(int)
# Print column dtypes and non-null counts as a quick sanity check.
new_interns_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Фамилия      85 non-null     object
 1   Имя          85 non-null     object
 2   Резюме       85 non-null     object
 3   Hire status  85 non-null     int64 
 4   Prediction   85 non-null     int32 
dtypes: int32(1), int64(1), object(3)
memory usage: 3.1+ KB


In [178]:
def compute_metrics(labels, preds):
    """Score hard class predictions against reference labels.

    Precision, recall and F1 are support-weighted averages over the classes;
    with this averaging, recall is numerically equal to accuracy. ROC AUC is
    computed from ``preds`` as given, i.e. from predicted labels rather than
    from continuous scores.

    Args:
        labels: Reference class labels.
        preds: Predicted class labels, aligned with ``labels``.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'`` and ``'roc auc'``.
    """
    weighted = {"average": "weighted"}
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
        'f1': f1_score(labels, preds, **weighted),
        'roc auc': roc_auc_score(labels, preds),
    }

In [179]:
# Reference labels used for scoring, taken from the "Hire status" column.
y_true = new_interns_df["Hire status"]
# Model output for the same rows, taken from the "Prediction" column.
y_pred = new_interns_df["Prediction"]

In [180]:
# Score the predictions against the reference labels.
metrics = compute_metrics(y_true, y_pred)

# Report every metric with four decimals, one per line, in a fixed order.
for label, key in (
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1", "f1"),
    ("ROC AUC", "roc auc"),
):
    print(f"{label}: {metrics[key]:.4f}")

Accuracy: 0.4941
Precision: 0.5660
Recall: 0.4941
F1: 0.5049
ROC AUC: 0.5163
