In [1]:
import os
import sys
import mlflow
import pandas as pd
import json
from langchain_ollama import ChatOllama

from codecarbon import EmissionsTracker
import mlflow
from typing import List
import ollama
from functools import lru_cache
import numpy as np
from mlflow.entities import Feedback

sys.path.append(os.path.abspath(".."))

from recruitair.job_offers.models import KeyCriteriaResponse

# Process-wide configuration, set through environment variables before any
# tracker or MLflow run is created further down in the notebook.
os.environ["codecarbon_log_level"] = "WARNING"  # Only WARNING and above: silences most codecarbon log output
# Turn on MLflow's system-metrics monitor for the runs started in this notebook.
os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
# All runs, params and metrics below are sent to this remote tracking server.
mlflow.set_tracking_uri("https://ml-4cb370e118ec407c83eed254868ebce1.ecs.eu-north-1.on.aws/")

In [2]:
# Prompt under evaluation. `predict` loads it through the URI
# "prompts:/<MLFLOW_PROMPT_NAME>/<MLFLOW_PROMPT_VERSION>".
MLFLOW_PROMPT_NAME = "one-shot-long-descriptions"
MLFLOW_PROMPT_VERSION = 1
# Ollama chat model and its tag. `predict` joins them as "<model>:<version>".
OLLAMA_MODEL = "dolphin3"
OLLAMA_MODEL_VERSION = "8b"

In [3]:
# Load the preprocessed job offers (one JSON object per line) and split every
# record into the model input and the expected key criteria.
job_offers = []
criteria = []
# The encoding is explicit because the platform default (e.g. cp1252 on
# Windows) would garble or reject non-ASCII characters in the descriptions.
with open("../data/interim/preprocessed_jobs.jsonl", "r", encoding="utf-8") as f:
    # Stream the file line by line instead of materializing it with readlines().
    for line in f:
        if not line.strip():
            continue  # Tolerate blank lines instead of crashing in json.loads
        job_offer = json.loads(line)
        job_offers.append({"job_offer_text": job_offer["job_description"]})
        criteria.append({"key_criteria": job_offer["criteria"]})

# One row per job offer: "inputs" holds the keyword arguments for the prediction
# function, "expectations" holds the ground truth compared against its output.
data = pd.DataFrame({"inputs": job_offers, "expectations": criteria})

In [4]:
def predict(job_offer_text: str) -> KeyCriteriaResponse:
    """Extract the key criteria of a single job offer with the configured LLM.

    Args:
        job_offer_text: Text of the job offer, substituted into the prompt's
            ``job_offer_text`` placeholder.

    Returns:
        The model's structured answer validated as a ``KeyCriteriaResponse``.
    """
    model_tag = f"{OLLAMA_MODEL}:{OLLAMA_MODEL_VERSION}"
    prompt_uri = f"prompts:/{MLFLOW_PROMPT_NAME}/{MLFLOW_PROMPT_VERSION}"

    # temperature=0 is the value also logged as the run's "temperature" param.
    chat_model = ChatOllama(model=model_tag, temperature=0)
    template = mlflow.genai.load_prompt(prompt_uri)

    # Constrain the answer to the response format stored alongside the prompt.
    structured_model = chat_model.with_structured_output(template.response_format, method="json_schema")
    rendered_prompt = template.format(job_offer_text=job_offer_text)
    raw_answer = structured_model.invoke(rendered_prompt)
    return KeyCriteriaResponse.model_validate(raw_answer)

In [5]:
# Both scorers request embeddings for the same criterion names, so the cache is
# sized well above the 128-entry default to keep entries from being evicted
# between the two requests.
@lru_cache(maxsize=4096)
def get_embedding(text: str) -> np.ndarray:
    """Return the "mxbai-embed-large:335m" embedding of ``text``, memoized per text.

    Args:
        text: The string to embed (must be hashable, as it is the cache key).

    Returns:
        A 1-D, read-only array holding the first embedding returned by
        ``ollama.embed``. The same array object is returned on every cache hit.
    """
    embedding = np.array(ollama.embed("mxbai-embed-large:335m", input=text)["embeddings"][0])
    # The cached array is shared between all callers. Freezing it turns an
    # accidental in-place edit into an error instead of a corrupted cache entry.
    embedding.setflags(write=False)
    return embedding

In [6]:
@mlflow.genai.scorer(name="target-recall/embedding/mxbai-embed-large:335m")
def target_recall(outputs: KeyCriteriaResponse, expectations: KeyCriteriaResponse) -> Feedback:
    """Score how well the extracted criteria cover the target criteria.

    Each target criterion is paired with its most similar extracted criterion
    (by embedding similarity of the criterion names). Similarities below 0.8
    count as 0, and the score is the mean over all target criteria.

    Args:
        outputs: The extracted criteria (a ``KeyCriteriaResponse`` or data that
            validates as one).
        expectations: The target criteria (a ``KeyCriteriaResponse`` or data
            that validates as one).

    Returns:
        A ``Feedback`` whose value is the target recall score as a float. It is
        0.0 when nothing was extracted.

    Raises:
        ValueError: If ``expectations`` contains no key criteria, because the
            recall is then undefined.
    """
    expectations = KeyCriteriaResponse.model_validate(expectations)
    # Validate `outputs` the same way as `expectations`, so the scorer also
    # works if the prediction arrives in serialized (dict) form.
    outputs = KeyCriteriaResponse.model_validate(outputs)

    if not expectations.key_criteria:
        raise ValueError("target recall is undefined: the expectations contain no key criteria")
    if not outputs.key_criteria:
        # Nothing was extracted, so no target criterion can have been matched.
        return Feedback(value=0.0)

    # Embed the names of all the target and extracted criteria (one row each):
    target_embeddings = np.stack([get_embedding(criterion.name) for criterion in expectations.key_criteria])
    response_embeddings = np.stack([get_embedding(criterion.name) for criterion in outputs.key_criteria])

    # Pairwise inner products, shape (n_responses, n_targets). These equal
    # cosine similarities only for unit-norm embeddings.
    # NOTE(review): assumes the embedding endpoint returns normalized vectors - confirm.
    similarity_matrix = np.inner(response_embeddings, target_embeddings)

    # For each target criterion, keep the similarity of its best-matching
    # response criterion. Anything below 0.8 is floored to 0, so a poorly
    # matched target contributes nothing to the average. The result can be read
    # as the "rich fraction" of target criteria that were well matched.
    scores = similarity_matrix.max(axis=0)
    target_recall_score = float(np.where(scores < 0.8, 0, scores).mean())
    return Feedback(value=target_recall_score)

In [7]:
@mlflow.genai.scorer(name="importance-mse/embedding/mxbai-embed-large:335m")
def importance_mse(outputs: KeyCriteriaResponse, expectations: KeyCriteriaResponse) -> Feedback:
    """Mean squared error between extracted and target criterion importances.

    Each extracted criterion is paired with its most similar target criterion
    (by embedding similarity of the criterion names). When the best similarity
    is below 0.5 the criterion counts as unmatched and contributes a fixed
    penalty of ``100**2``. Otherwise it contributes the squared difference
    between its importance and the matched target's importance.

    Args:
        outputs: The extracted criteria (a ``KeyCriteriaResponse`` or data that
            validates as one).
        expectations: The target criteria (a ``KeyCriteriaResponse`` or data
            that validates as one).

    Returns:
        A ``Feedback`` whose value is the mean of the per-criterion squared
        errors, as a float.

    Raises:
        ValueError: If either side contains no key criteria, because there is
            then nothing to match or average over.
    """
    expectations = KeyCriteriaResponse.model_validate(expectations)
    # Validate `outputs` the same way as `expectations`, so the scorer also
    # works if the prediction arrives in serialized (dict) form.
    outputs = KeyCriteriaResponse.model_validate(outputs)

    if not outputs.key_criteria:
        raise ValueError("importance MSE is undefined: the response contains no key criteria")
    if not expectations.key_criteria:
        raise ValueError("importance MSE is undefined: the expectations contain no key criteria")

    # Embed the names of all the target and extracted criteria (one row each):
    target_embeddings = np.stack([get_embedding(criterion.name) for criterion in expectations.key_criteria])
    response_embeddings = np.stack([get_embedding(criterion.name) for criterion in outputs.key_criteria])

    # Pairwise inner products, shape (n_responses, n_targets). These equal
    # cosine similarities only for unit-norm embeddings.
    # NOTE(review): assumes the embedding endpoint returns normalized vectors - confirm.
    similarity_matrix = np.inner(response_embeddings, target_embeddings)

    # Most similar target for each response, and whether that match is too weak to count:
    response_to_target: np.ndarray = similarity_matrix.argmax(axis=1)
    unmatched = similarity_matrix.max(axis=1) < 0.5

    squared_errors = []
    for response_criterion, target_index, is_unmatched in zip(outputs.key_criteria, response_to_target, unmatched):
        if is_unmatched:
            # Penalize completely unmatched criteria.
            # NOTE(review): 100**2 presumably reflects a 0-100 importance scale - confirm.
            squared_errors.append(100**2)
            continue
        target_criterion = expectations.key_criteria[int(target_index)]
        squared_errors.append((response_criterion.importance - target_criterion.importance) ** 2)

    mse = float(sum(squared_errors) / len(squared_errors))
    return Feedback(value=mse)

In [None]:
# Evaluate the configured prompt/model pair on the whole dataset inside a single
# MLflow run, while measuring the emissions of this process with codecarbon.
mlflow.langchain.autolog()
mlflow.set_experiment(f"criteria-extraction/{MLFLOW_PROMPT_NAME}")

# Tracker settings are defined once so the logged params cannot drift from the
# values the tracker is actually built with.
measure_power_secs = 1
tracking_mode = "process"

with mlflow.start_run(
    run_name=f"eval-{MLFLOW_PROMPT_NAME}-v{MLFLOW_PROMPT_VERSION}-{OLLAMA_MODEL}-v{OLLAMA_MODEL_VERSION}"
) as run:

    mlflow.log_params(
        {
            "emissions-tracker/measure-power-secs": measure_power_secs,
            "emissions-tracker/tracking-mode": tracking_mode,
            "ollama_model": OLLAMA_MODEL,
            "ollama_model_version": OLLAMA_MODEL_VERSION,
            "temperature": 0,  # Mirrors the temperature hard-coded in `predict`
            "mlflow_prompt_name": MLFLOW_PROMPT_NAME,
            "mlflow_prompt_version": MLFLOW_PROMPT_VERSION,
        }
    )

    tracker = EmissionsTracker(
        measure_power_secs=measure_power_secs, tracking_mode=tracking_mode, save_to_file=False
    )
    tracker.start()
    try:
        mlflow.genai.evaluate(
            predict_fn=predict,
            data=data,
            scorers=[
                target_recall,
                importance_mse,
            ],
        )
    finally:
        # Always stop the tracker, even if the evaluation fails, so it does not
        # keep measuring after the cell has errored out.
        tracker.stop()

    # Only the numeric fields of the emissions report can be logged as metrics.
    all_metrics = tracker.final_emissions_data.values
    num_metrics = {f"emissions-tracker/{k}": v for k, v in all_metrics.items() if isinstance(v, (int, float))}
    mlflow.log_metrics(num_metrics, run_id=run.info.run_id)

2025/12/03 20:32:12 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/12/03 20:32:12 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
 Windows OS detected: Please install Intel Power Gadget to measure CPU

2025/12/03 20:32:15 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2025/12/03 20:32:15 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset. To disable this check, set the MLFLOW_GENAI_EVAL_SKIP_TRACE_VALIDATION environment variable to True.
  from .autonotebook import tqdm as notebook_tqdm
  return func(*args, **kwargs)
Evaluating:   0%|          | 0/849 [Elapsed: 00:00, Remaining: ?] 