In [3]:
import os
import sys
import mlflow

# Make the parent of the current working directory importable so the
# `recruitair` package resolves when the notebook is run from its own folder.
sys.path.append(os.path.abspath(".."))

from recruitair.job_offers.models import KeyCriteriaResponse

# Honour MLFLOW_TRACKING_URI when it is set (and non-empty) so the notebook can
# be pointed at another tracking server without editing code; otherwise fall
# back to the shared server used so far.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI") or "http://nattech.fib.upc.edu:40380/")

In [4]:
# Turn on MLflow's automatic logging for LangChain before any chain is invoked.
mlflow.langchain.autolog()
# Select the experiment used by the runs started below. Kept as the last
# expression of the cell so the returned Experiment object is displayed.
mlflow.set_experiment("criteria-extraction")

<Experiment: artifact_location='mlflow-artifacts:/900495739893713381', creation_time=1759680270113, experiment_id='900495739893713381', last_update_time=1759680270113, lifecycle_stage='active', name='criteria-extraction', tags={'domain': 'recruitair',
 'mlflow.experimentKind': 'genai_development',
 'task': 'criteria-extraction'}>

In [5]:
# Prompt to load from the MLflow prompt registry: (name, version).
MLFLOW_PROMPT_NAME, MLFLOW_PROMPT_VERSION = "criteria-extraction", 1
# Ollama chat model to evaluate: (model, version); joined later as "model:version".
OLLAMA_MODEL, OLLAMA_MODEL_VERSION = "dolphin3", "8b"

In [64]:
import pandas as pd

# Single hand-written job offer used as the only evaluation example.
sample_job_offer = """
We are looking for a Software Engineer with experience in Python and machine
learning. The ideal candidate should have at least 3 years of experience in
software development, a strong understanding of algorithms and data structures,
and the ability to work in a fast-paced environment. Familiarity with cloud
platforms like AWS or GCP is a plus. Excellent communication skills and the
ability to work in a team are essential.
"""

# Hand-labelled expectation for the offer above as (criterion name, importance) pairs.
_expected_criteria = [
    ("Python Experience", 5),
    ("Machine Learning Knowledge", 5),
    ("3 years Software Development Experience", 5),
    ("Algorithms & Data Structures Understanding", 4),
    ("Cloud Platforms Familiarity", 3),
    ("Communication Skills", 5),
    ("Teamwork Ability", 4),
]

target = {
    "key_criteria": [
        {"name": criterion_name, "importance": criterion_importance}
        for criterion_name, criterion_importance in _expected_criteria
    ]
}

# One-row evaluation dataset: the keyword arguments for the prediction function
# under "inputs" and the labelled target under "expectations".
_input_rows = [{"job_offer_text": sample_job_offer}]
_expectation_rows = [target]
data = pd.DataFrame({"inputs": _input_rows, "expectations": _expectation_rows})

In [53]:
from langchain_ollama import ChatOllama


@mlflow.trace
def predict(job_offer_text: str) -> KeyCriteriaResponse:
    """Extract the key criteria of a job offer with the configured Ollama model.

    Args:
        job_offer_text: Raw text of the job offer, substituted into the prompt.

    Returns:
        The model's structured answer, validated as a ``KeyCriteriaResponse``.
    """
    model_tag = f"{OLLAMA_MODEL}:{OLLAMA_MODEL_VERSION}"
    chat_model = ChatOllama(model=model_tag, temperature=0)

    prompt_uri = f"prompts:/{MLFLOW_PROMPT_NAME}/{MLFLOW_PROMPT_VERSION}"
    registered_prompt = mlflow.genai.load_prompt(prompt_uri)

    # Constrain the model output to the response format stored with the prompt.
    structured_model = chat_model.with_structured_output(registered_prompt.response_format, method="json_schema")
    rendered_prompt = registered_prompt.format(job_offer_text=job_offer_text)
    raw_answer = structured_model.invoke(rendered_prompt)
    return KeyCriteriaResponse.model_validate(raw_answer)

In [62]:
from mlflow.entities import Feedback
import ollama

import numpy as np


# Ollama embedding model used to compare criterion names.
EMBEDDING_MODEL = "mxbai-embed-large:335m"
# Minimum cosine similarity for a target criterion to count as matched.
MIN_MATCH_SIMILARITY = 0.8


def _embed_names(names: list[str]) -> np.ndarray:
    """Embed each name with Ollama and return the vectors as L2-normalised rows.

    ``names`` must be non-empty. Rows are normalised here so that a plain matrix
    product of two results is a true cosine similarity, whether or not the
    embedding backend already returns unit vectors.
    """
    vectors = np.array(
        [ollama.embed(EMBEDDING_MODEL, input=name)["embeddings"][0] for name in names],
        dtype=float,
    )
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Guard against a degenerate all-zero embedding so we never divide by zero.
    return vectors / np.where(norms == 0.0, 1.0, norms)


@mlflow.genai.scorer(name=f"target-recall/embedding/{EMBEDDING_MODEL}")
def target_recall(outputs: KeyCriteriaResponse, expectations: KeyCriteriaResponse) -> Feedback:
    """Score how well the extracted criteria cover the target criteria.

    For each target criterion, the most similar extracted criterion is found by
    cosine similarity between the embeddings of their names. Similarities below
    ``MIN_MATCH_SIMILARITY`` are floored to 0 and the per-target values are
    averaged. The result can be read as the "rich fraction" of target criteria
    that were well matched.

    Args:
        outputs: Criteria extracted by the model.
        expectations: Target criteria; validated into a ``KeyCriteriaResponse``
            first, so a plain mapping with the same fields is accepted too.

    Returns:
        A ``Feedback`` whose value is the target recall score. When the model
        extracted no criteria the score is 0.0 and a rationale explains why.

    Raises:
        ValueError: If ``expectations`` contains no key criteria, since recall
            is undefined without targets.
    """
    expectations = KeyCriteriaResponse.model_validate(expectations)
    target_names = [criterion.name for criterion in expectations.key_criteria]
    response_names = [criterion.name for criterion in outputs.key_criteria]

    if not target_names:
        raise ValueError("expectations.key_criteria is empty: target recall is undefined without targets")
    if not response_names:
        # An empty extraction is a legitimate (bad) model outcome: nothing was
        # matched, so score it as 0 instead of failing on an empty matrix.
        return Feedback(
            value=0.0,
            rationale="The model extracted no criteria, so no target criterion could be matched.",
        )

    # Embed the names of all the target and extracted criteria.
    target_embeddings = _embed_names(target_names)
    response_embeddings = _embed_names(response_names)

    # Cosine similarity matrix: rows are extracted criteria, columns are targets.
    similarity_matrix = response_embeddings @ target_embeddings.T

    # Best match per target criterion; anything below the threshold contributes 0.
    best_match = similarity_matrix.max(axis=0)
    target_recall_score = float(np.where(best_match < MIN_MATCH_SIMILARITY, 0, best_match).mean())
    return Feedback(value=target_recall_score)

In [65]:
# Evaluate the prompt/model pair on the sample dataset inside one MLflow run,
# recording the configuration that produced the result as run parameters.
with mlflow.start_run(run_name="prompt-evaluation"):
    run_params = {
        "ollama_model": OLLAMA_MODEL,
        "ollama_model_version": OLLAMA_MODEL_VERSION,
        "temperature": 0,
        "mlflow_prompt_name": MLFLOW_PROMPT_NAME,
        "mlflow_prompt_version": MLFLOW_PROMPT_VERSION,
    }
    for param_key, param_value in run_params.items():
        mlflow.log_param(param_key, param_value)

    mlflow.genai.evaluate(predict_fn=predict, data=data, scorers=[target_recall])

2025/10/05 19:16:49 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset.
Evaluating: 100%|██████████| 1/1 [Elapsed: 00:04, Remaining: 00:00] 
