# Evaluation

In [7]:
# `os` is imported in this cell because it runs before the main import cell;
# without the import a fresh-kernel "Run All" stops here with a NameError.
import os

# Keep a key that is already exported in the environment and only fall back
# to the blank placeholder when none is set. Export the real key in the shell
# rather than pasting it here, so it never ends up in version control.
os.environ.setdefault("MISTRAL_API_KEY", "")

In [197]:
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
import json 

def read_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON Lines files are UTF-8 by specification; do not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines (e.g. extra newlines at the end of
        # the file) carry no record and would make json.loads raise.
        return [json.loads(line) for line in file if line.strip()]

def run_mistral(user_message, model="open-mistral-7b", is_json=False):
    """Send one user message to a Mistral chat model and return the reply text.

    Args:
        user_message: Text sent as the single "user" turn of the chat.
        model: Name of the model to query.
        is_json: When true, the request sets response_format to
            {"type": "json_object"} and uses temperature 0.25; otherwise
            temperature 0.2 is used and no response format is set.

    Returns:
        The message content of the first choice in the chat response.
    """
    client = MistralClient(api_key=os.getenv("MISTRAL_API_KEY"))

    # Start from the plain-text request and layer the JSON-mode settings on
    # top when requested.
    request = {
        "model": model,
        "messages": [ChatMessage(role="user", content=user_message)],
        "temperature": 0.2,
    }
    if is_json:
        request["temperature"] = 0.25
        request["response_format"] = {"type": "json_object"}

    reply = client.chat(**request)
    return reply.choices[0].message.content
    
# `_dh` is not defined anywhere in this notebook; it is presumably the
# directory-history list that IPython injects into the namespace, with the
# kernel's starting directory as its first entry — TODO confirm.
# The lookup raises KeyError wherever `_dh` is absent from globals().
current_folder = globals()['_dh'][0]

# Path of the evaluation set, resolved relative to that folder.
file_path = os.path.join(current_folder,"data_eval_mistral_finetuned_lr_0_9e-6_115epochs.jsonl")

# Parsed records of the evaluation file, as returned by read_jsonl.
data_list = read_jsonl(file_path)


In [199]:
# Rubrics handed to the judge model, one entry per evaluation metric.
# Each entry has a "metric" name and a "rubrics" text describing what the
# scores 1 through 4 mean for that metric. The text is inserted as-is into
# the scoring prompt, so editing the wording changes the evaluation itself.
eval_rubrics = [
    {
        "metric": "relevance",
        "rubrics": """
        Score 1: The conversation is not relevant to the mentee's goals or needs.
        Score 2: The conversation is somewhat relevant, but has significant flaws in addressing the mentee's goals or needs.
        Score 3: The conversation is mostly relevant to the mentee's goals or needs and effectively addresses key points.
        Score 4: The conversation is highly relevant and provides additional valuable insights or guidance.
        """
    },
    {
        "metric": "clarity",
        "rubrics": """
        Score 1: The conversation is difficult to understand due to unclear communication.
        Score 2: The conversation is somewhat clear, but has significant flaws in communication.
        Score 3: The conversation is mostly clear and easy to understand.
        Score 4: The conversation is highly clear, with concise and effective communication.
        """
    },
    {
        "metric": "engagement",
        "rubrics": """
        Score 1: The mentee shows little to no engagement in the conversation.
        Score 2: The mentee shows some engagement, but it is limited or inconsistent.
        Score 3: The mentee is mostly engaged and actively participates in the conversation.
        Score 4: The mentee is highly engaged, showing enthusiasm and active participation.
        """
    },
    {
        "metric": "supportiveness",
        "rubrics": """
        Score 1: The mentor provides little to no support or encouragement.
        Score 2: The mentor provides some support, but it is limited or lacks depth.
        Score 3: The mentor is mostly supportive, offering meaningful encouragement and assistance.
        Score 4: The mentor is highly supportive, providing significant encouragement and valuable assistance.
        """
    }
]


In [201]:
def average_ratings_from_strings(ratings_list):
    """Average numeric ratings per category across JSON-encoded rating objects.

    Args:
        ratings_list: Iterable of JSON strings, each expected to decode to an
            object mapping a category name to a score, e.g. '{"clarity": 3}'.

    Returns:
        A dict mapping each category that received at least one numeric score
        to the mean of its scores. Categories with no numeric score are left
        out; an empty input gives an empty dict.

    Raises:
        json.JSONDecodeError: If an entry is not valid JSON.
    """
    categories = {}
    for rating_str in ratings_list:
        rating = json.loads(rating_str)
        if not isinstance(rating, dict):
            # Valid JSON that is not an object (a bare number, a list, ...)
            # has no category name to attribute a score to; skip it like any
            # other unusable value instead of failing on `.items()`.
            continue
        for key, value in rating.items():
            scores = categories.setdefault(key, [])
            # bool is a subclass of int, so JSON true/false would otherwise
            # be averaged in as 1/0.
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                scores.append(value)

    # Categories that never received a numeric score are dropped, which also
    # avoids dividing by zero.
    return {
        key: sum(scores) / len(scores)
        for key, scores in categories.items()
        if scores
    }

In [211]:
# Prompt template for the judge model. The {metric}, {rubrics} and
# {conversation} placeholders are filled in with str.format, once per
# conversation/metric pair.
scoring_prompt = """
Please read the provided transcript of the conversation between the mentor and the mentee. Based on the specified evaluation metric and rubrics, assign an integer score between 1 and 4 to the conversation. Then, return a JSON object with the metric as the key and the evaluation score as the value.

# Evaluation metric:
{metric}

# Evaluation rubrics:
{rubrics}

# Conversation Transcript
{conversation}
"""

# Score every conversation against every rubric with the judge model.
# Each raw reply is stored in `eval_list` (one entry per conversation/metric
# pair, in loop order) and echoed as soon as it arrives.
eval_list = []

for conversation in data_list:
    for rubric in eval_rubrics:
        prompt = scoring_prompt.format(
            metric=rubric["metric"],
            rubrics=rubric["rubrics"],
            conversation=conversation,
        )
        eval_output = run_mistral(prompt, model="mistral-large-latest", is_json=True)
        eval_list.append(eval_output)
        print(eval_output)

{"relevance": 4}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}
{"relevance": 4}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}
{"relevance": 4}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}
{"relevance": 3}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}
{"relevance": 4}
{"clarity": 3}
{"engagement": 4}
{"supportiveness": 4}
{"relevance": 4}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}
{"relevance": 4}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}
{"relevance": 4}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}
{"relevance": 3}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}
{"relevance": 3}
{"clarity": 3}
{"engagement": 3}
{"supportiveness": 4}


In [212]:
# Mean score per metric over every judge reply collected in eval_list.
average_ratings_from_strings(eval_list)

{'relevance': 3.7, 'clarity': 3.0, 'engagement': 3.1, 'supportiveness': 4.0}