## 1 Data preparation

#### 1.1 Load data

In [None]:
from module_text_llm.text_evaluation.data_processor import remove_feedback_titles

# Run the feedback-title removal step on 'data' before the data is loaded.
# NOTE(review): presumably 'data' is a directory path whose files are
# rewritten in place - confirm against data_processor.
remove_feedback_titles('data')

In [None]:
from module_text_llm.text_evaluation.data_loader import load_data

# Load the raw evaluation data from 'data'; the result is handed to
# process_data in the following cell.
data = load_data('data')

In [None]:
from module_text_llm.text_evaluation.data_processor import process_data

# Turn the loaded data into `exercises`, the object that the visualization
# and lookup helpers in the later cells operate on.
exercises = process_data(data)

#### 1.2 (Optional) Verify that the data has the expected format

In [None]:
from module_text_llm.text_evaluation.data_visualization import display_exercise_summaries

# Display a summary of the processed exercises.
# NOTE(review): max_rows=5 presumably caps the number of rows shown - confirm.
display_exercise_summaries(exercises, max_rows=5)

In [None]:
from module_text_llm.text_evaluation.data_visualization import print_feedbacks

# Spot check: print the feedbacks of the exercise with id 544.
print_feedbacks(exercises, exercise_id_to_find=544)

## 2 Prepare LLM-as-a-Judge

#### 2.1 Get model and adapt

In [None]:
from module_text_llm.text_evaluation.evaluation_schemas import Feedback, Submission
from module_text_llm.prompts.llm_evaluation_prompt import system_message, human_message
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# Wrap the imported system/human prompt strings as message templates.
system_message_prompt, human_message_prompt = (
    SystemMessagePromptTemplate.from_template(system_message),
    HumanMessagePromptTemplate.from_template(human_message),
)

# Combine both into one chat prompt: system message first, human message second.
chat_prompt_template = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

def feedback_to_dict(feedback: Feedback, submission: Submission):
    """Build the dict form of one feedback, as embedded in the judge prompt.

    The feedback's index range is converted into a line range by passing it,
    together with the submission text, to ``get_line_range_from_index_range``.

    Args:
        feedback: Feedback providing ``description``, ``index_start`` and
            ``index_end``.
        submission: Submission providing the ``text`` handed to the helper.

    Returns:
        A dict with the keys ``description``, ``line_start`` and ``line_end``.
    """
    line_bounds = get_line_range_from_index_range(
        feedback.index_start,
        feedback.index_end,
        submission.text,
    )
    first_line, last_line = line_bounds
    return {
        "description": feedback.description,
        "line_start": first_line,
        "line_end": last_line,
    }

In [None]:
from module_text_llm.prompts.metrics import correctness, actionability, completeness, tone

# Metric definitions for the judge; each one is serialized into the prompt
# via metric.dict() when the prompt input is assembled.
metrics = [actionability, correctness, completeness, tone]

In [None]:
from module_text_llm.prompts.metrics import MetricEvaluations
import json
from langchain.output_parsers import PydanticOutputParser

from module_text_llm.helpers.utils import add_sentence_numbers, get_line_range_from_index_range, format_grading_instructions
from module_text_llm.text_evaluation.data_util import find_exercise_submission

# Parser for the judge's structured answer; it also supplies the format
# instructions that are injected into the prompt below.
output_parser = PydanticOutputParser(pydantic_object=MetricEvaluations)

# Select the exercise/submission pair to evaluate and the feedback set to judge.
exercise, submission = find_exercise_submission(exercises, exercise_id_to_find=4066)
feedbacks = submission.feedbacks["CoFee"]

# Values for the placeholders of the chat prompt template.
prompt_input = {
    # Fall back to a fixed text when the exercise has no problem statement.
    "problem_statement": exercise.problem_statement or "No problem statement.",
    "example_solution": exercise.example_solution,
    "grading_instructions": format_grading_instructions(
        exercise.grading_instructions,
        exercise.grading_criteria,
    ),
    # Metrics and feedbacks are embedded as JSON strings.
    "metrics": json.dumps([m.dict() for m in metrics]),
    "format_instructions": output_parser.get_format_instructions(),
    "submission": add_sentence_numbers(submission.text),
    "feedbacks": json.dumps([feedback_to_dict(fb, submission) for fb in feedbacks]),
}

In [None]:
# Preview the rendered prompt (as one string) before it is sent to the model.
print(chat_prompt_template.format(**prompt_input))

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.callbacks import get_openai_callback
from langchain.chat_models import AzureChatOpenAI

from module_text_llm.helpers.models import evaluation_model

def get_logprobs_langchain(prompt_template: ChatPromptTemplate, prompt_input: dict, model: AzureChatOpenAI):
    """Invoke ``model`` on the rendered chat prompt, requesting token logprobs.

    The call is made with ``max_tokens=100``, ``logprobs=True``,
    ``top_logprobs=5`` and ``temperature=0``. The total cost reported by the
    OpenAI callback is printed.

    Args:
        prompt_template: Chat prompt template to render.
        prompt_input: Values for the template's placeholders.
        model: Chat model to invoke.

    Returns:
        The response object returned by ``model.invoke``.
    """
    # Render to a list of chat messages. ``format`` would flatten the template
    # into one string, which the chat model receives as a single human
    # message, so the system prompt would lose its system role.
    messages = prompt_template.format_messages(**prompt_input)

    # NOTE(review): max_tokens=100 may truncate a long structured answer;
    # confirm it is large enough for the expected output.
    with get_openai_callback() as cb:
        response = model.invoke(messages, max_tokens=100, logprobs=True, top_logprobs=5, temperature=0)
        print(f"Total Cost (USD): ${cb.total_cost:.6f}")
        return response

# Query the evaluation model once; `result` is parsed and plotted in the
# following cell.
result = get_logprobs_langchain(chat_prompt_template, prompt_input, evaluation_model)

In [None]:
from module_text_llm.text_evaluation.data_visualization import plot_top_logprobs

# Parse the raw model output into the structured evaluations object
parsed_response = output_parser.parse(result.content)

# Per-token logprob entries from the response metadata; each entry is read
# below through its 'token' and 'top_logprobs' keys
logprobs_content = result.response_metadata['logprobs']['content']

# Positions in logprobs_content that have already been matched to a score
used_indices = set()

# Index at which the next search starts. Assuming the parser keeps the
# evaluations in output order, a score's token can only appear after the
# token matched for the previous score, so resuming there (instead of at 0)
# avoids binding a score to an earlier, unrelated token with the same text.
search_start = 0

for evaluation in parsed_response.evaluations:
    score_str = str(evaluation.score)

    # Find the first not-yet-consumed token that equals the score text
    for j, logprobs_data in enumerate(logprobs_content):
        if j < search_start or logprobs_data['token'] != score_str:
            continue

        used_indices.add(j)  # Mark this index as used
        search_start = j + 1
        logprobs = logprobs_data['top_logprobs']
        selected_token = logprobs_data['token']

        # Plot the logprobs for the selected score
        print(f"Plotting logprobs for score '{score_str}' in metric '{evaluation.title}'")
        plot_top_logprobs(logprobs, selected_token)
        break
    else:
        # No token matched: report it instead of silently skipping the metric
        print(f"No logprobs token found for score '{score_str}' in metric '{evaluation.title}'")