## 1 Data preparation

#### 1.1 Load data

In [None]:
from pydantic.schema import datetime

from module_text_llm.text_evaluation.data_loader import load_data

# Load the raw evaluation data; 'data' is the argument handed to load_data
# (presumably a path relative to the notebook's working directory — confirm).
data = load_data('data')

In [None]:
from module_text_llm.text_evaluation.data_processor import process_data

# Turn the loaded data into `exercises`, the collection the later cells work on.
exercises = process_data(data)

#### 1.2 (Optional) Verify that the data has the expected format

In [None]:
from module_text_llm.text_evaluation.data_visualization import display_exercise_summaries

# Show a summary of the processed exercises; max_rows=5 is passed to cap the
# output (presumably the number of rows displayed — confirm).
display_exercise_summaries(exercises, max_rows=5)

In [None]:
from module_text_llm.text_evaluation.data_visualization import print_feedbacks

# Print the feedbacks for the exercise selected via exercise_id_to_find (544 here).
print_feedbacks(exercises, exercise_id_to_find=544)

## 2 Single LLM-as-a-Judge request

In [None]:
from module_text_llm.text_evaluation.prompts.llm_evaluation_prompt import get_formatted_prompt
from module_text_llm.text_evaluation.prompts.metrics import correctness, actionability, completeness, tone
from module_text_llm.text_evaluation.data_util import find_exercise_submission


# Select the exercise/submission pair to evaluate and the metrics to score it on.
exercise, submission = find_exercise_submission(exercises, exercise_id_to_find=4066)
metrics = [actionability, correctness, completeness, tone]

# Index the submission's assessments by id and pick the one whose id is "LLM".
assessments = {assessment.id: assessment for assessment in submission.assessments}
assessment = assessments.get("LLM")
if assessment is None:
    # Fail with an explicit message instead of an opaque AttributeError on
    # `None.feedbacks` in the next statement.
    raise ValueError(
        "No assessment with id 'LLM' found for the selected submission; "
        f"available assessment ids: {list(assessments)}"
    )
feedbacks = assessment.feedbacks


# Build the LLM-as-a-Judge prompt from the exercise, submission, feedbacks and metrics.
prompt = get_formatted_prompt(exercise, submission, feedbacks, metrics)

In [None]:
from module_text_llm.helpers.models import evaluation_model
from module_text_llm.text_evaluation.llm_util import get_logprobs_langchain


# TODO: Uncomment the following line to test the LLM model
# assessment.meta['evaluation'] = get_logprobs_langchain(prompt, evaluation_model)

### Plot logprobs for the evaluation

In [None]:
from module_text_llm.text_evaluation.data_visualization import plot_top_logprobs

result = assessment.meta.get('evaluation')

if result is not None:
    # `parsed_response` carries the per-metric evaluations (score and title).
    parsed_response = result.parsed_response

    # Per-token logprob entries taken from the response metadata.
    token_entries = result.response['response_metadata']['logprobs']['content']

    # Positions already matched to a score, so that equal scores are mapped to
    # successive tokens instead of the same one.
    claimed_positions = set()

    for evaluation in parsed_response.evaluations:
        score_str = str(evaluation.score)

        # First not-yet-claimed entry whose token text equals the score.
        position = next(
            (
                pos
                for pos, entry in enumerate(token_entries)
                if pos not in claimed_positions and entry['token'] == score_str
            ),
            None,
        )
        if position is None:
            # No matching token for this score: nothing to plot.
            continue

        claimed_positions.add(position)
        matched_entry = token_entries[position]
        top_logprobs = matched_entry['top_logprobs']
        chosen_token = matched_entry['token']

        print(f"Plotting logprobs for score '{score_str}' in metric '{evaluation.title}'")
        plot_top_logprobs(top_logprobs, chosen_token)
    

## 3 Experiment with single vs multiple Metrics

In [None]:
from module_text_llm.text_evaluation.prompts.llm_evaluation_prompt import get_formatted_prompt
from module_text_llm.text_evaluation.prompts.metrics import correctness, actionability, completeness, tone

# TODO: Uncomment the following lines to evaluate the submissions with multiple metrics
# metrics = [actionability, correctness, completeness, tone]
# print("# Prompt for all metrics:")
# for exercise in exercises:
#     for submission in exercise.submissions:
#         assessments = {assessment.id: assessment for assessment in submission.assessments}
#         for assessmentStr in ["Tutor", "LLM", "CoFee"]:
#             assessment = assessments.get(assessmentStr)
#             feedbacks = assessment.feedbacks
# 
#             print(f"Prompt for {assessmentStr}, Exercise {exercise.id}, Submission {submission.id} with all metrics")
#             prompt = get_formatted_prompt(exercise, submission, feedbacks, metrics)
#             result = get_logprobs_langchain(prompt, evaluation_model)
#             assessment.meta['evaluation_all_metrics'] = result

# TODO: Uncomment the following lines to evaluate the submissions with single metrics
# print("# Prompt for single metrics:")
# for exercise in exercises:
#     for submission in exercise.submissions:
#         assessments = {assessment.id: assessment for assessment in submission.assessments}
#         for assessmentStr in ["Tutor", "LLM", "CoFee"]:
#             assessment = assessments.get(assessmentStr)
#             feedbacks = assessment.feedbacks
# 
#             for metric in metrics:
#                 print(f"Prompt for {assessmentStr}, Exercise {exercise.id}, Submission {submission.id} with single metric {metric.title}")
#                 prompt = get_formatted_prompt(exercise, submission, feedbacks, [metric])
#                 result = get_logprobs_langchain(prompt, evaluation_model)
#                 assessment.meta['evaluation_single_' + metric.title] = result

## 3. Save the Exercises with Assessments and Evaluations to a JSON file

In [None]:
import os
import json
from pathlib import Path
from module_text_llm.text_evaluation.evaluation_schemas import Exercise


def save_to_json(data: Exercise, directory: str, filename: str) -> None:
    """Write ``data`` as indented JSON to ``filename`` inside ``directory``.

    The directory is created on demand, including missing parents. ``data`` is
    serialized through its own ``json()`` method and re-parsed, so the output
    can be pretty-printed with a four-space indent.
    """
    target_dir = Path(directory)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Round-trip through the object's JSON form to get plain, dumpable data.
    payload = json.loads(data.json())
    serialized = json.dumps(payload, indent=4)

    with open(os.path.join(directory, filename), 'w') as handle:
        handle.write(serialized)

# Take `datetime` from the standard library in this cell, so the timestamp below
# does not rely on a name that another package merely happens to re-export.
from datetime import datetime

directory = 'evaluations'
# The second-resolution timestamp gives each run its own file name.
filename = f'{exercise.id}_evaluation_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'

# Save the Exercise data to the specified directory with the generated filename
save_to_json(exercise, directory, filename)

### Load saved evaluations from JSON file

In [None]:
import os
import json
from module_text_llm.text_evaluation.evaluation_schemas import Exercise

# Restore previously saved results: point `filename` at the evaluation file to load.
directory = 'evaluations'
filename = '4066_evaluation_20240830_071358.json'
filepath = os.path.join(directory, filename)

# The file is parsed as a single Exercise; it is wrapped in a list so the
# following cells can iterate over `exercises`.
exercises = []
with open(filepath, 'r') as file:
    exercises.append(Exercise.parse_obj(json.load(file)))
exercise = exercises[-1]

print(f"Loaded {len(exercises)} exercises from file '{filename}'")
    

## 4. Plot logprobs for saved evaluations

In [None]:
from module_text_llm.text_evaluation.data_visualization import plot_top_logprobs

def plot_result(result):
    """Plot the top logprobs of each score token found in a stored evaluation.

    ``result`` is a dict read via its ``'parsed_response'`` and ``'response'``
    keys, or ``None``, in which case nothing is plotted.

    For every evaluation, the first still-unclaimed logprob entry whose token
    text equals the stringified score is plotted. Evaluations without a
    matching token are skipped silently.
    """
    if result is None:
        return

    parsed_response = result.get('parsed_response')
    token_entries = result.get('response')['response_metadata']['logprobs']['content']

    # Positions already matched to a score, so that equal scores are mapped to
    # successive tokens instead of the same one.
    claimed_positions = set()

    for evaluation in parsed_response.get('evaluations'):
        score_str = str(evaluation.get('score'))

        # First not-yet-claimed entry whose token text equals the score.
        position = next(
            (
                pos
                for pos, entry in enumerate(token_entries)
                if pos not in claimed_positions and entry['token'] == score_str
            ),
            None,
        )
        if position is None:
            continue

        claimed_positions.add(position)
        matched_entry = token_entries[position]
        top_logprobs = matched_entry['top_logprobs']
        chosen_token = matched_entry['token']

        print(f"Plotting logprobs for score '{score_str}' in metric '{evaluation.get('title')}'")
        plot_top_logprobs(top_logprobs, chosen_token)
                    
def _all_assessments(exercise_list):
    """Yield every assessment of every submission of every exercise, in order."""
    for current_exercise in exercise_list:
        for current_submission in current_exercise.submissions:
            yield from current_submission.assessments


# First pass: the evaluation stored under the combined-metrics key.
print("Plotting logprobs for all metrics")
for assessment in _all_assessments(exercises):
    plot_result(assessment.meta.get('evaluation_all_metrics'))

# Second pass: one stored evaluation per metric title.
print("Plotting logprobs for single metrics")
for assessment in _all_assessments(exercises):
    for metric in ("Actionability", "Correctness", "Completeness", "Tone"):
        plot_result(assessment.meta.get('evaluation_single_' + metric))
            