# Response Analysis

In [1]:
import pandas as pd
import os
from pathlib import Path
from typing import Dict, List
import openai
import json
from tqdm.notebook import tqdm

In [None]:
from questions.apple import quiz_questions as apple_questions
from questions.cs2 import quiz_questions as cs2_questions

# Question text -> numeric question id for the Apple quiz.
apple_mapper = dict(
    (entry['question'], int(key)) for key, entry in apple_questions.items()
)

# CS2 ids are shifted past the Apple ids so both quizzes share one id space.
# NOTE(review): the shift is len(apple_mapper) + 1, which leaves a gap after the
# last Apple id if Apple keys run 1..N — confirm this is intended.
cs2_mapper = dict(
    (entry['question'], int(key) + len(apple_mapper) + 1)
    for key, entry in cs2_questions.items()
)

# Single lookup table covering both quizzes (CS2 wins on duplicate question text).
q_mapper = dict(apple_mapper)
q_mapper.update(cs2_mapper)
q_mapper

In [None]:
# Load the raw model responses and tag every row with its numeric question id;
# questions missing from q_mapper end up with a NaN qid.
df = pd.read_csv('./results/results_0902025.csv').assign(
    qid=lambda frame: frame['question'].map(q_mapper)
)
df.head(2)

In [4]:
# Distinct values of each grouping dimension. Every (question, access, user)
# combination is judged in ONE LLM call that sees the responses of all models
# for that combination, so the judge can score them relative to each other.
n_ids = df['qid'].unique()
n_models = df['model'].unique()  # not used by the loop: models are compared inside one prompt
n_access = df['access'].unique()
n_users = df['user'].unique()

all_dfs = []
for qid in n_ids:
    # Rows whose question text was not found in q_mapper share a NaN qid; judging
    # them together would mix different questions under one "correct answer".
    if pd.isna(qid):
        print("Skipping rows whose question has no qid mapping")
        continue
    for access in n_access:
        for user in n_users:
            try:
                filtered_df = df.loc[
                    (df['qid'].isin([qid])) &
                    (df['access'].isin([access])) &
                    (df['user'].isin([user]))
                ].copy()

                # Not every combination has to exist in the results file;
                # a missing one is not an error and needs no API call.
                if filtered_df.empty:
                    continue

                # Create Prompt for Evaluation
                prompt = f"""You are an expert evaluator of LLM responses. Please analyze and rank the following responses to the same question.
                Question: {filtered_df['question'].values[0]}
                Correct Answer: {filtered_df['answer'].values[0]}
                Reference for Correct Answer in Original Text: {filtered_df['reference'].values[0]}

                Responses to evaluate:
                {'-' * 50}
                """

                for _, row in filtered_df.iterrows():
                    prompt += f"\n{row['model']}:\n{row['response']}\n{'-' * 50}\n"

                prompt += """\nPlease evaluate each response based on:
                1. Accuracy and factual correctness
                2. Completeness of the answer
                3. Clarity and coherence
                4. Relevance to the question
                5. Proper use of available context

                Provide a scoring from best to worst, with scores (0-10) and detailed explanations.
                Format your response in a structured way that can be parsed into the following JSON schema:
                {
                    "evaluations": [
                        {
                            "model_name": "model name",
                            "score": score (0-10),
                            "reasoning": "detailed explanation"
                        }
                    ],
                    "meta_analysis": "overall analysis of patterns and differences between responses"
                }

                Check the response json format before returning it.
                """

                # Ask the judge model (gpt-4o-mini) for a JSON-formatted evaluation
                response = openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": "You are an expert evaluator of LLM responses."},
                        {"role": "user", "content": prompt}
                    ],
                    response_format={ "type": "json_object" }
                )

                # Parse response; only the JSON decoding gets its own handler so
                # that other problems are reported by the generic handler below.
                try:
                    eval_dict = json.loads(response.choices[0].message.content)
                except json.JSONDecodeError as e:
                    print(f"Error for {qid} {access} {user}")
                    print(f"Error parsing JSON: {e}")
                    continue

                # Build model-name -> value lookups. `.get` keeps a single
                # malformed entry from discarding the whole group.
                evaluations = eval_dict.get('evaluations') or []
                score_mapping = {item.get('model_name'): item.get('score') for item in evaluations}
                reasoning_mapping = {item.get('model_name'): item.get('reasoning') for item in evaluations}

                # Map values onto the rows; models the judge did not return
                # (or returned under a different name) get sentinel defaults.
                filtered_df['score'] = filtered_df['model'].map(score_mapping).fillna(-1)
                filtered_df['reasoning'] = filtered_df['model'].map(reasoning_mapping).fillna('Model evaluation failed')
                filtered_df['meta_analysis'] = eval_dict.get('meta_analysis', '')

                all_dfs.append(filtered_df)
            except Exception as e:
                # Best-effort batch run: report the failing combination and move on.
                print(f"Error for {qid} {access} {user}")
                print(f"Error: {e}")

In [None]:
# Combine all at once after the loop
# Stack the per-combination frames once, after the loop, and renumber the rows 0..n-1
final_df = pd.concat(all_dfs, axis=0).reset_index(drop=True)
# final_df.to_csv('./results/results_0902025_with_scores.csv', index = False)

# Eval with Opik

In [None]:
# Reload the judged results from disk (this replaces any in-memory final_df)
scored_results_path = Path('./results') / 'results_0902025_with_scores.csv'
final_df = pd.read_csv(scored_results_path)
final_df.head(2)

In [None]:
from opik.evaluation.metrics import ContextPrecision, ContextRecall, Usefulness, AnswerRelevance, Hallucination, LevenshteinRatio
import opik

# Configure the Opik client before any metric is instantiated.
# NOTE(review): use_local=False presumably targets the hosted Opik service rather
# than a local deployment, and may prompt for credentials — confirm against the Opik docs.
opik.configure(use_local=False)

def calculate_metrics(df):
    """
    Calculate Opik evaluation metrics for each row in the dataframe.

    For every row the following metrics are scored and written to a pair of
    columns ``<name>`` (numeric value) and ``<name>_reason`` (explanation):
    ``levenshtein_ratio`` (response vs. reference), ``hallucination_score``,
    ``answer_relevance``, ``usefulness``, ``context_precision`` and
    ``context_recall``. The LLM-judged metrics use ``gpt-4o-mini``.

    The computation is best-effort: a metric that fails for a row leaves that
    row's defaults (``-1.0`` / ``'N/A'``) in place. Failures are counted and
    summarised on stdout after the loop instead of being silently dropped.

    Args:
        df: Pandas DataFrame containing columns 'question', 'response', 'reference', 'answer'

    Returns:
        A copy of ``df`` with the additional metric columns; ``df`` itself is
        not modified.
    """
    # Create copy to avoid modifying original
    result_df = df.copy()

    # Initialise the metric columns with scalar defaults so that EVERY row gets
    # them (a length-1 Series would only fill the row whose index label is 0).
    score_columns = ['levenshtein_ratio', 'hallucination_score', 'answer_relevance',
                     'usefulness', 'context_precision', 'context_recall']
    for col in score_columns:
        result_df[col] = -1.0
        result_df[f'{col}_reason'] = 'N/A'

    judge_model = "gpt-4o-mini"

    # (target column, metric factory, row -> keyword arguments for metric.score)
    metric_specs = [
        ('levenshtein_ratio',
         lambda: LevenshteinRatio(),
         lambda row: {'output': row['response'], 'reference': row['reference']}),
        ('hallucination_score',
         lambda: Hallucination(model=judge_model),
         lambda row: {'input': row['question'], 'output': row['response'],
                      'context': [row['reference']]}),
        ('answer_relevance',
         lambda: AnswerRelevance(model=judge_model),
         lambda row: {'input': row['question'], 'output': row['response'],
                      'context': [row['reference']]}),
        ('usefulness',
         lambda: Usefulness(model=judge_model),
         lambda row: {'input': row['question'], 'output': row['response']}),
        ('context_precision',
         lambda: ContextPrecision(model=judge_model),
         lambda row: {'input': row['question'], 'output': row['response'],
                      'expected_output': row['answer'], 'context': [row['reference']]}),
        ('context_recall',
         lambda: ContextRecall(model=judge_model),
         lambda row: {'input': row['question'], 'output': row['response'],
                      'expected_output': row['answer'], 'context': [row['reference']]}),
    ]

    # Build each metric once instead of once per row. A metric that cannot be
    # constructed is skipped so the remaining metrics are still computed.
    metrics = []
    for col, make_metric, build_kwargs in metric_specs:
        try:
            metrics.append((col, make_metric(), build_kwargs))
        except Exception as exc:
            print(f"{col}: metric could not be initialised, column keeps its defaults: {exc!r}")

    # Calculate metrics for each row
    failures = {}
    for idx, row in tqdm(result_df.iterrows(), total=len(result_df)):
        for col, metric, build_kwargs in metrics:
            try:
                result = metric.score(**build_kwargs(row))
                # Store the numeric value, not the result object itself.
                result_df.at[idx, col] = result.value
                if result.reason is not None:
                    result_df.at[idx, f'{col}_reason'] = result.reason
            except Exception as exc:
                # Keep going (best-effort) but remember what went wrong.
                failures.setdefault(col, []).append((idx, exc))

    for col, errors in failures.items():
        first_idx, first_exc = errors[0]
        print(f"{col}: {len(errors)} row(s) failed; first failure at index {first_idx}: {first_exc!r}")

    return result_df

In [None]:
import warnings
# Silence FutureWarning for the rest of the session (the filter is global, not scoped to this cell)
warnings.filterwarnings('ignore', category=FutureWarning)

# Score every row of final_df with the Opik metrics; final_df itself is left
# untouched because calculate_metrics works on a copy.
final_df_opik = calculate_metrics(final_df)
# final_df_opik.to_csv('./results/results_0902025_with_scores_and_metrics.csv', index = False)
final_df_opik.head(2)

In [None]:
# Inspect the column names of the metrics dataframe
final_df_opik.columns

## Metrics

In [None]:
import warnings
# Silence FutureWarning for the rest of the session (the filter is global)
warnings.filterwarnings('ignore', category=FutureWarning)
import ollama
# Display the second element of the first item produced by iterating ollama.list().
# NOTE(review): presumably this is the list of locally available Ollama models
# (the value of the response's first field) — confirm against the installed ollama client version.
list(ollama.list())[0][1]

In [None]:
# results_old = pd.read_csv('./results/results_with_scores_and_metrics.csv')
# # results_old = results_old[results_old['access'] != 'DOCUMENT_LEVEL'].copy()
# results_new = pd.read_csv('./results/results_0902025_with_scores_and_metrics.csv')

# # First keep the filtered results_old
# results_old = results_old[results_old['access'] != 'DOCUMENT_LEVEL'].copy()

# # Filter results_new for DOCUMENT_LEVEL entries
# document_only_new = results_new[results_new['access'] == 'DOCUMENT_LEVEL'].copy()

# # Combine the dataframes
# combined_df = pd.concat([results_old, document_only_new], ignore_index=True)

# # Verify the combination
# print("Access types in combined dataset:", combined_df['access'].unique())
# print("\nNumber of entries in results_old:", len(results_old))
# print("Number of DOCUMENT_ONLY entries added:", len(document_only_new))
# print("Total entries in combined dataset:", len(combined_df))

In [None]:
# # Create complete model mapping dictionary
# model_mapping = {
#     # Direct matches (keep results_new version)
#     'deepseek-r1:1.5b': 'deepseek-r1:1.5b',
#     'mistral-small:24b': 'mistral-small:24b',
#     'llama3.2:1b': 'llama3.2:1b',
#     'qwen2.5:7b': 'qwen2.5:7b',
#     'qwen2.5:3b': 'qwen2.5:3b',
#     'qwen2.5:1.5b': 'qwen2.5:1.5b',
#     'qwen2.5:0.5b': 'qwen2.5:0.5b',
#     'qwen2.5:32b': 'qwen2.5:32b',
#     'qwen2.5:14b': 'qwen2.5:14b',
#     'phi3.5:3.8b': 'phi3.5:3.8b',
#     'phi4:14b': 'phi4:14b',
    
#     # Similar model mappings (old -> new)
#     'deepseek-r1:32b': 'deepseek-r1:8b',  # Map to closest available size
#     'llama3.2:latest': 'llama3.2:3b',     # Map latest to specific version
#     'phi3.5:latest': 'phi3.5:3.8b',       # Map latest to specific version
#     'phi4:latest': 'phi4:14b',            # Map latest to specific version
#     'smollm:135m': 'smollm2:135m',        # Map to v2 versions
#     'smollm:360m': 'smollm2:360m',
#     'smollm:1.7b': 'smollm2:1.7b'
# }

# # Print unique models before mapping
# print("Unique models before mapping:", combined_df['model'].unique())

# # Map models and handle unmapped cases
# def map_model(model):
#     if pd.isna(model):
#         return model
#     return model_mapping.get(model, model)  # Return original if no mapping exists

# combined_df['model'] = combined_df['model'].apply(map_model)

# # Save combined dataset
# combined_df.to_csv('./results/combined_results.csv', index=False)

In [None]:
# access_model_df = combined_df[combined_df['access'] == 'DOCUMENT_LEVEL'].groupby(['model', 'user'])[['response_time', 'score', 'hallucination_score', 'answer_relevance', 'usefulness', 'context_precision', 'context_recall']].mean().reset_index()
# access_model_df.sort_values(by='response_time', ascending=True)

In [None]:
# # Calculate means for each user and model combination
# user1_df = access_model_df[access_model_df['user'] == 'user1'].groupby('model')[['response_time', 'score', 'hallucination_score', 'answer_relevance', 'usefulness', 'context_precision', 'context_recall']].mean()
# admin_df = access_model_df[access_model_df['user'] == 'admin'].groupby('model')[['response_time', 'score', 'hallucination_score', 'answer_relevance', 'usefulness', 'context_precision', 'context_recall']].mean()

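# # Calculate percentage difference
# # Formula: ((user1 - admin) / admin) * 100
# # NOTE(review): the expression below actually computes ((admin - user1) / user1) * 100,
# # i.e. the opposite sign and base of the stated formula and of the printed legend — confirm which is intended.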
# percent_diff = ((admin_df - user1_df) / user1_df * 100).round(2)

# # Add a column for average difference across all metrics (excluding response_time)
# metric_cols = ['score', 'hallucination_score', 'answer_relevance', 'usefulness', 'context_precision', 'context_recall']
# percent_diff['avg_difference'] = percent_diff[metric_cols].mean(axis=1).round(2)

# # Sort by average difference
# percent_diff_sorted = percent_diff.sort_values('avg_difference', ascending=False)

# print("Percentage difference (user1 compared to admin):")
# print("\nPositive values mean user1's values are higher")
# print("Negative values mean admin's values are higher")
# # print("\n", percent_diff_sorted)

# # Optional: Save to CSV
# percent_diff_sorted

In [None]:
# metric_df = combined_df.groupby(['model'])[['response_time', 'score', 'hallucination_score', 'answer_relevance', 'usefulness', 'context_precision', 'context_recall']].mean().reset_index() #.to_csv('./results/model_metrics.csv', index = False)
# metric_df.sort_values(by='context_recall', ascending=False).head(2)

# Plotting

In [118]:
import seaborn as sns
import matplotlib.pyplot as plt

def config_seaborn():
    """Apply the notebook's seaborn theme and return the shared colour palette.

    Sets the 'whitegrid' style and the 'talk' context globally, then returns a
    seaborn palette built from the 17 hex colours below.
    """
    sns.set_style('whitegrid')
    sns.set_context('talk')

    hex_colors = [
        "#fa00e1", "#0019fa", "#00fa15", "#00faee", "#faaf00", "#9200fa",
        "#edfa00", "#c805e6", "#3214f5", "#0046cc", "#00cd43", "#49d7b8",
        "#03e5b3", "#32ebbe", "#c8be30", "#e58c32", "#a723c8",
    ]
    return sns.color_palette(hex_colors, len(hex_colors))

In [None]:
# Grouped bar chart of the mean judge score per model, one bar per user.
# NOTE(review): `access_model_df` is only assigned in a commented-out cell in the
# visible part of this notebook — that cell must be restored/run first, otherwise
# this cell raises NameError.
palette = config_seaborn()

# Create figure
plt.figure(figsize=(25, 10))

# Create grouped bar plot
ax = sns.barplot(
    data=access_model_df,
    x='model',
    y='score',
    hue='user',
    palette=palette
)

# Customize plot
# plt.title('Statistical Measures Across Embedding Models', pad=20, size=14)
plt.xlabel('LLM Model', size=25)
plt.ylabel('Score (0-10)', size=25)

# Rotate x-axis labels
plt.xticks(rotation=15, fontsize=22)
plt.yticks(fontsize=22)

# Adjust legend: centred just above the axes, entries laid out in up to 4 columns
# NOTE(review): the legend is titled 'Input Type' although the hue variable is 'user' — confirm the title.
plt.legend(
    title='Input Type',
    bbox_to_anchor=(0.5, 1.05),
    loc='center',
    ncol=4,
    frameon=True,
    fontsize=24,
    title_fontsize=28
)

# Add grid for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

# Adjust layout
plt.tight_layout()

# plt.savefig('./plots/model_performance.png', dpi=300)
# Show plot
plt.show()

In [None]:
# Grouped bar chart of the judge score per model, one bar per access type,
# with DOCUMENT_LEVEL split by user. Saved to ./plots/model_performance.png.
palette = config_seaborn()

# Leave the two largest Qwen variants out of this chart
models_to_drop = ['qwen2.5:32b', 'qwen2.5:14b']
keep_rows = ~combined_df['model'].isin(models_to_drop)
_df = combined_df[keep_rows].copy()


def modify_access(row):
    """Return the row's access label, suffixing DOCUMENT_LEVEL with the upper-cased user."""
    return (
        f'DOCUMENT_LEVEL_{row["user"].upper()}'
        if row['access'] == 'DOCUMENT_LEVEL'
        else row['access']
    )


# Overwrite the access column with the user-aware labels
_df['access'] = _df.apply(modify_access, axis=1)

# Figure and bars
plt.figure(figsize=(25, 13))
ax = sns.barplot(data=_df, x='model', y='score', hue='access', palette=palette)

# Axis titles
# plt.title('Statistical Measures Across Embedding Models', pad=20, size=14)
plt.xlabel('LLM Model', size=25)
plt.ylabel('Score (0-10)', size=25)

# Tick labels: tilt the model names so they do not overlap
plt.xticks(rotation=45, fontsize=28)
plt.yticks(fontsize=28)

# Legend centred just above the axes
legend_options = dict(
    title='Input Type',
    bbox_to_anchor=(0.5, 1.05),
    loc='center',
    ncol=5,
    frameon=True,
    fontsize=26,
    title_fontsize=33,
)
plt.legend(**legend_options)

# Dashed horizontal grid lines for readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

# Fit everything into the canvas, write the PNG, then display
plt.tight_layout()
plt.savefig('./plots/model_performance.png', dpi=300)
plt.show()

In [None]:
# Mean of every metric per access type, best judge score first
access_metric_columns = [
    'response_time', 'score', 'hallucination_score', 'answer_relevance',
    'usefulness', 'context_precision', 'context_recall',
]
access_means = _df.groupby('access')[access_metric_columns].mean().reset_index()
access_means.sort_values(by='score', ascending=False)

In [None]:
# Same per-access means as above, restricted to the rows of user 'user1'
user1_rows = _df[_df['user'] == 'user1']
user1_metric_columns = [
    'response_time', 'score', 'hallucination_score', 'answer_relevance',
    'usefulness', 'context_precision', 'context_recall',
]
user1_access_means = user1_rows.groupby('access')[user1_metric_columns].mean().reset_index()
user1_access_means.sort_values(by='score', ascending=False)

In [None]:
# Scatter plot of mean answer relevance vs. mean usefulness, one point per
# (model, access, user) combination; colour encodes the model and marker size
# the mean response time. Saved to ./plots/model_relevance_usefulness_performance.png.
palette = config_seaborn()

scatter_df = final_df_opik.groupby(['model', 'access', 'user'])[['response_time', 'score', 'hallucination_score', 'answer_relevance', 'usefulness', 'context_precision', 'context_recall']].mean().reset_index()

def min_max_scaling(column):
    """Linearly rescale a numeric Series so its minimum maps to 0 and its maximum to 1.

    A constant column has no range to scale; it is returned as all zeros
    instead of dividing by zero.
    """
    min_val = column.min()
    max_val = column.max()
    span = max_val - min_val
    if span == 0:
        return column - min_val
    return (column - min_val) / span

# Apply to a single column (kept for inspection; the plot sizes by raw response_time)
scatter_df['normalized_response_time'] = min_max_scaling(scatter_df['response_time'])
# Create figure
plt.figure(figsize=(33, 12))

# Create scatter plot
ax = sns.scatterplot(
    x="answer_relevance",
    y="usefulness",
    hue="model",
    size="response_time",
    palette=palette,
    sizes=(100, 600),
    linewidth=0,
    data=scatter_df,
)

# Customize plot
# plt.title('Statistical Measures Across Embedding Models', pad=20, size=14)
plt.xlabel('Answer Relevance', size=38)
plt.ylabel('Usefulness', size=38)

# Add grid for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

legend = plt.legend(
    ncol=5,
    fontsize=32,
)
# seaborn labels the hue/size sections of the legend with the raw column names.
# Rename them by matching that text rather than by fixed position, so the
# headings stay correct whatever the number of models in the data.
legend_headings = {'model': 'MODEL', 'response_time': 'RESPONSE TIME (s)'}
for legend_text in legend.get_texts():
    heading = legend_headings.get(legend_text.get_text())
    if heading is not None:
        legend_text.set_text(heading)

plt.tick_params(axis='both', labelsize=32)

# Adjust layout
plt.tight_layout()

plt.savefig('./plots/model_relevance_usefulness_performance.png', dpi=300)
# Show plot
plt.show()

In [None]:
# Scatter plot of mean context precision vs. mean context recall, one point per
# (model, access, user) combination; colour encodes the model and marker size
# the mean response time. Saved to ./plots/model_precision_recall_performance.png.
from matplotlib.patches import Rectangle  # only referenced by the commented-out annotation below

palette = config_seaborn()

# Average every metric per (model, access, user) combination
scatter_df = final_df_opik.groupby(['model', 'access', 'user'])[['response_time', 'score', 'hallucination_score', 'answer_relevance', 'usefulness', 'context_precision', 'context_recall']].mean().reset_index()
# def min_max_scaling(column):
#     min_val = column.min()
#     max_val = column.max()
#     return (column - min_val) / (max_val - min_val)

# Apply to a single column
# scatter_df['normalized_response_time'] = min_max_scaling(scatter_df['response_time'])
# Create figure
plt.figure(figsize=(30,6))

# Create scatter plot
ax = sns.scatterplot(
    x="context_precision", 
    y="context_recall",
    hue="model", 
    size="response_time",
    palette=palette,
    sizes=(100, 600), 
    linewidth=0,
    data=scatter_df, 
)
# rect = Rectangle((7, 0.5), 3, 0.5, color='gray', alpha=0.5, transform=ax.transData)
# ax.add_patch(rect)
# ax.text(8.5, 0.75, 'better performance', color='black', fontsize=10, ha='center', va='center')

# Customize plot
# plt.title('Statistical Measures Across Embedding Models', pad=20, size=14)
plt.xlabel('Precision', size=38)
plt.ylabel('Recall', size=38)
# Replace seaborn's automatic legend with an empty, frameless one so that no legend is drawn
plt.legend('', frameon=False)
plt.tick_params(axis='both', labelsize=32)

# Add grid for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

# Adjust layout
plt.tight_layout()

plt.savefig('./plots/model_precision_recall_performance.png', dpi=300)
# Show plot
plt.show()