In [None]:
import os

import pandas as pd
from uptrain import EvalLLM, Evals, ResponseMatching

In [2]:
# Read the CSV holding the query, response, context, and ground truth columns
data = pd.read_csv('llm_responses.csv')

# Bind each column to its own name in a single unpacking step
queries, responses, contexts, ground_truths = (
    data[column] for column in ('query', 'response', 'context', 'ground truth')
)


In [3]:
# Initialize UpTrain EvalLLM instance.
# The OpenAI key is taken from the OPENAI_API_KEY environment variable so that no
# credential has to be typed into (and saved with) the notebook. If the variable
# is unset, the empty string is passed.
eval_llm = EvalLLM(openai_api_key=os.environ.get("OPENAI_API_KEY", ""))


In [None]:
# Score every (query, response) pair with the response relevance check and persist the scores
metric_name = 'relevance'
metric = Evals.RESPONSE_RELEVANCE

results = []
for query, response in zip(queries, responses):
    # Each pair is submitted as its own single-row dataset; the first
    # entry of the returned list is read back
    evaluation = eval_llm.evaluate(
        data=[{"question": query, "response": response}],
        checks=[metric],
    )[0]

    # Placeholder text is stored when the score or explanation field is absent
    results.append({
        'query': query,
        'response': response,
        metric_name: evaluation.get('score_response_relevance', 'N/A'),
        f'{metric_name}_explanation': evaluation.get('explanation_response_relevance', 'No explanation provided'),
    })

# Collect all rows into a DataFrame and write it to a CSV named after the metric
results_df = pd.DataFrame(results)
results_df.to_csv(f'evaluation_results_{metric_name}.csv', index=False)

print(f"Evaluation for {metric_name} complete. Results saved to 'evaluation_results_{metric_name}.csv'")

In [None]:
# Score every (query, context, response) triple with the response consistency check and persist the scores
metric_name = 'consistency'
metric = Evals.RESPONSE_CONSISTENCY

results = []
for query, response, context in zip(queries, responses, contexts):
    # Each triple is submitted as its own single-row dataset; the first
    # entry of the returned list is read back
    evaluation = eval_llm.evaluate(
        data=[{"question": query, "context": context, "response": response}],
        checks=[metric],
    )[0]

    # Placeholder text is stored when the score or explanation field is absent
    results.append({
        'query': query,
        'context': context,
        'response': response,
        metric_name: evaluation.get('score_response_consistency', 'N/A'),
        f'{metric_name}_explanation': evaluation.get('explanation_response_consistency', 'No explanation provided'),
    })

# Collect all rows into a DataFrame and write it to a CSV named after the metric
results_df = pd.DataFrame(results)
results_df.to_csv(f'evaluation_results_{metric_name}.csv', index=False)

print(f"Evaluation for {metric_name} complete. Results saved to 'evaluation_results_{metric_name}.csv'")

In [None]:
# Score every (query, context, response) triple with the factual accuracy check and persist the scores
metric_name = 'factual accuracy'
metric = Evals.FACTUAL_ACCURACY

results = []
for query, response, context in zip(queries, responses, contexts):
    # Each triple is submitted as its own single-row dataset; the first
    # entry of the returned list is read back
    evaluation = eval_llm.evaluate(
        data=[{"question": query, "context": context, "response": response}],
        checks=[metric],
    )[0]

    # Placeholder text is stored when the score or explanation field is absent
    results.append({
        'query': query,
        'context': context,
        'response': response,
        metric_name: evaluation.get('score_factual_accuracy', 'N/A'),
        f'{metric_name}_explanation': evaluation.get('explanation_factual_accuracy', 'No explanation provided'),
    })

# Collect all rows into a DataFrame and write it to a CSV named after the metric
# (the metric name contains a space, so the file name and column names do too)
results_df = pd.DataFrame(results)
results_df.to_csv(f'evaluation_results_{metric_name}.csv', index=False)

print(f"Evaluation for {metric_name} complete. Results saved to 'evaluation_results_{metric_name}.csv'")

In [None]:
# Score every (query, response) pair with the response conciseness check and persist the scores
metric_name = 'response conciseness'
metric = Evals.RESPONSE_CONCISENESS

results = []
for query, response in zip(queries, responses):
    # Each pair is submitted as its own single-row dataset; the first
    # entry of the returned list is read back
    evaluation = eval_llm.evaluate(
        data=[{"question": query, "response": response}],
        checks=[metric],
    )[0]

    # Placeholder text is stored when the score or explanation field is absent
    results.append({
        'query': query,
        'response': response,
        metric_name: evaluation.get('score_response_conciseness', 'N/A'),
        f'{metric_name}_explanation': evaluation.get('explanation_response_conciseness', 'No explanation provided'),
    })

# Collect all rows into a DataFrame and write it to a CSV named after the metric
# (the metric name contains a space, so the file name and column names do too)
results_df = pd.DataFrame(results)
results_df.to_csv(f'evaluation_results_{metric_name}.csv', index=False)

print(f"Evaluation for {metric_name} complete. Results saved to 'evaluation_results_{metric_name}.csv'")

In [None]:
# Evaluate response matching with the 'llm' method and save the findings.
# ResponseMatching is imported in the notebook's top import cell.
metric_name = 'response matching llm'
# The check is identical for every row, so it is built once instead of per iteration
metric = ResponseMatching(method='llm')

results = []
for query, response, ground_truth, context in zip(queries, responses, ground_truths, contexts):
    metrics_result = {
        'query': query,
        'ground_truth': ground_truth,
        'response': response,
        'context': context
    }

    # Prepare data for evaluation (a single-row dataset for this sample)
    data_for_eval = [{
        "question": query,
        "ground_truth": ground_truth,
        "response": response,
        "context": context
    }]

    # Evaluate the metric; the raw result is no longer printed per row
    res = eval_llm.evaluate(data=data_for_eval, checks=[metric])
    # NOTE(review): confirm 'score_response_match_llm' is the key emitted by the
    # installed UpTrain version; if the key is absent, 'N/A' is stored instead.
    metrics_result[metric_name] = res[0].get('score_response_match_llm', 'N/A')

    results.append(metrics_result)

# Convert results to DataFrame and save to CSV
# (the metric name contains spaces, so the file name and score column do too)
results_df = pd.DataFrame(results)
results_df.to_csv(f'evaluation_results_{metric_name}.csv', index=False)

print(f"Evaluation for {metric_name} complete. Results saved to 'evaluation_results_{metric_name}.csv'")

In [None]:
# Evaluate response matching with the 'rouge' method and save the findings.
# ResponseMatching is imported in the notebook's top import cell.
metric_name = 'response matching rouge'
# The check is identical for every row, so it is built once instead of per iteration
metric = ResponseMatching(method='rouge')

results = []
for query, response, ground_truth, context in zip(queries, responses, ground_truths, contexts):
    metrics_result = {
        'query': query,
        'ground_truth': ground_truth,
        'response': response,
        'context': context
    }

    # Prepare data for evaluation (a single-row dataset for this sample)
    data_for_eval = [{
        "question": query,
        "ground_truth": ground_truth,
        "response": response,
        "context": context
    }]

    # Evaluate the metric; the raw result is no longer printed per row
    res = eval_llm.evaluate(data=data_for_eval, checks=[metric])
    # NOTE(review): confirm 'score_response_match_rouge' is the key emitted by the
    # installed UpTrain version; if the key is absent, 'N/A' is stored instead.
    metrics_result[metric_name] = res[0].get('score_response_match_rouge', 'N/A')

    results.append(metrics_result)

# Convert results to DataFrame and save to CSV
# (the metric name contains spaces, so the file name and score column do too)
results_df = pd.DataFrame(results)
results_df.to_csv(f'evaluation_results_{metric_name}.csv', index=False)

print(f"Evaluation for {metric_name} complete. Results saved to 'evaluation_results_{metric_name}.csv'")