In [1]:
import pandas as pd
from uptrain import EvalLLM, Evals



In [2]:
# Load CSV with query, response, context, and ground truth
data = pd.read_csv('llm_responses.csv')

# Extract the necessary fields from the CSV
queries = data['query']
responses = data['response']
contexts = data['context']

In [3]:
# Initialize UpTrain EvalLLM instance
eval_llm = EvalLLM(openai_api_key="")

In [5]:
# Evaluate response relevance metric and save the findings
metric_name = 'relevance'
metric = Evals.RESPONSE_RELEVANCE

results = []
for query, response in zip(queries, responses):
    metrics_result = {
        'query': query,
        'response': response
    }

    # Prepare data for evaluation
    data_for_eval = [{
        "question": query,
        "response": response
    }]

    # Evaluate the metric
    res = eval_llm.evaluate(data=data_for_eval, checks=[metric])
    metrics_result[metric_name] = res[0].get('score_response_relevance', 'N/A')
    metrics_result[f'{metric_name}_explanation'] = res[0].get('explanation_response_relevance', 'No explanation provided')

    results.append(metrics_result)

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv(f'evaluation_results_{metric_name}.csv', index=False)

print(f"Evaluation for {metric_name} complete. Results saved to 'evaluation_results_{metric_name}.csv'")

100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
  with ThreadPoolExecutor(max_workers=1) as executor:
100%|██████████| 1/1 [00:01<00:00,  1.90s/it]
[32m2024-10-15 17:46:22.101[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36m376[0m - [1mLocal server not running, start the server to log data and visualize in the dashboard![0m
100%|██████████| 1/1 [00:01<00:00,  1.96s/it]
100%|██████████| 1/1 [00:01<00:00,  1.79s/it]
[32m2024-10-15 17:46:36.635[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36m376[0m - [1mLocal server not running, start the server to log data and visualize in the dashboard![0m
100%|██████████| 1/1 [00:01<00:00,  1.82s/it]
100%|██████████| 1/1 [00:01<00:00,  2.00s/it]
[32m2024-10-15 17:46:53.442[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36m376[0m - [1mLocal server not running, start the server to log data and visualize in the dashboard![0m
100%|██████████| 1/

Evaluation for relevance complete. Results saved to 'evaluation_results_relevance.csv'


In [7]:
# Evaluate response consistency metric and save the findings
metric_name = 'consistency'
metric = Evals.RESPONSE_CONSISTENCY

results = []
for query, response, context in zip(queries, responses, contexts):
    metrics_result = {
        'query': query,
        'contexts': context,
        'response': response
    }

    # Prepare data for evaluation
    data_for_eval = [{
        "question": query,
        "context": context,
        "response": response
    }]

    # Evaluate the metric
    res = eval_llm.evaluate(data=data_for_eval, checks=[metric])
    metrics_result[metric_name] = res[0].get('score_response_consistency', 'N/A')
    metrics_result[f'{metric_name}_explanation'] = res[0].get('explanation_response_consistency', 'No explanation provided')

    results.append(metrics_result)

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv(f'evaluation_results_{metric_name}.csv', index=False)

print(f"Evaluation for {metric_name} complete. Results saved to 'evaluation_results_{metric_name}.csv'")

100%|██████████| 1/1 [00:04<00:00,  4.42s/it]
  with ThreadPoolExecutor(max_workers=1) as executor:
[32m2024-10-15 18:04:12.502[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36m376[0m - [1mLocal server not running, start the server to log data and visualize in the dashboard![0m
100%|██████████| 1/1 [00:04<00:00,  4.22s/it]
[32m2024-10-15 18:04:23.810[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36m376[0m - [1mLocal server not running, start the server to log data and visualize in the dashboard![0m
100%|██████████| 1/1 [00:04<00:00,  4.82s/it]
[32m2024-10-15 18:04:35.701[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36m376[0m - [1mLocal server not running, start the server to log data and visualize in the dashboard![0m
100%|██████████| 1/1 [00:03<00:00,  3.47s/it]
[32m2024-10-15 18:04:46.273[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36

Evaluation for consistency complete. Results saved to 'evaluation_results_consistency.csv'
