In [38]:
! pip install litellm
! pip install prometheus_eval
! pip install nest_asyncio

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [39]:
import os
from litellm import completion
from prometheus_eval import PrometheusEval
from prometheus_eval.litellm import LiteLLM
from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE
import nest_asyncio

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# SECURITY: the Groq API key must be supplied through the environment and
# never written into the notebook source. A key was previously committed in
# this cell; treat it as compromised and revoke/rotate it.
if not os.environ.get('GROQ_API_KEY'):
    raise EnvironmentError(
        "GROQ_API_KEY is not set. Export it before starting the kernel, "
        "e.g. `export GROQ_API_KEY=<your key>`."
    )

# Initialize the LiteLLM model with Groq
model = LiteLLM('groq/llama-3.1-70b-versatile')
# Judge used by the scoring helpers below for absolute (single-answer) grading
judge = PrometheusEval(model=model, absolute_grade_template=ABSOLUTE_PROMPT)

def get_prometheus_score(question, reference_answer, candidate_answer):
    """Grade one candidate answer with the module-level ``judge``.

    A fixed five-level rubric (score 1 through score 5) is rendered through
    ``SCORE_RUBRIC_TEMPLATE`` and handed to ``judge.single_absolute_grade``
    together with the question, the candidate answer and the reference answer.

    Args:
        question: Passed to the judge as the ``instruction``.
        reference_answer: Passed to the judge as the ``reference_answer``.
        candidate_answer: Passed to the judge as the ``response`` to grade.

    Returns:
        The second element of the pair returned by
        ``judge.single_absolute_grade``; the first element is discarded.
    """
    # Rubric wording is part of the prompt, so it is kept verbatim.
    rubric_fields = dict(
        criteria="Is the model proficient in generating a relevant, faithful, and complete answer to the question?",
        score1_description="The generated answer is not relevant to the user query and reference answer.",
        score2_description="The generated answer is similar to the reference answer but not relevant to the user query.",
        score3_description="The generated answer is relevant to the user query and reference answer but contains mistakes.",
        score4_description="The generated answer is relevant to the user query and is similar to the reference answer but is not as concise.",
        score5_description="The generated answer is relevant to the user query and fully correct according to the reference answer.",
    )
    rendered_rubric = SCORE_RUBRIC_TEMPLATE.format(**rubric_fields)

    # Only the score half of the judge's result is needed by callers.
    _ignored, grade = judge.single_absolute_grade(
        instruction=question,
        response=candidate_answer,
        rubric=rendered_rubric,
        reference_answer=reference_answer,
    )
    return grade

# Example usage
# One question, one reference answer, and three candidates to grade against it.
question = "What are the fox and the dog doing?"
reference_answer = "The quick brown fox jumps over the lazy black dog."

# Candidate 1 repeats the reference verbatim, candidate 2 rewords it,
# and candidate 3 talks about something else entirely.
candidate_answer_1 = "The quick brown fox jumps over the lazy black dog."
candidate_answer_2 = "The fast fox leaps over the idle dog."
candidate_answer_3 = "A spaceship landed on Mars and discovered alien life."

# Only the third candidate is graded here.
score = get_prometheus_score(
    question=question,
    reference_answer=reference_answer,
    candidate_answer=candidate_answer_3,
)
print("Score:", score)


100%|██████████| 1/1 [00:00<00:00,  1.89it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 16320.25it/s]

Score: 1





In [40]:
import pandas as pd

# Assuming the get_prometheus_score function and other necessary imports have already been done

def apply_prometheus_score(
    df,
    question_col='question',
    reference_col='reference_answer',
    candidate_col='correct_answer',
    score_col='prometheus_score',
    score_fn=None,
):
    """Score every row of ``df`` and store the results in ``score_col``.

    For each row, ``score_fn`` is called with the keyword arguments
    ``question``, ``reference_answer`` and ``candidate_answer`` taken from
    the corresponding columns. The scores are written into ``df`` in place
    and the same frame is returned.

    Args:
        df: DataFrame containing the three input columns.
        question_col: Column holding the question text.
        reference_col: Column holding the reference answer.
        candidate_col: Column holding the answer to be graded.
        score_col: Name of the column that receives the scores.
        score_fn: Callable used to grade one row. Defaults to
            ``get_prometheus_score`` (resolved at call time).

    Returns:
        The same ``df`` object, with ``score_col`` added or overwritten.

    Raises:
        KeyError: If any of the three input columns is missing.
    """
    if score_fn is None:
        score_fn = get_prometheus_score

    missing = [
        col for col in (question_col, reference_col, candidate_col)
        if col not in df.columns
    ]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    # Iterate the columns directly instead of `df.apply(axis=1)`: on an empty
    # frame `apply` hands back a DataFrame rather than a Series, which cannot
    # be assigned to a single column.
    df[score_col] = [
        score_fn(question=q, reference_answer=ref, candidate_answer=cand)
        for q, ref, cand in zip(df[question_col], df[reference_col], df[candidate_col])
    ]
    return df

# Example usage
# Two toy rows in which the answer being graded equals the reference answer.
data = dict(
    question=["What is the capital of France?", "What is 2 + 2?"],
    correct_answer=["Paris", "4"],
    reference_answer=["Paris", "4"],
)

df = pd.DataFrame(data)

# Score each row; the returned frame carries the new score column.
df_with_scores = apply_prometheus_score(df)

# Show the scored frame.
print(df_with_scores)

100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 16448.25it/s]
100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 15196.75it/s]

                         question correct_answer reference_answer  \
0  What is the capital of France?          Paris            Paris   
1                  What is 2 + 2?              4                4   

   prometheus_score  
0                 5  
1                 5  





In [41]:
import pandas as pd

def apply_prometheus_score(
    df,
    question_col='question',
    reference_col='correct_answer',
    candidate_col='gemma2_27B_answer',
    score_col='score_gemma2_27B_answer',
    score_fn=None,
):
    """Score every row of ``df`` with progress output and store the results.

    Rows are processed in order. Before each row a ``Processing row i/N``
    line is printed, and after it a ``Score for row i: <score>`` line.
    ``score_fn`` is called with the keyword arguments ``question``,
    ``reference_answer`` and ``candidate_answer`` taken from the
    corresponding columns. Scores are written into ``df`` in place and the
    same frame is returned.

    Args:
        df: DataFrame containing the three input columns.
        question_col: Column holding the question text.
        reference_col: Column holding the reference answer.
        candidate_col: Column holding the answer to be graded.
        score_col: Name of the column that receives the scores.
        score_fn: Callable used to grade one row. Defaults to
            ``get_prometheus_score`` (resolved at call time).

    Returns:
        The same ``df`` object, with ``score_col`` added or overwritten.

    Raises:
        KeyError: If any of the three input columns is missing.
    """
    if score_fn is None:
        score_fn = get_prometheus_score

    missing = [
        col for col in (question_col, reference_col, candidate_col)
        if col not in df.columns
    ]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    total_rows = len(df)
    scores = []

    # Explicit loop instead of `df.apply(axis=1)` with a `nonlocal` counter:
    # on an empty frame `apply` probes the callback with an empty row (which
    # would print a bogus "row 1/0") and returns a DataFrame that cannot be
    # assigned to a single column.
    rows = zip(df[question_col], df[reference_col], df[candidate_col])
    for counter, (question, reference, candidate) in enumerate(rows, start=1):
        print(f"Processing row {counter}/{total_rows}")
        score = score_fn(
            question=question,
            reference_answer=reference,
            candidate_answer=candidate,
        )
        print(f"Score for row {counter}: {score}")  # Debugging print
        scores.append(score)

    df[score_col] = scores
    return df

# Example usage
# Read the input CSV that holds the rows to be scored.
input_csv_path = '/home/ubuntu/iris_repos/llm_evaluation_thesis/analysis/Analysis 3/survey_prometheus6.csv'
df = pd.read_csv(input_csv_path)

# Score every row of the loaded frame.
df_prometheus = apply_prometheus_score(df)

# Write the scored frame to a new CSV, leaving the row index out of the file.
output_csv_path = '/home/ubuntu/iris_repos/llm_evaluation_thesis/analysis/Analysis 3/survey_prometheus7.csv'
df_prometheus.to_csv(output_csv_path, index=False)



Processing row 1/25


100%|██████████| 1/1 [00:10<00:00, 10.64s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 13751.82it/s]


Score for row 1: 3
Processing row 2/25


100%|██████████| 1/1 [00:01<00:00,  1.08s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 13888.42it/s]


Score for row 2: 1
Processing row 3/25


100%|██████████| 1/1 [00:02<00:00,  2.93s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 14217.98it/s]


Score for row 3: 4
Processing row 4/25


100%|██████████| 1/1 [00:01<00:00,  1.59s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 13842.59it/s]


Score for row 4: 4
Processing row 5/25


100%|██████████| 1/1 [00:01<00:00,  1.00s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 12483.05it/s]


Score for row 5: 4
Processing row 6/25


100%|██████████| 1/1 [00:00<00:00,  1.01it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 14614.30it/s]


Score for row 6: 1
Processing row 7/25


100%|██████████| 1/1 [00:02<00:00,  2.11s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 15252.01it/s]


Score for row 7: 4
Processing row 8/25


100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 12446.01it/s]


Score for row 8: 4
Processing row 9/25


100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 11983.73it/s]


Score for row 9: 1
Processing row 10/25


100%|██████████| 1/1 [00:00<00:00,  1.46it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 13797.05it/s]


Score for row 10: 1
Processing row 11/25


100%|██████████| 1/1 [00:04<00:00,  4.58s/it]


Retrying failed batches: Attempt 1/10


100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 11683.30it/s]


Score for row 11: 4
Processing row 12/25


100%|██████████| 1/1 [00:02<00:00,  2.56s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 12985.46it/s]


Score for row 12: 4
Processing row 13/25


100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 11983.73it/s]


Score for row 13: 4
Processing row 14/25


100%|██████████| 1/1 [00:00<00:00,  1.82it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 13148.29it/s]


Score for row 14: 3
Processing row 15/25


100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 11459.85it/s]


Score for row 15: 1
Processing row 16/25


100%|██████████| 1/1 [00:00<00:00,  1.06it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 11125.47it/s]


Score for row 16: 1
Processing row 17/25


100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 12372.58it/s]


Score for row 17: 4
Processing row 18/25


100%|██████████| 1/1 [00:00<00:00,  1.36it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 11155.06it/s]


Score for row 18: 1
Processing row 19/25


100%|██████████| 1/1 [00:00<00:00,  1.50it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 12372.58it/s]


Score for row 19: 1
Processing row 20/25


100%|██████████| 1/1 [00:00<00:00,  1.29it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 11428.62it/s]


Score for row 20: 4
Processing row 21/25


100%|██████████| 1/1 [00:00<00:00,  1.23it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 10699.76it/s]


Score for row 21: 4
Processing row 22/25


100%|██████████| 1/1 [00:00<00:00,  1.27it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 12787.51it/s]


Score for row 22: 4
Processing row 23/25


100%|██████████| 1/1 [00:00<00:00,  1.90it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 16320.25it/s]


Score for row 23: 3
Processing row 24/25


100%|██████████| 1/1 [00:00<00:00,  1.40it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 13148.29it/s]


Score for row 24: 4
Processing row 25/25


100%|██████████| 1/1 [00:00<00:00,  1.47it/s]


Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 13573.80it/s]

Score for row 25: 3



