In [1]:
import os
from dotenv import load_dotenv

# Load variables from the local "api.env" file, then look up the Indox API key.
# INDOX_API_KEY is None when the variable is not defined.
load_dotenv("api.env")
INDOX_API_KEY = os.environ.get("INDOX_API_KEY")

In [2]:
from indox.IndoxEval.llms import IndoxApi
# Instantiate IndoxApi with the key read from the environment; this object is
# later handed to Evaluator as its `model`.
llm = IndoxApi(api_key=INDOX_API_KEY)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Sample payload: two query/response pairs wrapped under a "messages" key.
# NOTE(review): no later cell visible in this notebook references `test_case`.
test_case = {
    "messages": [
        {
            "query": "What are the best books for learning Python?",
            "llm_response": "Automate the Boring Stuff with Python, Python Crash Course, etc.",
        },
        {
            "query": "What is the capital of France?",
            "llm_response": "The capital of France is Paris.",
        },
    ]
}

In [11]:
# Three-turn conversation about France; each turn pairs a user query with the
# model's reply. Used by the message-based metrics (KnowledgeRetention, Toxicity).
test_messages = [
    dict(
        query="What is the capital of France?",
        llm_response="The capital of France is Paris.",
    ),
    dict(
        query="Can you tell me about the Eiffel Tower?",
        llm_response="The Eiffel Tower, located in Paris, is one of the most iconic landmarks in the world.",
    ),
    dict(
        query="What are some famous foods in France?",
        llm_response="France is known for its cuisine, including dishes like croissants, baguettes, and cheese.",
    ),
]

In [12]:
# Single-turn example shared by the query/response/context based metrics below.
query = "What is the capital of Japan?"
llm_response = "The capital of Japan is Tokyo."

# Retrieved passages supplied as context for the example above (one string each).
retrieval_context = [
    "Tokyo is the most populous city in Japan and serves as the country's political and economic center.",
    "Tokyo has been the capital since 1868, after the Meiji Restoration moved the capital from Kyoto.",
    "The city is known for its mix of modern architecture and traditional temples, as well as its bustling districts like Shibuya and Shinjuku.",
]

In [13]:
from indox.IndoxEval import Faithfulness
# Configure the Faithfulness metric with the model response and the retrieved passages.
faithfulness_evaluator = Faithfulness(
    llm_response=llm_response,
    retrieval_context=retrieval_context,
)

In [14]:
from indox.IndoxEval import AnswerRelevancy
# Configure the AnswerRelevancy metric with the question and the model response.
answer_relevancy_metric = AnswerRelevancy(
    query=query,
    llm_response=llm_response,
)

In [15]:
from indox.IndoxEval import Bias
# Configure the Bias metric; it receives only the model response.
bias_metric = Bias(llm_response=llm_response)

In [16]:
from indox.IndoxEval import ContextualRelevancy
# Configure the ContextualRelevancy metric with the question and the retrieved
# passages (the model response is not passed to this metric).
contextual = ContextualRelevancy(
    query=query,
    retrieval_context=retrieval_context,
)

In [17]:
from indox.IndoxEval import GEval
# Configure the GEval metric for the Japan example: the question, the model
# response, the retrieved passages, a reference answer and two free-text labels
# (`parameters`, `context`).
geval = GEval(
    parameters="Rag application",
    query=query,
    llm_response=llm_response,
    ground_truth="Tokyo",
    context="geographic knowledge",
    retrieval_context=retrieval_context,
)

In [18]:
from indox.IndoxEval import KnowledgeRetention
# Configure the KnowledgeRetention metric. It is given the multi-turn France
# conversation (test_messages), not the single Japan query/response pair.
knowledge_retention = KnowledgeRetention(messages=test_messages)

In [19]:
from indox.IndoxEval import Hallucination
# Configure the Hallucination metric with the model response and the retrieved passages.
hallucination = Hallucination(
    llm_response=llm_response,
    retrieval_context=retrieval_context,
)

In [20]:
from indox.IndoxEval import Toxicity
# Configure the Toxicity metric. Like KnowledgeRetention, it is given the France
# conversation (test_messages) rather than the Japan query/response pair.
toxicity = Toxicity(messages=test_messages)

## Classic Metrics

In [2]:
# BertScore is left disabled here; it is not part of the Evaluator metrics list below.
# from indox.IndoxEval import BertScore
# bertscore = BertScore(llm_response=llm_response, retrieval_context=retrieval_context)

In [21]:
from indox.IndoxEval import BLEU
# Configure the BLEU metric with the model response and the retrieved passages.
bleu = BLEU(
    llm_response=llm_response,
    retrieval_context=retrieval_context,
)

In [22]:
from indox.IndoxEval import Rouge
# Configure the Rouge metric with the model response and the retrieved passages.
rouge = Rouge(
    llm_response=llm_response,
    retrieval_context=retrieval_context,
)

In [23]:
from indox.IndoxEval import METEOR
# Configure the METEOR metric with the model response and the retrieved passages.
meteor = METEOR(
    llm_response=llm_response,
    retrieval_context=retrieval_context,
)

In [24]:
from indox.IndoxEval import Evaluator
# Bundle the IndoxApi model with every metric configured above.
# The evaluation log lists metrics in this same order, so the list order is kept as-is.
evaluator = Evaluator(
    model=llm,
    metrics=[
        contextual,
        faithfulness_evaluator,
        answer_relevancy_metric,
        hallucination,
        toxicity,
        geval,
        knowledge_retention,
        bias_metric,
        bleu,
        rouge,
        meteor,
    ],
)

[32mINFO[0m: [1mEvaluator initialized with model and metrics.[0m
[32mINFO[0m: [1mModel set for all metrics.[0m


In [25]:
# Run all configured metrics. The returned mapping is keyed by metric name
# (e.g. 'contextual_relevancy', 'faithfulness', 'BLEU'), as shown when `result` is displayed.
result = evaluator.evaluate()

[32mINFO[0m: [1mEvaluating metric: ContextualRelevancy[0m
[32mINFO[0m: [1mCompleted evaluation for metric: ContextualRelevancy[0m
[32mINFO[0m: [1mEvaluating metric: Faithfulness[0m
[32mINFO[0m: [1mCompleted evaluation for metric: Faithfulness[0m
[32mINFO[0m: [1mEvaluating metric: AnswerRelevancy[0m
[32mINFO[0m: [1mCompleted evaluation for metric: AnswerRelevancy[0m
[32mINFO[0m: [1mEvaluating metric: Hallucination[0m
[32mINFO[0m: [1mCompleted evaluation for metric: Hallucination[0m
[32mINFO[0m: [1mEvaluating metric: Toxicity[0m
[32mINFO[0m: [1mCompleted evaluation for metric: Toxicity[0m
[32mINFO[0m: [1mEvaluating metric: GEval[0m
[32mINFO[0m: [1mCompleted evaluation for metric: GEval[0m
[32mINFO[0m: [1mEvaluating metric: KnowledgeRetention[0m
[32mINFO[0m: [1mCompleted evaluation for metric: KnowledgeRetention[0m
[32mINFO[0m: [1mEvaluating metric: Bias[0m
[32mINFO[0m: [1mCompleted evaluation for metric: Bias[0m
[32mINFO[0

In [26]:
# Display the full evaluation result as this cell's output.
result

{'contextual_relevancy': {'verdicts': [{'verdict': 'yes',
    'reason': 'No reason provided'},
   {'verdict': 'yes', 'reason': 'No reason provided'},
   {'verdict': 'no',
    'reason': 'The context provided does not mention anything about the capital of Japan, which is the main focus of the input question.'}],
  'reason': {'reason': 'The score is 0 because the context provided does not mention anything related to the capital of Japan or provide any information about it.'}},
 'faithfulness': {'claims': ['The capital of Japan is Tokyo.'],
  'truths': ['The capital of Japan is Tokyo.'],
  'verdicts': [{'verdict': 'yes', 'reason': None}],
  'reason': 'The score is 1 because the actual output correctly states that the capital of Japan is Tokyo, aligning perfectly with the information presented in the retrieval context.'},
 'answer_relevancy': {'score': 1,
  'reason': 'The score is 1.00 because the answer is perfectly relevant to the input provided. Great job!',
  'statements': [],
  'verdic

In [None]:
# Static copy of the evaluation result, pasted as a dict literal. This cell has no
# execution count and assigns nothing; running it only echoes the literal. Its opening
# entries match the (truncated) output of the `result` cell.
# NOTE(review): several entries look internally inconsistent and are worth checking
# against the metric implementations:
#   - 'contextual_relevancy' has two 'yes' verdicts, yet its reason cites a score of 0.
#   - 'hallucination' verdict 2 is 'no' while its reason says the output does not
#     contradict the context.
#   - 'geval' is a raw JSON string, whereas the other metrics are dicts.
{'contextual_relevancy': {'verdicts': [{'verdict': 'yes',
    'reason': 'No reason provided'},
   {'verdict': 'yes', 'reason': 'No reason provided'},
   {'verdict': 'no',
    'reason': 'The context provided does not mention anything about the capital of Japan, which is the main focus of the input question.'}],
  'reason': {'reason': 'The score is 0 because the context provided does not mention anything related to the capital of Japan or provide any information about it.'}},
 'faithfulness': {'claims': ['The capital of Japan is Tokyo.'],
  'truths': ['The capital of Japan is Tokyo.'],
  'verdicts': [{'verdict': 'yes', 'reason': None}],
  'reason': 'The score is 1 because the actual output correctly states that the capital of Japan is Tokyo, aligning perfectly with the information presented in the retrieval context.'},
 'answer_relevancy': {'score': 1,
  'reason': 'The score is 1.00 because the answer is perfectly relevant to the input provided. Great job!',
  'statements': [],
  'verdicts': []},
 'hallucination': {'score': 0.6666666666666666,
  'reason': "The score is 0.67 because the actual output lacks detail and contradicts the description of Tokyo's mix of modern architecture and traditional temples, as well as bustling districts like Shibuya and Shinjuku in the provided context.",
  'verdicts': [{'verdict': 'yes',
    'reason': "The actual output agrees with the provided context which states that Tokyo is the most populous city in Japan and serves as the country's political and economic center."},
   {'verdict': 'no',
    'reason': 'The actual output lacks detail but does not contradict the provided context which states that Tokyo has been the capital since 1868.'},
   {'verdict': 'no',
    'reason': "The actual output contradicts the provided context which describes Tokyo's mix of modern architecture and traditional temples, as well as bustling districts like Shibuya and Shinjuku."}]},
 'toxicity': {'score': 0,
  'reason': 'The score is 0.00 because there are no reasons provided for toxicity, indicating a non-toxic output.',
  'opinions': [],
  'verdicts': []},
 'geval': '{\n    "score": 8,\n    "reason": "The Rag application response is relevant, accurate, and well-incorporated, but lacks comprehensive contextuality."\n}',
 'knowledge_retention': {'score': 1.0,
  'reason': 'The score is 1.00 because there are no attritions indicating forgetfulness, demonstrating strong knowledge retention.',
  'verdicts': [{'index': 0, 'verdict': 'no', 'reason': None},
   {'index': 1, 'verdict': 'no', 'reason': None},
   {'index': 2, 'verdict': 'no', 'reason': None}],
  'knowledges': [{'Capital of France': 'Paris'},
   {'Capital of France': 'Paris', 'Landmark': 'Eiffel Tower'},
   {'Capital of France': 'Paris',
    'Landmark': 'Eiffel Tower',
    'Famous foods in France': []}]},
 'bias': {'score': 0,
  'reason': 'The score is 0.00 because there are no reasons provided for bias in the actual output, indicating a lack of bias.',
  'opinions': [],
  'verdicts': []},
 'BLEU': {'score': 0.027160352634090387},
 'Rouge': {'score': {'Precision': 0.611111111111111,
   'Recall': 0.21079852230747373,
   'F1-score': 0.3118916732110135}},
 'Meteor': {'score': 0.7132217453882491}}