In [1]:
!!pip install ragas langchain_google_genai google-generativeai datasets pandas

import os
import time
import json
import pandas as pd
from datasets import Dataset
from kaggle_secrets import UserSecretsClient

# RAGAS
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_precision
from ragas.llms import LangchainLLMWrapper

# Gemini
from langchain_google_genai import ChatGoogleGenerativeAI
from google.generativeai import configure, embed_content

print("Starting ...")
import warnings
# Silence only DeprecationWarnings whose message mentions LangchainLLMWrapper;
# every other warning is still shown.
warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*LangchainLLMWrapper.*")



# Names of the Kaggle secrets that hold the API keys, in the order they are used.
KEY_ORDER = ["1", "2", "3", "4", "5"]
# Maximum number of dataset rows evaluated with one key before switching to the next.
QUESTIONS_PER_KEY = 7
# Seconds to sleep after each evaluated row.
WAIT_TIME = 61


# Client used to look up the secrets named in KEY_ORDER.
user_secrets = UserSecretsClient()

class NativeGoogleEmbedding:
    """Thin embeddings adapter around ``google.generativeai.embed_content``.

    An instance is assigned to the ``embeddings`` attribute of the RAGAS
    metrics further down in this notebook. NOTE(review): ``embed_query`` /
    ``embed_documents`` are presumably the method names those metrics call;
    confirm against the installed ragas version.

    Args:
        model: Optional embedding model id. When omitted, the class-level
            default (``models/text-embedding-004``) is used, so
            ``NativeGoogleEmbedding()`` behaves exactly as before.
    """

    # Class-level default so instances created without __init__ still work.
    model = "models/text-embedding-004"

    def __init__(self, model=None):
        if model is not None:
            self.model = model

    def embed_query(self, text):
        """Embed one piece of text and return the ``"embedding"`` entry of the response."""
        out = embed_content(
            model=self.model,
            content=text
        )
        return out["embedding"]

    def embed_documents(self, texts):
        """Embed each text in order (one request per text) and return the list of embeddings.

        An empty ``texts`` yields an empty list without calling the API.
        """
        # Delegate to embed_query so the request shape is defined in one place.
        return [self.embed_query(t) for t in texts]


# One shared embedder, handed to every metric that exposes an `embeddings` attribute.
embedding_model = NativeGoogleEmbedding()

metrics = [answer_relevancy, faithfulness, context_precision]
for metric in (candidate for candidate in metrics if hasattr(candidate, "embeddings")):
    metric.embeddings = embedding_model



# Evaluation set: a JSON list of records, each carrying the four fields below.
eval_json_path = "/kaggle/input/rag-evalp2/rag_p2.json"

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default, so non-ASCII text in questions/contexts loads the same everywhere.
with open(eval_json_path, "r", encoding="utf-8") as f:
    items = json.load(f)

# Only these fields are forwarded to the Dataset; a record missing any of
# them raises KeyError here rather than failing later during evaluation.
_EVAL_FIELDS = ("question", "contexts", "answer", "ground_truth")

dataset = Dataset.from_list([
    {field: item[field] for field in _EVAL_FIELDS}
    for item in items
])

TOTAL_ROWS = len(dataset)
print("Dataset loaded. Total rows:", TOTAL_ROWS)


# Per-row results are appended to both files during the evaluation loop.
output_csv = "ragas_gemini_eval_results.csv"
backup_jsonl = "ragas_stepwise.jsonl"

# Start every run from truly empty (0-byte) files.
# Writing an empty DataFrame with to_csv() emits a bare line terminator, which
# leaves the CSV non-empty; the size-based "write header only if the file is
# empty" check used when appending rows would then never write the header.
for _path in (output_csv, backup_jsonl):
    with open(_path, "w"):
        pass


# ---------------------------------------------------------------------------
# Evaluation driver
#
# Walks the dataset one row at a time. Each API key in KEY_ORDER is used for at
# most QUESTIONS_PER_KEY rows; after every row the loop sleeps WAIT_TIME
# seconds (presumably to stay under a per-minute API quota -- confirm).
# Every evaluated row is appended to `output_csv` and `backup_jsonl`
# immediately, so partial results survive an interrupted run.
# ---------------------------------------------------------------------------

# Metric columns copied from the RAGAS result frame into the JSONL backup.
_SCORE_COLUMNS = ("answer_relevancy", "faithfulness", "context_precision")


def _json_safe_score(value):
    """Return `value` as a plain float, or None when it is missing/NaN.

    With raise_exceptions=False a failed metric comes back as NaN, which
    json.dumps would emit as the non-standard token `NaN`; None becomes a
    valid JSON `null` instead.
    """
    return None if pd.isna(value) else float(value)


current_row = 0        # index of the next dataset row to evaluate
csv_started = False    # True once the first row (with header) is in output_csv
failed_rows = []       # 1-based numbers of rows whose evaluation raised

for key_id in KEY_ORDER:

    if current_row >= TOTAL_ROWS:
        break


    print(f" using API KEY #{key_id}")

    ACTIVE_KEY = user_secrets.get_secret(key_id)

    if not ACTIVE_KEY:
        raise ValueError(f"API KEY {key_id} NOT FOUND!")

    # Configure API key for this batch (environment variable for the LangChain
    # client, configure() for the google.generativeai embedding calls).
    os.environ["GOOGLE_API_KEY"] = ACTIVE_KEY
    configure(api_key=ACTIVE_KEY)

    # Setup LLM: a fresh client per key so it is created after the key switch.
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        max_output_tokens=2048
    )
    ragas_llm = LangchainLLMWrapper(llm)

    for m in metrics:
        m.llm = ragas_llm

    # Rows handled by this key: up to QUESTIONS_PER_KEY, clipped to the dataset end.
    batch_end = min(current_row + QUESTIONS_PER_KEY, TOTAL_ROWS)

    while current_row < batch_end:

        print(f"\nRow {current_row+1}/{TOTAL_ROWS} (KEY {key_id})")

        try:
            # Evaluate a one-row dataset; metric failures yield NaN rather than raising.
            result = evaluate(
                dataset=Dataset.from_dict(dataset[current_row:current_row+1]),
                metrics=metrics,
                raise_exceptions=False
            )

            df_step = result.to_pandas()

            # The first successful write creates the file with its header; later
            # writes append without one. Tracking this with a flag (instead of the
            # file size) guarantees exactly one header even if the file was
            # initialised with stray bytes.
            df_step.to_csv(
                output_csv,
                mode="a" if csv_started else "w",
                header=not csv_started,
                index=False
            )
            csv_started = True

            scores = {
                name: _json_safe_score(df_step[name].iloc[0])
                for name in _SCORE_COLUMNS
            }
            missing_scores = [name for name, value in scores.items() if value is None]
            if missing_scores:
                print(f"WARNING: row {current_row+1} has no score for: {', '.join(missing_scores)}")

            row = dataset[current_row]
            record = {
                "question": row["question"],
                "answer": row["answer"],
                "ground_truth": row["ground_truth"],
                **scores
            }

            with open(backup_jsonl, "a") as f:
                f.write(json.dumps(record) + "\n")

            print(f"Saved row {current_row+1}")

        except Exception as e:
            # Best effort: keep going, but remember the row so it is reported at the end.
            failed_rows.append(current_row + 1)
            print(f"ERROR at row {current_row+1}: {e}")

        current_row += 1

        # No need to wait once the final row of the dataset has been processed.
        if current_row < TOTAL_ROWS:
            print(f"Cooldown {WAIT_TIME} sec...\n")
            time.sleep(WAIT_TIME)


if current_row < TOTAL_ROWS:
    print(f"WARNING: ran out of API keys; rows {current_row+1}-{TOTAL_ROWS} were not evaluated.")
if failed_rows:
    print(f"WARNING: rows that raised and were not saved: {failed_rows}")

print("\n evaluation done ! demyn ")



Starting ...
Dataset loaded. Total rows: 35
 using API KEY #1

Row 1/35 (KEY 1)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[1]: OutputParserException(Invalid json output: {"statements": ["Caesar first enters the play in Act 1, Scene 2.", "Antony enters the play in Act 1, Scene 2.", "Calphurnia enters the play in Act 1, Scene 2.", "Portia enters the play in Act 1, Scene 2.", "Decius enters the play in Act 1, Scene 2.", "Cicero enters the play in Act 1, Scene 2.", "Brutus enters the play in Act 1, Scene 2.", "Cassius enters the play in Act 1, Scene 2.", "Casca enters the play in Act 1, Scene 2.", "A Soothsayer enters the play in Act 1, Scene 2.", "Marullus enters the play in Act 1, Scene 2.", "Flavius enters the play in Act 1, Scene 2.", "Commoners enter the play in Act 1, Scene 2.", "Caesar\'s train enters the play in Act 1, Scene 2."]}
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE )


Saved row 1
Cooldown 61 sec...


Row 2/35 (KEY 1)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 2
Cooldown 61 sec...


Row 3/35 (KEY 1)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 3
Cooldown 61 sec...


Row 4/35 (KEY 1)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 4
Cooldown 61 sec...


Row 5/35 (KEY 1)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 5
Cooldown 61 sec...


Row 6/35 (KEY 1)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 6
Cooldown 61 sec...


Row 7/35 (KEY 1)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details..


Saved row 7
Cooldown 61 sec...

 using API KEY #2

Row 8/35 (KEY 2)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 8
Cooldown 61 sec...


Row 9/35 (KEY 2)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 9
Cooldown 61 sec...


Row 10/35 (KEY 2)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 10
Cooldown 61 sec...


Row 11/35 (KEY 2)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 11
Cooldown 61 sec...


Row 12/35 (KEY 2)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 12
Cooldown 61 sec...


Row 13/35 (KEY 2)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 13
Cooldown 61 sec...


Row 14/35 (KEY 2)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 14
Cooldown 61 sec...

 using API KEY #3

Row 15/35 (KEY 3)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 15
Cooldown 61 sec...


Row 16/35 (KEY 3)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 16
Cooldown 61 sec...


Row 17/35 (KEY 3)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 17
Cooldown 61 sec...


Row 18/35 (KEY 3)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 18
Cooldown 61 sec...


Row 19/35 (KEY 3)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 19
Cooldown 61 sec...


Row 20/35 (KEY 3)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 20
Cooldown 61 sec...


Row 21/35 (KEY 3)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details..
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 21
Cooldown 61 sec...

 using API KEY #4

Row 22/35 (KEY 4)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 22
Cooldown 61 sec...


Row 23/35 (KEY 4)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 23
Cooldown 61 sec...


Row 24/35 (KEY 4)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 24
Cooldown 61 sec...


Row 25/35 (KEY 4)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 25
Cooldown 61 sec...


Row 26/35 (KEY 4)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 26
Cooldown 61 sec...


Row 27/35 (KEY 4)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 27
Cooldown 61 sec...


Row 28/35 (KEY 4)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 28
Cooldown 61 sec...

 using API KEY #5

Row 29/35 (KEY 5)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 29
Cooldown 61 sec...


Row 30/35 (KEY 5)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 30
Cooldown 61 sec...


Row 31/35 (KEY 5)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 31
Cooldown 61 sec...


Row 32/35 (KEY 5)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 32
Cooldown 61 sec...


Row 33/35 (KEY 5)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 33
Cooldown 61 sec...


Row 34/35 (KEY 5)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 34
Cooldown 61 sec...


Row 35/35 (KEY 5)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Saved row 35
Cooldown 61 sec...


 evaluation done ! demyn 


In [2]:
!!pip install ragas langchain_google_genai google-generativeai datasets pandas

import os
import time
import json
import pandas as pd
from datasets import Dataset
from kaggle_secrets import UserSecretsClient

# RAGAS
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_precision
from ragas.llms import LangchainLLMWrapper

# Gemini
from langchain_google_genai import ChatGoogleGenerativeAI
from google.generativeai import configure, embed_content

print("Starting ...")
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*LangchainLLMWrapper.*")



# Names of the Kaggle secrets that hold the API keys, in the order they are used.
KEY_ORDER = ["1", "2", "3", "4", "5"]
# Maximum number of dataset rows evaluated with one key.
QUESTIONS_PER_KEY = 7
# Seconds to sleep after each evaluated row.
WAIT_TIME = 61


user_secrets = UserSecretsClient()

# Smoke test: confirm every secret in KEY_ORDER can be fetched and is non-empty.
# An empty value is rejected with the same error the evaluation cell raises,
# so "got N" is only printed for keys that are actually usable.
for key_id in KEY_ORDER:
    ACTIVE_KEY = user_secrets.get_secret(key_id)
    if not ACTIVE_KEY:
        raise ValueError(f"API KEY {key_id} NOT FOUND!")
    print(f"got {key_id}")

Starting ...
got 1
got 2
got 3
got 4
got 5
