In [1]:
import os

PROJECT_DIR = ''

IS_GOOGLE_COLAB_ENABLED = False

if not IS_GOOGLE_COLAB_ENABLED:
    print('Google Colab is not enabled. Running locally.')

    # Local run: the project lives in a fixed checkout on the developer machine
    PROJECT_DIR = '/Users/ondergormez/Repos/BLM5109_Collective_Learning/03-Project/'

else:
    print('Google Colab is enabled. Running on Google Colab.')

    # Colab run: (re)mount Google Drive and keep the project files on the drive
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

    PROJECT_DIR = '/content/drive/MyDrive/Colab/'

    # Create the project directory on the drive if it is not there yet
    os.makedirs(PROJECT_DIR, exist_ok=True)

# PROJECT_DIR ends with a slash, so the sub-directory names are appended directly
LLM_ANSWERS_DIR = f'{PROJECT_DIR}llm_answers'
RAGAS_SCORES_DIR = f'{PROJECT_DIR}ragas_scores'

# Create both working directories (and any missing parents) up front
for working_dir in (LLM_ANSWERS_DIR, RAGAS_SCORES_DIR):
    os.makedirs(working_dir, exist_ok=True)

Google Colab is not enabled. Running locally.


# Logger Kullanımı

- Oluşturulan kod içerisinde loglama işlemleri için `logging` kütüphanesi kullanılmıştır.
- Böylelikle geliştirme aşamasında hata ayıklama amacıyla eklenen log çıktıları incelenerek hatalar tespit edilebilir. Bu log çıktıları rapor sunulurken kaldırılacaktır.


In [2]:
import logging
import logging.config
import yaml
import textwrap


class CustomFormatter(logging.Formatter):
    """Formatter that wraps long log lines so they fit the screen.

    A formatted message longer than ``WRAP_WIDTH`` characters is re-flowed with
    ``textwrap.wrap``; every line after the first is indented by
    ``CONTINUATION_INDENT`` and a trailing newline is appended to separate the
    multi-line entry from the next one. Shorter messages are returned untouched.
    """

    # maximum length of one output line
    WRAP_WIDTH = 120
    # prefix put in front of every wrapped line except the first
    CONTINUATION_INDENT = ' ' * 4

    def format(self, record):
        """Format ``record`` and wrap the result if it exceeds ``WRAP_WIDTH``.

        Args:
            record: the ``logging.LogRecord`` to format.

        Returns:
            The formatted message; wrapped and indented when it is too long.
        """
        formatted_message = super().format(record)

        if len(formatted_message) <= self.WRAP_WIDTH:
            return formatted_message

        wrapped_lines = textwrap.wrap(formatted_message, width=self.WRAP_WIDTH)

        # textwrap drops trailing/pure whitespace, so an over-long message can still come back
        # as zero or one line; there is nothing to indent then and the message is kept as is
        if len(wrapped_lines) < 2:
            return formatted_message

        # indent the wrapped lines but not the first line
        first_line = wrapped_lines[0]
        rest = "\n".join(wrapped_lines[1:])
        return first_line + '\n' + textwrap.indent(rest, self.CONTINUATION_INDENT) + '\n'


LOGGER_CONFIG_FILE = PROJECT_DIR + 'logger_config.yaml'

# Load the YAML logging configuration and apply it (safe_load reads the open stream directly)
with open(LOGGER_CONFIG_FILE, 'r') as file:
    config = yaml.safe_load(file)
    logging.config.dictConfig(config)

# Replace the formatter of every stream handler with the wrapping CustomFormatter, reusing the
# format string and date format that the YAML configuration assigned to the handler.
# Note: logging.FileHandler subclasses StreamHandler, so file handlers are re-formatted as well.
logger = logging.getLogger("default_logger")
for handler in logger.handlers:
    if isinstance(handler, logging.StreamHandler):
        configured_formatter = handler.formatter
        # a handler declared without a formatter has formatter=None; fall back to the
        # logging defaults instead of failing on the attribute access
        has_formatter = configured_formatter is not None
        handler.setFormatter(CustomFormatter(
            configured_formatter._fmt if has_formatter else None,
            datefmt=configured_formatter.datefmt if has_formatter else None,
        ))

# INFO for normal runs; switch to logging.DEBUG while developing
logger.setLevel(logging.INFO)


logger.debug("This is a debug message that will be wrapped after 120 characters. " * 10)
logger.info("Logging is configured successfully.")

[INFO] [2025-01-28T00:47:17.693Z] Logging is configured successfully.


In [3]:
cosmos_model_name = "cosmos_dpo"

# One answers file per run; the runs differ in the time stamp at which the answers were
# generated and in the position of the correct answer inside the 15-chunk context
cosmos_llm_answers_task_b_file_names = [
    f'{run_time_stamp}_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_{position}.csv'
    for run_time_stamp, position in (
        ('2025_01_27-13_23_49', 1),
        ('2025_01_27-13_27_58', 8),
        ('2025_01_27-13_32_15', 15),
    )
]

cosmos_llm_answers_task_b_file_names

['2025_01_27-13_23_49_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_1.csv',
 '2025_01_27-13_27_58_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_8.csv',
 '2025_01_27-13_32_15_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_15.csv']

In [4]:
import time


def save_scores(llm_answers_file_name, llm_answers_df):
    """Write the scored answers to a time-stamped CSV file inside RAGAS_SCORES_DIR.

    Args:
        llm_answers_file_name: name of the answers file (without path and extension) the
            scores belong to; it becomes the prefix of the output file name.
        llm_answers_df: dataframe to write; its index is written as well.

    Returns:
        The path of the CSV file that was written.
    """
    # the current time makes the name unique per run and records when the scores were produced
    saved_at = time.strftime("%Y_%m_%d-%H_%M_%S")
    scores_csv_name = f'{llm_answers_file_name}_{saved_at}_ragas_scores.csv'
    scores_csv_path = f'{RAGAS_SCORES_DIR}/{scores_csv_name}'

    llm_answers_df.to_csv(scores_csv_path, index=True)
    logger.info(f"Scores are saved to the file: '{scores_csv_path}'")

    return scores_csv_path

In [5]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness, answer_relevancy
import pandas as pd

import os

# TODO: Replace the api key with the correct one from https://platform.openai.com/settings/organization/api-keys
# os.environ["OPENAI_API_KEY"] = "Your API Key"


def _model_name_for_file(file_name):
    """Return the model whose ``<model>_answer`` column holds the answers stored in ``file_name``.

    The model is recognised from the file name, which must contain 'cosmos' or 'gemma'.

    Raises:
        ValueError: if the file name matches neither model.
    """
    if 'cosmos' in file_name:
        return cosmos_model_name
    if 'gemma' in file_name:
        # NOTE(review): gemma_model_name is not defined in the cells visible here - confirm it
        # is defined before a gemma answers file is passed in
        return gemma_model_name
    raise ValueError(f"Cannot determine the model name from the file name: '{file_name}'")


def get_ragas_scores(file_names):
    """Score every answer of the given answers files with ragas and save the results.

    Each file is read from LLM_ANSWERS_DIR and must provide the columns 'question', 'answer'
    (used as ground truth), 'context' and '<model name>_answer'. Every row is evaluated on its
    own for faithfulness, answer correctness and answer relevancy; the three scores are added
    as columns and the dataframe is written with ``save_scores``.

    Args:
        file_names: names of the answers CSV files inside LLM_ANSWERS_DIR.

    Returns:
        List with the path of the written scores file for each input file, in input order.

    Raises:
        ValueError: if the model cannot be recognised from one of the file names.
    """
    metric_names = ('faithfulness', 'answer_correctness', 'answer_relevancy')
    ragas_scores_file_names = []

    for file_name in file_names:
        logger.info(f"Evaluating ragas score for the file: {file_name}")

        # resolved per file, so an unrecognised file can never silently reuse the model of the
        # previously processed file
        answer_column = f'{_model_name_for_file(file_name)}_answer'

        llm_answers_df = pd.read_csv(LLM_ANSWERS_DIR + '/' + file_name)
        logger.info("Shape of the dataframe: " + str(llm_answers_df.shape))
        print()

        number_of_questions = len(llm_answers_df)
        scores_per_question = []

        for i in range(number_of_questions):
            row = llm_answers_df.iloc[i]

            # ragas expects column-wise lists; 'contexts' is a list of context chunks per sample
            data_sample = {
                'question': [row['question']],
                'ground_truth': [row['answer']],
                'answer': [row[answer_column]],
                'contexts': [[row['context']]],
            }

            dataset = Dataset.from_dict(data_sample)
            score = evaluate(dataset, metrics=[faithfulness, answer_correctness, answer_relevancy])
            logger.info(f"Scores for the question {i + 1} of {number_of_questions}\n: {score}")

            # the dataset holds a single sample, so its scores are in the first result row
            sample_scores = score.to_pandas().iloc[0]
            scores_per_question.append({name: sample_scores.get(name) for name in metric_names})

        # scores are kept in row order, so they can be attached by position without relying on
        # an 'id' column being present or unique
        for name in metric_names:
            llm_answers_df[name] = [scores[name] for scores in scores_per_question]

        # get just file name without path and extension
        temp_fn = os.path.splitext(os.path.basename(file_name))[0]

        fn = save_scores(temp_fn, llm_answers_df)
        ragas_scores_file_names.append(fn)

    return ragas_scores_file_names

In [6]:
# Score every answer in the Cosmos Task B files and keep the paths of the written score CSVs.
# Each question triggers its own ragas evaluation, so this cell is slow: in the recorded run a
# 50-question file took roughly 15-25 minutes.
ragas_scores_file_names = get_ragas_scores(cosmos_llm_answers_task_b_file_names)

[INFO] [2025-01-28T00:47:18.643Z] Evaluating ragas score for the file:
    2025_01_27-13_23_49_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_1.csv

[INFO] [2025-01-28T00:47:18.651Z] Shape of the dataframe: (50, 11)



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:47:45.810Z] Scores for the question 1 of 50 : {'faithfulness': 0.8889, 'answer_correctness':
    0.3223, 'answer_relevancy': 0.7940}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:48:05.867Z] Scores for the question 2 of 50 : {'faithfulness': 0.4000, 'answer_correctness':
    0.6134, 'answer_relevancy': 0.8788}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:48:31.800Z] Scores for the question 3 of 50 : {'faithfulness': 0.0000, 'answer_correctness':
    0.3854, 'answer_relevancy': 0.8655}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:48:55.697Z] Scores for the question 4 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4484, 'answer_relevancy': 0.8575}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:49:12.323Z] Scores for the question 5 of 50 : {'faithfulness': 0.2500, 'answer_correctness':
    0.6785, 'answer_relevancy': 0.9130}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:49:45.437Z] Scores for the question 6 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.2905, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:50:05.515Z] Scores for the question 7 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3498, 'answer_relevancy': 0.7249}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:50:29.789Z] Scores for the question 8 of 50 : {'faithfulness': 0.9000, 'answer_correctness':
    0.3427, 'answer_relevancy': 0.8724}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:50:47.178Z] Scores for the question 9 of 50 : {'faithfulness': 0.8333, 'answer_correctness':
    0.3614, 'answer_relevancy': 0.7998}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:51:15.403Z] Scores for the question 10 of 50 : {'faithfulness': 0.8889, 'answer_correctness':
    0.5439, 'answer_relevancy': 0.8247}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:51:35.055Z] Scores for the question 11 of 50 : {'faithfulness': 0.6000, 'answer_correctness':
    0.5755, 'answer_relevancy': 0.8740}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:52:06.601Z] Scores for the question 12 of 50 : {'faithfulness': 0.9231, 'answer_correctness':
    0.4303, 'answer_relevancy': 0.8520}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:52:23.817Z] Scores for the question 13 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4254, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:54:26.740Z] Scores for the question 14 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3165, 'answer_relevancy': 0.8590}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:54:47.027Z] Scores for the question 15 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5375, 'answer_relevancy': 0.8753}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:55:04.369Z] Scores for the question 16 of 50 : {'faithfulness': 0.8000, 'answer_correctness':
    0.4009, 'answer_relevancy': 0.8664}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:55:30.104Z] Scores for the question 17 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5697, 'answer_relevancy': 0.8586}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:55:53.687Z] Scores for the question 18 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3153, 'answer_relevancy': 0.7609}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:56:15.813Z] Scores for the question 19 of 50 : {'faithfulness': 0.9231, 'answer_correctness':
    0.6427, 'answer_relevancy': 0.8014}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:56:37.418Z] Scores for the question 20 of 50 : {'faithfulness': 0.9000, 'answer_correctness':
    0.3273, 'answer_relevancy': 0.7881}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:57:00.791Z] Scores for the question 21 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.2122, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:57:24.141Z] Scores for the question 22 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3204, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:57:49.533Z] Scores for the question 23 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4437, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:58:14.431Z] Scores for the question 24 of 50 : {'faithfulness': 0.6667, 'answer_correctness':
    0.3458, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:58:38.874Z] Scores for the question 25 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3531, 'answer_relevancy': 0.8398}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:58:52.711Z] Scores for the question 26 of 50 : {'faithfulness': 0.3333, 'answer_correctness':
    0.6406, 'answer_relevancy': 0.7535}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:59:05.017Z] Scores for the question 27 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8434, 'answer_relevancy': 0.8895}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:59:24.884Z] Scores for the question 28 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9753, 'answer_relevancy': 0.8123}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T00:59:49.407Z] Scores for the question 29 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6748, 'answer_relevancy': 0.8072}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:00:09.002Z] Scores for the question 30 of 50 : {'faithfulness': 0.3333, 'answer_correctness':
    0.2178, 'answer_relevancy': 0.7920}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:00:41.529Z] Scores for the question 31 of 50 : {'faithfulness': 0.5000, 'answer_correctness':
    0.4212, 'answer_relevancy': 0.7831}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:00:53.422Z] Scores for the question 32 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9941, 'answer_relevancy': 0.8774}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:01:19.084Z] Scores for the question 33 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5615, 'answer_relevancy': 0.7901}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:01:45.945Z] Scores for the question 34 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3059, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:02:07.282Z] Scores for the question 35 of 50 : {'faithfulness': 0.5000, 'answer_correctness':
    0.3183, 'answer_relevancy': 0.7901}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:02:19.963Z] Scores for the question 36 of 50 : {'faithfulness': 0.0000, 'answer_correctness':
    0.8263, 'answer_relevancy': 0.8780}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:02:54.348Z] Scores for the question 37 of 50 : {'faithfulness': 0.7500, 'answer_correctness':
    0.6344, 'answer_relevancy': 0.8717}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Exception raised in Job[1]: TimeoutError()


[INFO] [2025-01-28T01:05:55.260Z] Scores for the question 38 of 50 : {'faithfulness': 1.0000, 'answer_correctness': nan,
    'answer_relevancy': 0.8464}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:06:17.103Z] Scores for the question 39 of 50 : {'faithfulness': 0.9286, 'answer_correctness':
    0.7198, 'answer_relevancy': 0.7309}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:06:39.019Z] Scores for the question 40 of 50 : {'faithfulness': 0.4286, 'answer_correctness':
    0.7762, 'answer_relevancy': 0.8599}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:07:05.321Z] Scores for the question 41 of 50 : {'faithfulness': 0.9231, 'answer_correctness':
    0.2884, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:07:21.216Z] Scores for the question 42 of 50 : {'faithfulness': 0.3333, 'answer_correctness':
    0.6124, 'answer_relevancy': 0.9234}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:07:37.702Z] Scores for the question 43 of 50 : {'faithfulness': 0.4000, 'answer_correctness':
    0.7007, 'answer_relevancy': 0.9134}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:07:53.388Z] Scores for the question 44 of 50 : {'faithfulness': 0.8750, 'answer_correctness':
    0.2029, 'answer_relevancy': 0.7997}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:08:17.704Z] Scores for the question 45 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3596, 'answer_relevancy': 0.7360}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:08:47.961Z] Scores for the question 46 of 50 : {'faithfulness': 0.9231, 'answer_correctness':
    0.2960, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:09:17.883Z] Scores for the question 47 of 50 : {'faithfulness': 0.6667, 'answer_correctness':
    0.5559, 'answer_relevancy': 0.8595}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:09:35.737Z] Scores for the question 48 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3700, 'answer_relevancy': 0.7558}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:10:11.238Z] Scores for the question 49 of 50 : {'faithfulness': 0.9231, 'answer_correctness':
    0.4170, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:10:30.625Z] Scores for the question 50 of 50 : {'faithfulness': 0.9091, 'answer_correctness':
    0.3342, 'answer_relevancy': 0.7338}

[INFO] [2025-01-28T01:10:30.640Z] Scores are saved to the file: '/Users/ondergormez/Repos/BLM5109_Collective_Learning/03
    -Project/ragas_scores/2025_01_27-
    13_23_49_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_1_2025_01_28-
    01_10_30_ragas_scores.csv'

[INFO] [2025-01-28T01:10:30.641Z] Evaluating ragas score for the file:
    2025_01_27-13_27_58_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_8.csv

[INFO] [2025-01-28T01:10:30.653Z] Shape of the dataframe: (50, 11)



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:10:52.080Z] Scores for the question 1 of 50 : {'faithfulness': 0.3333, 'answer_correctness':
    0.5706, 'answer_relevancy': 0.9148}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:11:05.805Z] Scores for the question 2 of 50 : {'faithfulness': 0.6667, 'answer_correctness':
    0.4204, 'answer_relevancy': 0.8450}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:11:32.467Z] Scores for the question 3 of 50 : {'faithfulness': 0.4167, 'answer_correctness':
    0.6121, 'answer_relevancy': 0.8755}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:11:51.783Z] Scores for the question 4 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5370, 'answer_relevancy': 0.8431}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:12:07.480Z] Scores for the question 5 of 50 : {'faithfulness': 0.4286, 'answer_correctness':
    0.4773, 'answer_relevancy': 0.8582}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:12:24.356Z] Scores for the question 6 of 50 : {'faithfulness': 0.3333, 'answer_correctness':
    0.6981, 'answer_relevancy': 0.8317}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:12:35.406Z] Scores for the question 7 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8404, 'answer_relevancy': 0.8219}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:12:46.218Z] Scores for the question 8 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6172, 'answer_relevancy': 0.8817}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:13:03.004Z] Scores for the question 9 of 50 : {'faithfulness': 0.5000, 'answer_correctness':
    0.4468, 'answer_relevancy': 0.7866}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:13:19.578Z] Scores for the question 10 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9912, 'answer_relevancy': 0.8737}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:13:33.910Z] Scores for the question 11 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6862, 'answer_relevancy': 0.8741}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:13:54.067Z] Scores for the question 12 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8847, 'answer_relevancy': 0.8721}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:14:13.274Z] Scores for the question 13 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6905, 'answer_relevancy': 0.9844}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:14:33.109Z] Scores for the question 14 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3002, 'answer_relevancy': 0.7210}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:14:53.647Z] Scores for the question 15 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8371, 'answer_relevancy': 0.8988}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:15:12.477Z] Scores for the question 16 of 50 : {'faithfulness': 0.0000, 'answer_correctness':
    0.6150, 'answer_relevancy': 0.8709}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:15:28.560Z] Scores for the question 17 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5299, 'answer_relevancy': 0.8369}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:15:39.275Z] Scores for the question 18 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9965, 'answer_relevancy': 0.8673}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:15:53.233Z] Scores for the question 19 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6422, 'answer_relevancy': 0.8610}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:16:05.892Z] Scores for the question 20 of 50 : {'faithfulness': 0.5000, 'answer_correctness':
    0.5287, 'answer_relevancy': 0.8693}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:16:16.814Z] Scores for the question 21 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.2233, 'answer_relevancy': 0.8491}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:16:31.411Z] Scores for the question 22 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6159, 'answer_relevancy': 0.8658}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:16:48.252Z] Scores for the question 23 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6406, 'answer_relevancy': 0.8252}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:17:00.430Z] Scores for the question 24 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8017, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:17:11.754Z] Scores for the question 25 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4857, 'answer_relevancy': 0.8089}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:17:31.345Z] Scores for the question 26 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3332, 'answer_relevancy': 0.8043}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:17:43.379Z] Scores for the question 27 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.7408, 'answer_relevancy': 0.8895}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:18:10.491Z] Scores for the question 28 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9796, 'answer_relevancy': 0.7663}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:18:32.299Z] Scores for the question 29 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8454, 'answer_relevancy': 0.8523}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:18:54.771Z] Scores for the question 30 of 50 : {'faithfulness': 0.7692, 'answer_correctness':
    0.2183, 'answer_relevancy': 0.7920}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:19:18.404Z] Scores for the question 31 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3911, 'answer_relevancy': 0.7485}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:19:30.113Z] Scores for the question 32 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5360, 'answer_relevancy': 0.8414}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:19:47.538Z] Scores for the question 33 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.7999, 'answer_relevancy': 0.7767}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:20:03.538Z] Scores for the question 34 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3953, 'answer_relevancy': 0.9560}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:20:25.543Z] Scores for the question 35 of 50 : {'faithfulness': 0.1111, 'answer_correctness':
    0.3301, 'answer_relevancy': 0.7882}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:20:46.445Z] Scores for the question 36 of 50 : {'faithfulness': 0.9091, 'answer_correctness':
    0.2063, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:21:02.112Z] Scores for the question 37 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5350, 'answer_relevancy': 0.8485}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:21:15.662Z] Scores for the question 38 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8790, 'answer_relevancy': 0.8994}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:21:38.262Z] Scores for the question 39 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6578, 'answer_relevancy': 0.7662}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:21:59.986Z] Scores for the question 40 of 50 : {'faithfulness': 0.6000, 'answer_correctness':
    0.4401, 'answer_relevancy': 0.8599}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:22:30.937Z] Scores for the question 41 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4288, 'answer_relevancy': 0.7303}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:22:54.025Z] Scores for the question 42 of 50 : {'faithfulness': 0.5714, 'answer_correctness':
    0.6373, 'answer_relevancy': 0.8656}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:23:10.206Z] Scores for the question 43 of 50 : {'faithfulness': 0.8333, 'answer_correctness':
    0.6153, 'answer_relevancy': 0.9134}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:23:32.386Z] Scores for the question 44 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6709, 'answer_relevancy': 0.9541}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:23:44.221Z] Scores for the question 45 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8417, 'answer_relevancy': 0.8929}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:24:03.493Z] Scores for the question 46 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.7373, 'answer_relevancy': 0.8423}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:24:19.619Z] Scores for the question 47 of 50 : {'faithfulness': 0.8000, 'answer_correctness':
    0.4822, 'answer_relevancy': 0.8468}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:24:30.107Z] Scores for the question 48 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.7310, 'answer_relevancy': 0.7997}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:24:44.695Z] Scores for the question 49 of 50 : {'faithfulness': 0.6667, 'answer_correctness':
    0.5817, 'answer_relevancy': 0.8672}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:24:55.416Z] Scores for the question 50 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9926, 'answer_relevancy': 0.9193}

[INFO] [2025-01-28T01:24:55.424Z] Scores are saved to the file: '/Users/ondergormez/Repos/BLM5109_Collective_Learning/03
    -Project/ragas_scores/2025_01_27-
    13_27_58_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_8_2025_01_28-
    01_24_55_ragas_scores.csv'

[INFO] [2025-01-28T01:24:55.424Z] Evaluating ragas score for the file:
    2025_01_27-13_32_15_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_15.csv

[INFO] [2025-01-28T01:24:55.432Z] Shape of the dataframe: (50, 11)



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:25:13.834Z] Scores for the question 1 of 50 : {'faithfulness': 0.5000, 'answer_correctness':
    0.6897, 'answer_relevancy': 0.8724}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:25:28.688Z] Scores for the question 2 of 50 : {'faithfulness': 0.7500, 'answer_correctness':
    0.6481, 'answer_relevancy': 0.8335}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:25:52.356Z] Scores for the question 3 of 50 : {'faithfulness': 0.7273, 'answer_correctness':
    0.2147, 'answer_relevancy': 0.8475}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:26:07.945Z] Scores for the question 4 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.7428, 'answer_relevancy': 0.8416}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:26:27.575Z] Scores for the question 5 of 50 : {'faithfulness': 0.2500, 'answer_correctness':
    0.7738, 'answer_relevancy': 0.8582}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:26:43.397Z] Scores for the question 6 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5632, 'answer_relevancy': 0.8136}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:26:55.551Z] Scores for the question 7 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4863, 'answer_relevancy': 0.8340}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:27:05.798Z] Scores for the question 8 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6158, 'answer_relevancy': 0.8817}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:27:19.251Z] Scores for the question 9 of 50 : {'faithfulness': 0.3333, 'answer_correctness':
    0.5674, 'answer_relevancy': 0.8642}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:27:30.722Z] Scores for the question 10 of 50 : {'faithfulness': 0.8000, 'answer_correctness':
    0.6681, 'answer_relevancy': 0.8737}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:27:48.388Z] Scores for the question 11 of 50 : {'faithfulness': 0.7143, 'answer_correctness':
    0.6138, 'answer_relevancy': 0.8533}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:28:05.302Z] Scores for the question 12 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9915, 'answer_relevancy': 0.8767}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:28:29.528Z] Scores for the question 13 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6410, 'answer_relevancy': 0.9270}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:28:43.058Z] Scores for the question 14 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6611, 'answer_relevancy': 0.8592}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:29:03.643Z] Scores for the question 15 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5336, 'answer_relevancy': 0.8990}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:29:25.545Z] Scores for the question 16 of 50 : {'faithfulness': 0.7143, 'answer_correctness':
    0.4664, 'answer_relevancy': 0.8620}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:29:40.656Z] Scores for the question 17 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5660, 'answer_relevancy': 0.8616}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:29:53.049Z] Scores for the question 18 of 50 : {'faithfulness': 0.1667, 'answer_correctness':
    0.6068, 'answer_relevancy': 0.8594}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:30:15.613Z] Scores for the question 19 of 50 : {'faithfulness': 0.8750, 'answer_correctness':
    0.7814, 'answer_relevancy': 0.8579}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:30:30.137Z] Scores for the question 20 of 50 : {'faithfulness': 0.6000, 'answer_correctness':
    0.8164, 'answer_relevancy': 0.9155}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:30:41.859Z] Scores for the question 21 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.2203, 'answer_relevancy': 0.8656}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:30:53.903Z] Scores for the question 22 of 50 : {'faithfulness': 0.6667, 'answer_correctness':
    0.5382, 'answer_relevancy': 0.8901}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:31:08.318Z] Scores for the question 23 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6632, 'answer_relevancy': 0.8252}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:31:23.203Z] Scores for the question 24 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6623, 'answer_relevancy': 0.0000}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:31:47.975Z] Scores for the question 25 of 50 : {'faithfulness': 0.4444, 'answer_correctness':
    0.4625, 'answer_relevancy': 0.8398}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:32:02.116Z] Scores for the question 26 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4089, 'answer_relevancy': 0.9143}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:32:15.714Z] Scores for the question 27 of 50 : {'faithfulness': 0.8000, 'answer_correctness':
    0.6893, 'answer_relevancy': 0.8895}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:32:36.677Z] Scores for the question 28 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.8876, 'answer_relevancy': 0.7696}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:32:58.966Z] Scores for the question 29 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4838, 'answer_relevancy': 0.8523}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:33:12.017Z] Scores for the question 30 of 50 : {'faithfulness': 0.6667, 'answer_correctness':
    0.2229, 'answer_relevancy': 0.9292}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:33:35.734Z] Scores for the question 31 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.4218, 'answer_relevancy': 0.7831}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:33:52.672Z] Scores for the question 32 of 50 : {'faithfulness': 0.4286, 'answer_correctness':
    0.3892, 'answer_relevancy': 0.8536}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:34:15.491Z] Scores for the question 33 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.3709, 'answer_relevancy': 0.8271}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:34:31.304Z] Scores for the question 34 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6430, 'answer_relevancy': 0.8030}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:34:51.675Z] Scores for the question 35 of 50 : {'faithfulness': 0.1111, 'answer_correctness':
    0.3800, 'answer_relevancy': 0.7828}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:35:00.134Z] Scores for the question 36 of 50 : {'faithfulness': 0.0000, 'answer_correctness':
    0.8316, 'answer_relevancy': 0.8156}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:35:21.941Z] Scores for the question 37 of 50 : {'faithfulness': 0.0000, 'answer_correctness':
    0.7862, 'answer_relevancy': 0.8717}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:35:39.663Z] Scores for the question 38 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6658, 'answer_relevancy': 0.8994}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:35:58.300Z] Scores for the question 39 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6892, 'answer_relevancy': 0.7578}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:36:23.310Z] Scores for the question 40 of 50 : {'faithfulness': 0.3333, 'answer_correctness':
    0.4982, 'answer_relevancy': 0.8581}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:36:42.153Z] Scores for the question 41 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5281, 'answer_relevancy': 0.8594}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:37:01.980Z] Scores for the question 42 of 50 : {'faithfulness': 0.2857, 'answer_correctness':
    0.6381, 'answer_relevancy': 0.8656}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:37:25.306Z] Scores for the question 43 of 50 : {'faithfulness': 0.8667, 'answer_correctness':
    0.4866, 'answer_relevancy': 0.9134}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:37:42.478Z] Scores for the question 44 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.5994, 'answer_relevancy': 0.8328}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:37:54.049Z] Scores for the question 45 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9923, 'answer_relevancy': 0.9139}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:38:06.143Z] Scores for the question 46 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9912, 'answer_relevancy': 0.8806}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:38:27.771Z] Scores for the question 47 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.6907, 'answer_relevancy': 0.8804}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:38:39.328Z] Scores for the question 48 of 50 : {'faithfulness': 0.5000, 'answer_correctness':
    0.7285, 'answer_relevancy': 0.8149}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:38:58.614Z] Scores for the question 49 of 50 : {'faithfulness': 0.6667, 'answer_correctness':
    0.5792, 'answer_relevancy': 0.8355}



Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] [2025-01-28T01:39:24.100Z] Scores for the question 50 of 50 : {'faithfulness': 1.0000, 'answer_correctness':
    0.9716, 'answer_relevancy': 0.0000}

[INFO] [2025-01-28T01:39:24.117Z] Scores are saved to the file: '/Users/ondergormez/Repos/BLM5109_Collective_Learning/03
    -Project/ragas_scores/2025_01_27-
    13_32_15_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_15_2025_01_28-
    01_39_24_ragas_scores.csv'



In [7]:
# Display the list of RAGAS score CSV paths, as a bare last expression so the
# notebook renders the value directly instead of going through print().
ragas_scores_file_names

['/Users/ondergormez/Repos/BLM5109_Collective_Learning/03-Project/ragas_scores/2025_01_27-13_23_49_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_1_2025_01_28-01_10_30_ragas_scores.csv',
 '/Users/ondergormez/Repos/BLM5109_Collective_Learning/03-Project/ragas_scores/2025_01_27-13_27_58_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_8_2025_01_28-01_24_55_ragas_scores.csv',
 '/Users/ondergormez/Repos/BLM5109_Collective_Learning/03-Project/ragas_scores/2025_01_27-13_32_15_cosmos_dpo_llm_answers_for_cntx_length_15_and_position_of_correct_answer_15_2025_01_28-01_39_24_ragas_scores.csv']