In [None]:
!pip install -q groq
!pip install -q together

In [None]:
import gspread
import pandas as pd
import numpy as np
from itertools import cycle
# from groq import Groq
from together import Together
from kaggle_secrets import UserSecretsClient
# Handle used below to read API tokens by name from Kaggle secrets.
user_secrets = UserSecretsClient()


# Alternative setup using two Groq clients (kept for reference, currently disabled):
# token1 = user_secrets.get_secret("GROQ_TOKEN1")
# token2 = user_secrets.get_secret("GROQ_TOKEN2")
# groq_clients = [
#     Groq(api_key=token1),
#     Groq(api_key=token2),
# ]
# # Cycle through clients after every 80 requests
# client_cycle = cycle(groq_clients)





# Active setup: a single Together client built from the TOGETHER_TOKEN secret.
together_token = user_secrets.get_secret("TOGETHER_TOKEN")
together_clients = [
    Together(api_key=together_token),
]
# Endless round-robin iterator over the clients. batch_evaluation() advances it
# after every `requests_per_client` requests; with one client in the list it
# keeps yielding that same client.
client_cycle = cycle(together_clients)


# Function to get completion with batching
def get_completion_3_1(client, messages: list, model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"):
    """
    Send one non-streaming chat-completion request and return the reply text.

    Args:
        client: An object exposing ``chat.completions.create(...)``.
        messages (list): Chat messages in ``{"role": ..., "content": ...}`` form.
        model (str): Identifier of the model that should answer.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    request_options = {
        "model": model,
        "messages": messages,
        "temperature": 0,
        "max_tokens": 1024,
        "top_p": 1,
        "stream": False,
        "stop": ["<|eot_id|>", "<|eom_id|>"],
    }
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content


def initialize_google_sheets():
    """
    Open the project spreadsheet and return its first worksheet.

    Authenticates with the service-account key file
    ``google_cloud_service_account.json`` and opens the spreadsheet by URL.

    Returns:
        The worksheet at index 0 of the spreadsheet.
    """
    sheet_url = 'https://docs.google.com/spreadsheets/d/your_file_id'

    # Authenticate against the Google Sheets API with the service-account credentials
    service_client = gspread.service_account(filename="google_cloud_service_account.json")

    # Open the spreadsheet and hand back its first tab
    return service_client.open_by_url(sheet_url).get_worksheet(0)


def sheets_to_df():
    """
    Download the worksheet contents into a pandas DataFrame.

    Returns:
        pd.DataFrame: Built from the records returned by the worksheet's
        ``get_all_records()`` call.
    """
    # Fetch every record from the first worksheet, then wrap it in a DataFrame
    records = initialize_google_sheets().get_all_records()
    return pd.DataFrame(records)

def load_data_to_the_sheet(sheet_df):
    """
    Replace the worksheet contents with the contents of ``sheet_df``.

    NaN and +/-infinity values are swapped for ``None`` before upload. The
    worksheet is cleared, then the header row and the data rows are appended.

    Args:
        sheet_df (pd.DataFrame): The data to upload; the caller's object is
            not modified (a cleaned copy is uploaded).
    """
    # Swap out values that should not be uploaded as-is
    unsupported_values = [float("nan"), float("inf"), float("-inf")]
    cleaned = sheet_df.replace(unsupported_values, None)

    # Split the frame into a header row and plain-list data rows
    header_row = cleaned.columns.tolist()
    body_rows = cleaned.values.tolist()

    # Wipe the sheet and write everything back
    target = initialize_google_sheets()
    target.clear()
    target.append_row(header_row)
    target.append_rows(body_rows)
    print("Data updated successfully.")

def save_the_score(sheet_df, score: dict):
    """
    Store a score in ``sheet_df`` unless the target cell already holds a value.

    The DataFrame is modified in place; nothing is written to Google Sheets here.

    Args:
        sheet_df (pd.DataFrame): The DataFrame representing the Google Sheets data.
        score (dict): Must contain the keys ``'row'`` (row label),
            ``'score_column'`` (column label) and ``'score'`` (value to store).

    Returns:
        None
    """
    row_label = score['row']
    column_label = score['score_column']
    current = sheet_df.loc[row_label, column_label]

    # A membership test such as `current in [np.nan, None, ""]` never matches a
    # NaN read back from a DataFrame (NaN != NaN, and the cell value is not the
    # np.nan object itself), so missing values are detected with pd.isna instead.
    if isinstance(current, str):
        cell_is_empty = current == ""
    else:
        cell_is_empty = pd.api.types.is_scalar(current) and bool(pd.isna(current))

    if cell_is_empty:
        sheet_df.loc[row_label, column_label] = score['score']


In [None]:
# LangChain configuration
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser




# Output schema: the model is asked for a single field named "score",
# described to it as an integer between 1 and 10.
summary_schema = ResponseSchema(
    name="score",
    description="An integer score between 1 and 10.",
    type="integer"
)

response_schemas = [summary_schema]
# Structured output parser built from the schema list above; batch_evaluation()
# uses it to parse each raw model response.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Retrieve the format instructions; they are substituted into the
# {format_instructions} placeholder of the evaluation prompt.
format_instructions = output_parser.get_format_instructions()

# Evaluation prompt template based on G-Eval.
# Placeholders: {criteria}, {steps}, {document}, {summary}, {metric_name},
# {format_instructions}. All six are filled in by get_geval_score().
EVALUATION_PROMPT_TEMPLATE = """
You will be given one summary written for a source text. Your task is to rate the summary on one metric.
Please make sure you read and understand these instructions very carefully.
Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:
{criteria}

Evaluation Steps:
{steps}

Source Text:
```
{document}
```
Summary:
```
{summary}
```

Evaluation Form (scores ONLY):

- {metric_name}

{format_instructions}
"""

# Metric 1: Relevance
# Both strings are inserted verbatim into the evaluation prompt: the criteria
# text fills {criteria} and the numbered steps fill {steps}.

RELEVANCY_SCORE_CRITERIA = """
Relevance(1-10) - selection of important content from the source. \
The summary should include only important information from the source document. \
Annotators were instructed to penalize summaries which contained redundancies and excess information.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the summary and the source document carefully.
2. Compare the summary to the source document and identify the main points of the article.
3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 10.
"""

# Metric 2: Coherence
# Criteria text inserted into the {criteria} slot of the evaluation prompt.
# A trailing backslash joins the next line without inserting a space, so every
# continued line must end with an explicit space before the backslash.

COHERENCE_SCORE_CRITERIA = """
Coherence(1-10) - the collective quality of all sentences. \
We align this dimension with the DUC quality question of structure and coherence \
whereby "the summary should be well-structured and well-organized. \
The summary should not just be a heap of related information, but should build from sentence to a \
coherent body of information about a topic."
"""

# Step-by-step instructions inserted into the {steps} slot of the evaluation
# prompt. The stated upper bound must match the 1-10 scale in the criteria.
COHERENCE_SCORE_STEPS = """
1. Read the article carefully and identify the main topic and key points.
2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 10, where 1 is the lowest and 10 is the highest based on the Evaluation Criteria.
"""

# Metric 3: Consistency
# Both strings are inserted verbatim into the evaluation prompt: the criteria
# text fills {criteria} and the numbered steps fill {steps}.
# NOTE(review): unlike the other metrics, the steps below do not restate the
# 1-10 range; it appears only in the criteria header.

CONSISTENCY_SCORE_CRITERIA = """
Consistency(1-10) - the factual alignment between the summary and the summarized source. \
A factually consistent summary contains only statements that are entailed by the source document. \
Annotators were also asked to penalize summaries that contained hallucinated facts.
"""

CONSISTENCY_SCORE_STEPS = """
1. Read the article carefully and identify the main facts and details it presents.
2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
3. Assign a score for consistency based on the Evaluation Criteria.
"""

# Metric 4: Fluency
# The rubric bands span the full 1-10 range announced in the header and
# requested by the output schema, so the model is not steered toward 1-3 only.

FLUENCY_SCORE_CRITERIA = """
Fluency(1-10): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
1-3: Poor. The summary has many errors that make it hard to understand or sound unnatural.
4-7: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
8-10: Good. The summary has few or no errors and is easy to read and follow.
"""

# Single-step instruction inserted into the {steps} slot of the evaluation prompt.
FLUENCY_SCORE_STEPS = """
Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 10.
"""


In [None]:
# Number of model requests batch_evaluation() sends through one client before
# it advances `client_cycle` to the next client.
requests_per_client = 40


def get_geval_score(
    client, criteria: str, steps: str, document: str, summary: str, metric_name: str, format_instructions
):
    """
    Ask the model to rate one summary on one G-Eval metric.

    The evaluation prompt template is filled with the given values and sent
    to the model as a single user message.

    Args:
        client: The client used for model inference.
        criteria (str): Text describing the evaluation criteria.
        steps (str): Step-by-step evaluation instructions.
        document (str): The source text the summary was written for.
        summary (str): The summary being rated.
        metric_name (str): Name of the metric shown in the evaluation form.
        format_instructions (str): Output-format instructions appended to the prompt.

    Returns:
        str or None: The raw model reply, or None if the inference call raised.
    """
    template_fields = {
        "criteria": criteria,
        "steps": steps,
        "metric_name": metric_name,
        "document": document,
        "summary": summary,
        "format_instructions": format_instructions,
    }
    template = ChatPromptTemplate.from_template(template=EVALUATION_PROMPT_TEMPLATE)
    rendered = template.format_messages(**template_fields)

    # The rendered prompt travels as one user-role message
    chat_messages = [{"role": "user", "content": rendered[0].content}]

    # Any failure during the model call is reported and signalled with None
    try:
        return get_completion_3_1(client, chat_messages)
    except Exception as e:
        print(f"Error during inference: {e}")
    return None










def _is_blank_cell(value):
    """Return True when a sheet cell holds no usable data (a falsy value or NaN)."""
    # NaN is truthy in Python, yet inside a DataFrame it marks a missing cell.
    if isinstance(value, float) and value != value:
        return True
    return not value


def batch_evaluation(sheet_df):
    """
    Score every un-scored summary in ``sheet_df`` on all evaluation metrics
    and upload the result to Google Sheets.

    For each column whose name starts with ``Score_``, the column immediately
    to its left is read as the summary column. A row is evaluated only when
    its ``prompt`` cell and its summary cell are non-blank and its score cell
    is blank. The metric scores (Relevance, Coherence, Consistency, Fluency,
    in that order) are stored as one double-quoted, comma-separated string.

    If a model call fails (``get_geval_score`` returns None), the current
    state is uploaded and the function returns immediately. Any other
    exception while handling a row is printed and that row's score cell is
    set to None; processing then continues with the next row.

    Args:
        sheet_df (pd.DataFrame): A DataFrame containing a ``prompt`` column and
            summary/score column pairs. Rows are addressed by the integers
            ``0 .. len(sheet_df) - 1``, so a default integer index is expected.

    Returns:
        None: The function updates the DataFrame in place and saves progress to Google Sheets.
    """
    request_counter = 0
    evaluation_metrics = {
        "Relevance": (RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
        "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
        "Consistency": (CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
        "Fluency": (FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
    }
    # next() only reads the module-level iterator, so no `global` declaration is needed.
    client = next(client_cycle)
    score_columns = [col for col in sheet_df.columns if col.startswith('Score_')]
    for score_column in score_columns:
        # The column position does not change per row, so it is resolved once.
        score_position = sheet_df.columns.get_loc(score_column)
        # Position 0 has nothing to its left (index -1 would silently read the
        # last column); a non-integer result means the column name is duplicated.
        if not isinstance(score_position, (int, np.integer)) or score_position == 0:
            print(f"Skipping {score_column}: cannot locate the summary column to its left.")
            continue
        summary_position = score_position - 1

        for row in range(len(sheet_df)):
            try:
                document = sheet_df.loc[row, 'prompt']
                summary = sheet_df.iloc[row, summary_position]
                already_scored = not _is_blank_cell(sheet_df.loc[row, score_column])
                # Proceed only if both the prompt and the corresponding summary exist,
                # and the score has not been previously computed.
                if _is_blank_cell(document) or _is_blank_cell(summary) or already_scored:
                    continue

                metric_scores = []
                for eval_type, (criteria, steps) in evaluation_metrics.items():
                    # Rotate to the next client once the per-client budget is used up.
                    if request_counter >= requests_per_client:
                        client = next(client_cycle)
                        request_counter = 0
                    response = get_geval_score(
                        client, criteria, steps, document, summary, eval_type, format_instructions
                    )
                    if response is None:
                        print("Saving progress due to an error...")
                        load_data_to_the_sheet(sheet_df)  # Export saved data
                        return  # Stop the process
                    request_counter += 1
                    print(response, type(response))
                    output_dict = output_parser.parse(response)
                    if isinstance(output_dict, dict):
                        metric_scores.append(str(output_dict.get('score')))
                # The joined scores are wrapped in literal double quotes before storing.
                sheet_df.loc[row, score_column] = '"' + ','.join(metric_scores) + '"'
            except Exception as e:
                print(f"Unexpected error at row {row}: {e}")
                sheet_df.loc[row, score_column] = None

    load_data_to_the_sheet(sheet_df)

# Run the batch evaluation: download the worksheet into a DataFrame, then
# score it. batch_evaluation() uploads the results itself.
sheet_df = sheets_to_df()
batch_evaluation(sheet_df)
