In [8]:
"""
RQS_eval.py

Reads a specified JSON file and iterates through each object's 'turns' array to find a turn where 'isuser' is true and its following turn.

Usage:
    python RQS_eval.py outputs/llm_responses.json outputs/llm_responses_rqs.json

Arguments:
    --file_path: Path to the JSON file.
    --output_path: Path where the modified JSON file will be saved.
"""

from tools.api_request import request_gemini as request_llm
from tools.db_detail import db_getdesc
from tools.sql_execute import sqlite_execute as execute
import json
from tqdm import tqdm

def sql_evoke(query,db_name):
    result, execution_time ,executable = execute("datasets/cosql_dataset/database/"+db_name+"/"+db_name+".sqlite",query)
    return result 

def get_example(db_name):
    sql_query = "SELECT name FROM sqlite_master WHERE type='table';"
    result = sql_evoke(sql_query,db_name)
    column_example=""
    for table_name in result:
        column_example = column_example + table_name[0] + ":\n"
        sql_get_eg = "SELECT * FROM "+ table_name[0] +" LIMIT 3;"
        table_eg = sql_evoke(sql_get_eg,db_name)
        for table_data in table_eg:
            column_example = column_example + '('
            for column_data in table_data: 
                column_example = column_example + str(column_data) +','
            column_example = column_example[:-1] + ')\n'
    return column_example
    

def ask_ai(db_name, question, answer_pred, answer_gold):
    description = db_getdesc(db_name)
    column_example = get_example(db_name)
    template = """
{database_description}

{user_question}

{system_response}

{reference_answer}

Evaluate the quality of the system's response based on the following criteria. Assign 2 points directly if a criterion does not apply.
Relevance:
0 points: The response is completely irrelevant.
1 point: The response is partially relevant but misses key details.
2 points: The response is fully relevant and addresses the question adequately.
Clarity:
0 points: The response is incomprehensible.
1 point: The response is mostly clear with minor ambiguities.
2 points: The response is very clear and easy to understand.
Completeness:
0 points: The response does not address the question at all.
1 point: The response covers most aspects of the question but lacks some details.
2 points: The response thoroughly addresses all aspects of the question.
Accuracy:
0 points: The response contains factually incorrect information.
1 point: The response is partially accurate with some errors.
2 points: The response is completely accurate.
Utility:
0 points: The response does not meet the user's needs or explain the context of the question.
1 point: The response somewhat meets the user's needs and provides partial explanations.
2 points: The response excellently meets the user's needs and clearly explains the context or ambiguity of the question.
Task:
Classify the Response: Determine if the system response is 'improper'(Non-SQL based user questions), 'unanswerable'(unachievable under existing conditions), or 'ambiguous'(Lack of clarity).
Evaluate Each Criterion: Provide a detailed rationale for the score assigned to each criterion.
Calculate the Total Score: Sum the scores for all criteria.

Output Format:
{{
  "AnswerType": "",
  "Rationale": "",
  "Score": ""
}}
    """
    filled_template = template.format(
        database_description="Database Description:"+ "\nDatabase schema:\n" + description + "\nExamples for each table:"+ column_example,
        user_question="User Question:" + question,
        system_response="System Response:" + answer_pred,
        reference_answer="Reference Answer:" + answer_gold
    )
    
    # print(filled_template)

    messages = [{"role": "user", "content": filled_template}]
    max_attempts = 10
    attempt = 0
    while attempt < max_attempts:
        llm_response = request_llm(messages)
        print("LLM Response:", llm_response)
        
        try:
            response_data = json.loads(llm_response)
            type_ai = response_data.get("AnswerType", "")
            rqs_ai = response_data.get("Score", 0)
            rationale_ai = response_data.get("Rationale", "")
            
            # 检查返回的类型和分数是否符合预期
            if type_ai in ["improper", "unanswerable", "ambiguous"] and int(rqs_ai) >= 0 and int(rqs_ai) <= 10:
                return type_ai, rqs_ai, rationale_ai
            else:
                raise ValueError("Response type or score out of expected range.")
        except (json.JSONDecodeError, KeyError, ValueError, TypeError, Exception) as e:
            print("\033[91mRetry Reason: {}\033[0m".format(str(e)))  # 红色字体提示重试原因
            attempt += 1
    return "\033[91mFailed to obtain valid response\033[0m", 0, "No valid response after {} attempts".format(max_attempts)

def process_turns(file_path, output_path):

    # Open and read the JSON file
    with open(file_path, 'r') as file:
        data = json.load(file)
    
    # Iterate over each object in the file
    for entry in tqdm(data):
        # Check if the 'turns' key exists
        print("__________________")
        if 'turns' in entry:
            turns = entry['turns']
            db_name = entry['db_name']
            length = len(turns)
            # Iterate over each turn in 'turns'
            for i in range(length):
                # Check if the current turn is a user turn
                if turns[i].get('isuser', False):
                    print(i//2)
                    # Output the current user's turn
                    print(turns[i].get('type', ''), "--", turns[i].get('text', ''))
                    # Check and process the next turn (if it exists)
                    if i + 1 < length:
                        next_turn = turns[i + 1]
                        predict_text = next_turn.get('predict', '')
                        # Find the positions of SELECT and the semicolon
                        select_pos = predict_text.upper().find('SELECT')
                        colon_pos = predict_text.find(';', select_pos)
                        if select_pos != -1 and colon_pos != -1:
                            predict_sql = predict_text[select_pos:colon_pos].replace('\n',' ')
                        elif select_pos != -1:
                            predict_sql = predict_text[select_pos:].replace('\n',' ')
                        else:
                            predict_sql = ""
                        # Store the result in a new field 'predict_sql'
                        next_turn['predict_sql'] = predict_sql
                        # Calculate the ratio of the extracted SQL to the entire predict field
                        if len(predict_text) == 0:
                            ratio = 0
                        else:
                            ratio = len(predict_sql) / len(predict_text)
                        if predict_sql != "" and ratio >= 0.5:
                                next_turn['predict_type'] = 'answerable'
                                if turns[i].get('type', '') == 'answerable':
                                    next_turn['RQS'] = "N/A"
                                else:
                                    next_turn['RQS'] = 0
                        else:
                            next_turn['predict_type'] = 'not answerable'
                            # Ask LLM, Get categorized and RQS scored based on database, questions, answers, gold answer
                            type_ai, rqs_ai, rationale_ai = ask_ai(db_name,turns[i].get('text', ''),predict_text,next_turn.get('text', '')) 
                            next_turn['predict_type'] = type_ai
                            next_turn['RQS'] = rqs_ai
                            next_turn['RQS_Rationale'] = rationale_ai
                        print("Next Turn predict_sql:", predict_sql)
                        print("Predict Type:", next_turn['predict_type'])
                    else:
                        print("Next Turn does not exist.")
    
    # Save the modified data to a new JSON file
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile, indent=4)

# Example usage
input_file_path = 'outputs/gemini-1-Copy1.5-flash-llm.json'
output_file_path = 'outputs/rqs_gemini-1-Copy1.5-flash-llm.json.json'
process_turns(input_file_path, output_file_path)


  0%|                                                                                                                                                                                                                                                                             | 0/148 [00:00<?, ?it/s]

__________________
0
answerable -- How many students in the "Student" table are over the age of 20?
Next Turn predict_sql: SELECT COUNT(*) FROM Student WHERE Age > 20
Predict Type: answerable
1
answerable -- How many of those students own pets?
Next Turn predict_sql: SELECT COUNT(DISTINCT S.StuID) FROM Student AS S JOIN Has_Pet AS HP ON S.StuID = HP.StuID WHERE S.Age > 20
Predict Type: answerable
__________________
0
answerable -- Can you list all the continents?
Next Turn predict_sql: SELECT Continent FROM continents
Predict Type: answerable
1
answerable -- Okay, now can you list the number of countries belonging to each continent, including the continent id as well?
Next Turn predict_sql: SELECT c.ContId, c.Continent, COUNT(DISTINCT co.CountryId) AS NumberOfCountries FROM continents c JOIN countries co ON c.ContId = co.Continent GROUP BY c.ContId, c.Continent ORDER BY c.ContId
Predict Type: answerable
2
improper -- Thanks!
request gpt-4-1106-preview


  1%|███▌                                                                                                                                                                                                                                                                 | 2/148 [00:05<06:36,  2.71s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' does not contain a question or request for information, it is simply a polite expression of gratitude. Therefore, the system's response 'You're welcome! Let me know if you have any other questions.' is appropriate and matches the social convention of responding to thanks. Since the user's input does not require a database query or any specific information related to the database, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply in this context.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
unanswerable -- How many customers are there now?
Next Turn predict_sql: SELECT COUNT(*) FROM visitor
Predict Type: answerable
1
answerable -- How many visitors are ther?
Next Turn predict_sql: SELECT COUNT(DISTINCT ID) FROM visitor
Predict Type: answerable
2
answerable -- How many of them have membership levels higher than 4?
Next Tur

  2%|█████▎                                                                                                                                                                                                                                                               | 3/148 [00:09<07:52,  3.26s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' typically indicates that the conversation is concluding and does not contain a question or request for further information. The system's response 'You're welcome! Let me know if you have any other questions.' is appropriate for a conversational closing and does not require any database-related information or action. Therefore, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply as the user did not pose a question related to the database. The system's response is polite and offers assistance for any future inquiries, which is suitable for the context.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
ambiguous -- Which cause has the least enrollment?
Next Turn predict_sql: SELECT c.course_name FROM Courses c JOIN Student_Enrolment_Courses sec ON c.course_id = sec.course_id GROUP BY c.course_name ORDER BY COUNT(sec.student_course

  3%|███████                                                                                                                                                                                                                                                              | 4/148 [00:13<08:47,  3.67s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' does not contain a question or request for information, it is simply an expression of gratitude. Therefore, the system's response 'You're welcome! Let me know if you have any other questions. 😊' is appropriate and polite, matching the social context of the conversation. Since there is no question asked, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply as they are meant to evaluate responses to informational queries or tasks.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
answerable -- Can you show the birth date of player id 200002?
Next Turn predict_sql: SELECT birth_date FROM players WHERE player_id = 200002
Predict Type: answerable
1
answerable -- What is the first name of player id 200001?
Next Turn predict_sql: SELECT first_name FROM players WHERE player_id = 200001
Predict Type: answerable
2
answerable -- Can you sh

  3%|████████▊                                                                                                                                                                                                                                                            | 5/148 [00:17<08:54,  3.73s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' does not contain a question or request for information, it is simply a polite expression of gratitude. The system's response 'You're welcome! Let me know if you have any other questions. 😊' is a courteous acknowledgment and an invitation for further interaction. Since there is no question asked, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply as they are meant to evaluate responses to questions or requests for information.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
answerable -- What is the average age of the losers from all matches?
request gpt-4-1106-preview
LLM Response: {
  "AnswerType": "unanswerable",
  "Rationale": "The system did not provide any response to the user's question. The user asked for the average age of the losers from all matches, which would require calculating the average based on the 'loser_ag

  4%|██████████▌                                                                                                                                                                                                                                                          | 6/148 [00:25<12:24,  5.24s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' does not contain a question or request for information. It is a polite expression of gratitude, typically signaling the end of a conversation. The system's response 'You're welcome! Let me know if you have any other questions. 😊' is appropriate and polite, acknowledging the user's thanks and offering further assistance if needed. Since there is no question asked, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply as they are meant to evaluate responses to questions or requests for information.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
answerable -- which models are lighter than 3500?
Next Turn predict_sql: SELECT DISTINCT   Model FROM car_names AS cn JOIN cars_data AS cd   ON cn.MakeId = cd.Id WHERE   cd.Weight < 3500
Predict Type: answerable
1
answerable -- what is the weight of  BMW?
Next Turn predict_sql: SELECT T3.

  6%|███████████████▊                                                                                                                                                                                                                                                     | 9/148 [00:29<06:48,  2.94s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Great! I appreciate your help. Thanks!' does not contain a question or request for further information. It is a statement of gratitude and closure to the conversation. The system's response is a polite acknowledgment and an offer for further assistance, which is appropriate in a conversational context. Therefore, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply as there is no question being asked that requires a factual or informative response.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
answerable -- Show the name and the release year of the song by the youngest singer
Next Turn predict_sql: SELECT Name, Song_release_year FROM singer ORDER BY Age LIMIT 1
Predict Type: answerable
1
answerable -- Ok. Can you show me the name and the release year of the song by the oldest singer
Next Turn predict_sql: SELECT song_name, song_relea

  7%|█████████████████▌                                                                                                                                                                                                                                                  | 10/148 [00:34<07:41,  3.34s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' does not contain a question or request for information, it is a polite expression of gratitude. The system's response 'You're welcome! Let me know if you have any other questions. 😊' is a courteous acknowledgment of the user's thanks and an offer to assist further, which is appropriate in a conversational context. The reference answer 'goodbye!' is also a valid conversational closure but does not directly address the user's expression of thanks. Since the user's message does not require a database-related response or any specific action, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply in this context.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
answerable -- Which country tv channels some cartoon written by todd casey?
Next Turn predict_sql: SELECT DISTINCT T1.Country FROM TV_Channel AS T1 JOIN Cartoon AS T2 ON T1.id 

  7%|███████████████████▎                                                                                                                                                                                                                                                | 11/148 [00:40<09:00,  3.95s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' typically indicates that the conversation is concluding or that the user is expressing gratitude, rather than asking a question or requesting further information. The system's response 'You're welcome! Let me know if you have any other questions. 😊' is appropriate for a conversational AI that is not limited to SQL-based interactions. Since the user did not provide a query or context that required an SQL-based response or further clarification, the system's response is proper in a general conversational context but improper for an SQL-based query system. Therefore, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply as the user did not pose a question that required an SQL-based answer or any detailed response.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
ambiguous -- What is the weight of id 1?
Next Turn predict_sql: SELECT

  8%|█████████████████████                                                                                                                                                                                                                                               | 12/148 [00:44<08:51,  3.91s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' does not contain a question or request for information, it is a polite expression of gratitude. Therefore, the system's response is appropriate as it acknowledges the user's thanks and offers further assistance. The criteria of relevance, clarity, completeness, accuracy, and utility do not apply in this context because there is no question being asked that requires a detailed or informative response.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
answerable -- What are the codes of the countries that do not speak English?
Next Turn predict_sql: SELECT DISTINCT CountryCode FROM countrylanguage WHERE Language != 'English' ```
Predict Type: answerable
1
answerable -- What are the codes of the countries that do not speak English whose government forms are not Republic?
request gpt-4-1106-preview
LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The sys

  9%|██████████████████████▊                                                                                                                                                                                                                                             | 13/148 [00:53<11:40,  5.19s/it]

LLM Response: {
  "AnswerType": "improper",
  "Rationale": "The user's message 'Thanks!' does not contain a question or request for information, it is a polite closing to a conversation. The system's response 'You're welcome! Let me know if you have any other questions. 😊' is appropriate and polite, matching the social context of the user's message. Since there is no question asked, the criteria for relevance, clarity, completeness, accuracy, and utility do not apply in this context. The system's response is a standard closing interaction and does not require any action or information related to the database schema provided.",
  "Score": "10"
}
Next Turn predict_sql: 
Predict Type: improper
__________________
0
unanswerable -- What is the most popular car color?
request gpt-4-1106-preview


  9%|██████████████████████▊                                                                                                                                                                                                                                             | 13/148 [01:00<10:23,  4.62s/it]

LLM Response: {
  "AnswerType": "unanswerable",
  "Rationale": {
    "Relevance": "The response is fully relevant as it correctly identifies that the database does not contain information about car colors, which is what the user's question pertains to.",
    "Clarity": "The response is very clear and easy to understand. It directly states that the database does not contain the requested information.",
    "Completeness": "The response is complete in the context of the information available in the database. It cannot provide information that does not exist within the given schema.",
    "Accuracy": "The response is completely accurate as it truthfully reflects the absence of data regarding car colors in the database.",
    "Utility": "The response meets the user's needs to the extent possible by explaining that the question cannot be answered with the current database schema."
  },
  "Score": {
    "Relevance": 2,
    "Clarity": 2,
    "Completeness": 2,
    "Accuracy": 2,
    "Utility"




TypeError: int() argument must be a string, a bytes-like object or a real number, not 'dict'