Installing and importing libraries

In [None]:
%pip install -U langchain-aws

In [None]:
%pip install tqdm
%pip install pandas

In [None]:
from langchain_aws import ChatBedrock
from langchain_core.prompts import ChatPromptTemplate

In [None]:

import json
import time
import os
import re
from tqdm import tqdm
import yaml

In [None]:
#AWS API Keys
# The credentials are read from a local YAML file so they never appear in the notebook.
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
with open("/Users/mdewan/Documents/Research Projects/Usefulness IR'24/KDD19_UserStudy/secrets.yml", 'r', encoding='utf-8') as file:
    credentials = yaml.safe_load(file)

# Fail fast here, with a readable message, instead of a bare KeyError later
# when the ChatBedrock cells index credentials["bedrock"][...].
_bedrock_section = credentials.get("bedrock") if isinstance(credentials, dict) else None
if not isinstance(_bedrock_section, dict) or not all(
    _bedrock_section.get(key) for key in ("access_key", "secret_key")
):
    raise KeyError("secrets.yml must define bedrock.access_key and bedrock.secret_key")

Initializing LLMs (Llama via Bedrock)

In [None]:
# LLaMA 3.3 70B Instruct through Amazon Bedrock (temperature 0, top_p 1).
# This cell binds the notebook-wide `llm`; running another initialization cell
# afterwards rebinds it, so only the most recently executed one is in effect.
bedrock_keys = credentials["bedrock"]
sampling_options = dict(temperature=0, top_p=1)

llm = ChatBedrock(
    model_id="us.meta.llama3-3-70b-instruct-v1:0",
    region="us-east-1",
    aws_access_key_id=bedrock_keys["access_key"],
    aws_secret_access_key=bedrock_keys["secret_key"],
    model_kwargs=sampling_options,
)

In [None]:
# LLaMA 3.2 3B Instruct through Amazon Bedrock (temperature 0, top_p 1).
# This cell binds the notebook-wide `llm`; running another initialization cell
# afterwards rebinds it, so only the most recently executed one is in effect.
bedrock_keys = credentials["bedrock"]
sampling_options = dict(temperature=0, top_p=1)

llm = ChatBedrock(
    model_id="us.meta.llama3-2-3b-instruct-v1:0",
    region="us-east-1",
    aws_access_key_id=bedrock_keys["access_key"],
    aws_secret_access_key=bedrock_keys["secret_key"],
    model_kwargs=sampling_options,
)

In [None]:
# LLaMA 3.1 8B Instruct through Amazon Bedrock (temperature 0, top_p 1).
# This cell binds the notebook-wide `llm`; running another initialization cell
# afterwards rebinds it, so only the most recently executed one is in effect.
bedrock_keys = credentials["bedrock"]
sampling_options = dict(temperature=0, top_p=1)

llm = ChatBedrock(
    model_id="us.meta.llama3-1-8b-instruct-v1:0",
    region="us-east-1",
    aws_access_key_id=bedrock_keys["access_key"],
    aws_secret_access_key=bedrock_keys["secret_key"],
    model_kwargs=sampling_options,
)

Baseline Prompt Section

In [None]:
# Baseline run: the input rows and the JSON file that accumulates the scores.
data_path = "/Users/mdewan/Documents/Research Projects/Usefulness IR'24/KDD19_UserStudy/Final Files/Rubrics Reasoning/QREF/qref_final_reasoning.json"
output_file = "/Users/mdewan/Documents/Research Projects/Usefulness IR'24/KDD19_UserStudy/Final Files/Rubrics Reasoning/QREF/llama_3_3_rubrics_3.json"

# Parse the whole dataset up front; the driver cell iterates over `data`.
with open(data_path, mode='r', encoding='utf-8') as dataset_file:
    data = json.loads(dataset_file.read())


In [None]:
# Define the prompt template
def create_prompt(system_message, user_message):
    """Return the system message and the user message separated by one blank line."""
    return "{}\n\n{}".format(system_message, user_message)

BASELINE PROMPT (DNA+CoT)

In [None]:
# Baseline (DNA + CoT) system prompt. process_batch_with_retry() reads this
# module-level name and prepends it to every batched user message via create_prompt().
# The same name is reassigned later by the session-prompt cell.
#
# NOTE(review): the prompt text is model input, so it is kept verbatim. Points to confirm:
#   - the example record below is not valid JSON (no comma after the "id" entry,
#     trailing comma after "CTR");
#   - create_batched_input() sends plain `key: value` lines rather than JSON, and also
#     sends `query_position` and `avg_url_dwell_time`, which are not described here;
#   - "refers to the task and document" presumably means "task and query" - TODO confirm.
system_message = """You are a search quality rater evaluating the usefulness of clicked documents on a web page clicked by users in a session. You have been given the following session information:
  a.  User: User ID
  b.  Task ID: Unique ID for each task.
  c.  Task Description: A clear explanation of what the user tries to accomplish using the search engine.
  d.  Query: User submitted query.
  e.  ID: Unique ID for each row.
  f.  Clicked Documents: User clicked URL.
  g.  Title: Title of the clicked documents.
  h.  Summary: Brief description of the clicked documents on the web page.
  i.  Rank: Rank of clicked documents on the SERP labeled as 1(high)~30(low).
  j.  Task Relevance: Relevance of clicked documents to the user’s task labeled as 0 (low)~3 (high).
  k.  Query Relevance: Relevance of clicked documents to the user’s query labeled as 0 (low)~3 (high).
  l.  Task Satisfaction: User’s satisfaction on overall task labeled as 1(low)~5(high).
  m.  Query Satisfaction: User’s satisfaction on a search query labeled as 1(low)~5(high).
  n.  CTR: The percentage of clicks a document (URL) receives out of total interactions within a specific task and query.
  o.  URL Dwell Time (ms): Time spent on a clicked document.
  p.  Task Dwell Time (ms): Time spent on a specific task across all queries and clicked documents.
  q.  Query Dwell Time (ms): Time spent on a specific query.


You must analyze each task session by considering all the information given above and for each clicked document (URL), provide a usefulness score on an integer scale of 0 to 3 with the following meanings:
3 = Very Useful, very helpful for this query
2 = Fairly Useful, fairly helpful for this query
1 = Somewhat Useful, maybe partly helpful but might contain other irrelevant content
0 = Not Useful at all, should never be shown for this query

Important Instructions:
Consider all the attributes above while deciding on a usefulness score. If certain attributes are unavailable, rely on the available ones to decide. Assign category 1 if the
clicked document is somewhat useful to the task and query but not completely, category 2 if the clicked document presents something very important related to the task and
query but also has some extra information, and category 3 if the clicked document only and entirely refers to the task and document. If none of the above satisfies give it category 0.

You will be provided data in the form of:
[
  {
    "user": "<USER>",
    "task_id": "<TASK_ID>",
    "task": "<TASK_DESCRIPTION>",
    "task_dwell_time": "<TASK_DWELL_TIME>",
    "task_sat_score": "<TASK_SAT_SCORE>",
    "trs_query": "<QUERY_TEXT>",
    "query_dwell_time": "<QUERY_DWELL_TIME>",
    "query_sat_score": "<QUERY_SAT_SCORE>",
    "id": "<ID>"
    "url": "<URL>",
    "title_en": "<TITLE>",
    "summary_en": "<SUMMARY>",
    "rank": "<RANK>",
    "task_relevance": "<TASK_RELEVANCE>",
    "query_relevance": "<QUERY_RELEVANCE>",
    "url_dwell_time": "<URL_DWELL_TIME>",
    "CTR": "<CLICK_THROUGH_RATE>",
  }
]

For each user <USER> and each task session <TASK_ID>, for each clicked document <URL>, split this problem into steps:
a. Consider ALL the attributes and relative importance of each and decide on a final score <usefulness_i>. Final score must be an integer value only.
b. Prioritise ALL user metrics like CTR, URL dwell time, query dwell time and session dwell time as indicators of usefulness.
c. Consider user's intent of the search and task session ensuring they align with the title and summary.
d. Consider the rank of URLs relation to the user’s query and task description.
e. Consider task relevance and query relevance which indicate how well the result aligns with the user’s overall goal.
f. Consider user’s query satisfaction score and session satisfaction score, which reflect their satisfaction with the result.


ONLY PROVIDE OUTPUT IN GIVEN FORMAT. Directly output the usefulness score for unique ID <ID> as an integer value in the following JSON format:
{\"<id_1>\": <usefulness_1>, ..., \"<id_n>\": <usefulness_n>}

DO NOT PROVIDE ADDITIONAL TEXT, REASONING, EXAMPLE, OR CODE. GENERATE USEFULNESS SCORE IN CORRECT OUTPUT FORMAT FOR ALL UNIQUE ID.
"""

In [None]:
# Function to create batched input
def create_batched_input(batch):
    """
    Render every row of ``batch`` as a block of ``label: value`` lines and glue
    the blocks together into a single user message.

    Each row must support ``row[key]`` for all keys listed below; a missing key
    raises ``KeyError``. The combined text is stripped of leading and trailing
    whitespace before it is returned.
    """
    # (label shown in the message, key read from the row), in output order.
    field_layout = (
        ("id", "id"),
        ("user", "user"),
        ("task_id", "task_id"),
        ("task", "task"),
        ("task_sat_score", "task_sat_score"),
        ("task_dwell_time", "task_dwell_time"),
        ("trs_query", "trs_query"),
        ("query_position", "query_position"),
        ("query_sat_score", "query_sat_score"),
        ("query_dwell_time", "query_dwell_time"),
        ("url", "url"),
        ("title", "title_en"),
        ("summary", "summary_en"),
        ("rank", "rank"),
        ("task_relevance", "task_relevance"),
        ("query_relevance", "query_relevance"),
        ("url_dwell_time", "url_dwell_time"),
        ("avg_url_dwell_time", "avg_url_dwell_time"),
        ("CTR", "CTR"),
    )
    # Every line of a block carries this eight-space indent.
    pad = " " * 8

    def render_row(row):
        body_lines = [f"{pad}{label}: {row[key]}" for label, key in field_layout]
        # A block opens with a newline and closes with a newline plus the indent.
        return "\n" + "\n".join(body_lines) + "\n" + pad

    return "\n\n".join(render_row(row) for row in batch).strip()


In [None]:
# Function to recover JSON from a truncated response
def recover_json(truncated_response):
    """
    Salvage a ``{"id": score}`` mapping from an LLM reply that is truncated or
    surrounded by extra text.

    Two strategies are tried in order:
      1. Decode the first complete JSON object in the text, ignoring anything
         that follows it.
      2. If the object itself is cut off, collect every ``"key": <integer>``
         pair that is followed by ``,`` or ``}``. A trailing pair without such
         a delimiter is dropped because its number may be incomplete.

    Args:
        truncated_response: Reply text, optionally wrapped in a markdown fence.

    Returns:
        A non-empty dict, or None when nothing could be recovered. Non-dict JSON
        values (for example a bare number at the start of the reply) are not
        returned, because callers merge the result into a dict of scores.
    """
    if not isinstance(truncated_response, str):
        return None

    # Remove a leading ```json / ``` fence and a trailing ``` fence. `str.strip`
    # with a multi-character argument strips a *set* of characters, so it cannot
    # be used for this.
    text = re.sub(r"^```[a-zA-Z]*", "", truncated_response.strip()).strip()
    text = re.sub(r"```$", "", text).strip()

    object_start = text.find("{")
    if object_start == -1:
        return None

    # Strategy 1: a complete object followed by junk.
    try:
        candidate, _ = json.JSONDecoder().raw_decode(text, object_start)
    except json.JSONDecodeError:
        candidate = None
    if isinstance(candidate, dict) and candidate:
        return candidate

    # Strategy 2: keep only the pairs that are certainly complete.
    complete_pairs = re.findall(r'"([^"\\]+)"\s*:\s*(-?\d+)(?=\s*[,}])', text[object_start:])
    recovered = {key: int(value) for key, value in complete_pairs}
    return recovered or None

In [None]:
def process_batch_with_retry(batch, retries=3, backoff_factor=2):
    """
    Score a batch of rows with one LLM request and return ``{"id": usefulness}``.

    The rows are rendered with create_batched_input(), combined with the
    module-level ``system_message`` and sent to the module-level ``llm``.

    Handling of imperfect replies:
      * A markdown code fence around the reply is removed.
      * A reply that does not end with ``}`` is treated as truncated: it is
        appended to ``truncated_responses.json`` and passed to recover_json().
        If nothing can be recovered, the batch is split in two and each half is
        processed recursively; the scores of both halves are merged.
      * A truncated, unrecoverable reply for a single-row batch raises
        ``ValueError`` (the batch cannot be split further).

    Args:
        batch: List of row dicts to score together.
        retries: Maximum number of attempts when a rate-limit error occurs.
        backoff_factor: Base of the exponential wait between attempts (capped at 30 s).

    Returns:
        Dict mapping row IDs to scores; empty if ``retries`` is 0.

    Raises:
        ValueError: Unrecoverable truncated reply, or invalid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
        Exception: Any error from the LLM call that is not a retried rate limit.
    """
    for attempt in range(retries):
        try:
            # Combine rows into a single input
            batched_user_message = create_batched_input(batch)
            full_prompt = create_prompt(system_message, batched_user_message)

            # `predict` is deprecated in LangChain; `invoke` returns a message
            # object whose text lives in `.content`.
            response = llm.invoke(full_prompt)
            response_text = getattr(response, "content", response)
            if not isinstance(response_text, str):
                response_text = str(response_text)

            # Print the log raw response
            print(f"Raw LLM response:\n{response_text}")

            # Remove a surrounding markdown fence. `str.strip('```json')` would
            # strip a character set rather than the literal prefix.
            cleaned_response = re.sub(r"^```[a-zA-Z]*", "", response_text.strip()).strip()
            cleaned_response = re.sub(r"```$", "", cleaned_response).strip()

            # Check if the response is truncated
            if not cleaned_response.endswith("}"):
                print("Warning: Truncated response detected.")
                print(f"Truncated response: {cleaned_response}")  # Log for debugging

                # Save truncated responses for manual review
                with open("truncated_responses.json", "a", encoding="utf-8") as file:
                    file.write(cleaned_response + "\n")

                # Attempt to recover JSON
                recovered_response = recover_json(cleaned_response)
                if isinstance(recovered_response, dict) and recovered_response:
                    print("Recovered partial JSON response successfully.")
                    return recovered_response

                # A single row cannot be split; `len(batch) // 2` would be 0 here.
                if len(batch) <= 1:
                    raise ValueError("Truncated response could not be recovered for a single-row batch.")

                # Retry with smaller batch sizes if recovery fails
                print("Unable to recover truncated response. Retrying with smaller batch size.")
                midpoint = len(batch) // 2
                merged_scores = {}
                for small_batch in (batch[:midpoint], batch[midpoint:]):
                    try:
                        partial_scores = process_batch_with_retry(
                            small_batch, retries=retries, backoff_factor=backoff_factor
                        )
                    except ValueError as split_error:
                        # Keep whatever the other half produced.
                        print(f"Sub-batch of {len(small_batch)} rows failed: {split_error}")
                        continue
                    merged_scores.update(partial_scores or {})
                return merged_scores

            # Parse the cleaned response as JSON
            parsed_response = json.loads(cleaned_response)
            print(f"Parsed response: {parsed_response}")  # Log parsed response for debugging
            return parsed_response

        except Exception as e:
            if "Rate limit" in str(e) and attempt < retries - 1:
                wait_time = min(backoff_factor ** attempt, 30)
                print(f"Rate limit hit. Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{retries})")
                time.sleep(wait_time)
            else:
                print(f"Failed after {attempt + 1} attempts. Error: {e}")
                raise

    # Only reachable when `retries` is 0; return an empty mapping so callers
    # can still merge the result.
    return {}


In [None]:
# Resume support: reload scores written by a previous (possibly interrupted) run.
if os.path.exists(output_file):
    with open(output_file, 'r', encoding='utf-8') as file:
        output = json.load(file)  # Load already processed results
else:
    output = {}

# Skip already processed rows
processed_ids = set(output.keys())  # Extract processed unique IDs
remaining_data = [row for row in data if str(row['id']) not in processed_ids]  # Filter remaining rows

# Batch size and processing configuration
batch_size = 1  # Adjust as needed
num_threads = 1  # Use single-threaded processing to minimize rate limits

# Progress bar setup
total_batches = (len(remaining_data) + batch_size - 1) // batch_size
progress = tqdm(total=total_batches, desc="Processing Remaining Batches", unit="batch")

start_time = time.time()

for i in range(0, len(remaining_data), batch_size):
    batch = remaining_data[i:i + batch_size]
    batch_number = i // batch_size + 1
    # IDs the model is allowed to score in this request.
    batch_ids = {str(row['id']) for row in batch}

    try:
        # Log batch details
        print(f"Processing batch {batch_number}/{total_batches} with {len(batch)} rows.")

        # Process the batch
        batch_output = process_batch_with_retry(batch)

        # Log raw output
        print(f"Batch Output:\n{batch_output}")

        if not isinstance(batch_output, dict):
            raise TypeError(f"Expected a dict of scores, got {type(batch_output).__name__}")

        # Keep only scores for rows that were actually sent. A key the model
        # invented (or a literal placeholder) would otherwise be stored and
        # cause that ID to be skipped as "processed" on the next run.
        accepted_scores = {
            str(key): value for key, value in batch_output.items() if str(key) in batch_ids
        }
        ignored_count = len(batch_output) - len(accepted_scores)
        if ignored_count:
            print(f"Ignored {ignored_count} score(s) whose IDs are not part of this batch.")

        # Update the results dictionary
        output.update(accepted_scores)
        save_label = "Intermediate results saved."

    except ValueError as ve:
        print(f"Truncated response detected in batch {batch_number}: {ve}")
        save_label = "Partial results saved after error."

    except Exception as e:
        print(f"Error processing batch {batch_number}: {e}")
        save_label = "Partial results saved after unexpected error."

    # Persist after every batch, successful or not, so no progress is lost.
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(output, file, ensure_ascii=False, indent=4)
    print(f"{save_label} Total entries: {len(output)}")

    progress.update(1)

progress.close()
end_time = time.time()

# Final save
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(output, file, ensure_ascii=False, indent=4)

print(f"Experiment completed. Scores saved to '{output_file}'.")
print(f"Total time taken: {(end_time - start_time) / 60:.2f} minutes")


Session Prompt Section

In [None]:
# Session run: the nested per-user input and the JSON file that accumulates the scores.
# These assignments replace the `data_path`, `output_file` and `data` set by the baseline section.
data_path = "/Users/mdewan/Documents/Research Projects/Usefulness IR'24/KDD19_UserStudy/Final Files/filtered_df_1.json"
output_file = "/Users/mdewan/Documents/Research Projects/Usefulness IR'24/KDD19_UserStudy/Results/Round 5/llama_3.2_3B_1.json"

with open(data_path, mode='r', encoding='utf-8') as dataset_file:
    data = json.loads(dataset_file.read())


In [None]:
# Function for the full prompt
def create_prompt(system_message, user_message):
    """
    Return the system message and the user message separated by one blank line.

    This redefines the function of the same name from the baseline section with
    the same behavior, so the session section can run without the baseline cells.
    """
    return "{}\n\n{}".format(system_message, user_message)

SESSION PROMPT (DNA+CoT+Personalization)

In [None]:
# Session prompt template (DNA + CoT + Personalization).
# Reassigns the module-level `system_message`; the session driver passes it
# explicitly to process_task_with_retry().
#
# NOTE(review): the prompt text is model input, so it is kept verbatim. Points to confirm:
#   - create_task_input() renders the session as indented `Label: value` text, not as
#     the JSON structure shown below, and adds fields that are not described here
#     (query position, average dwell times, total queries, max clicks);
#   - the last line contains "REASONING< EXAMPLES", presumably a typo for "REASONING, EXAMPLES";
#   - "refers to the task and document" presumably means "task and query" - TODO confirm.
system_message = """You are a search quality rater evaluating the usefulness of clicked documents on a web page. You have been given the following information:
  a.  User: User ID
  b.  Task ID: Unique ID for each task.
  c.  Task Description: A clear explanation of what the user tries to accomplish using the search engine.
  d.  Query: User submitted query.
  e.  ID: Unique ID for each row.
  f.  Clicked Documents: User clicked URL.
  g.  Title: Title of the clicked documents.
  h.  Summary: Brief description of the clicked documents on the web page.
  i.  Rank: The rank of clicked documents on the result list.
  j.  Task Relevance: Relevance of clicked documents to the user’s task labeled as 0 (low)~3 (high).
  k.  Query Relevance: Relevance of clicked documents to the user’s query labeled as 0 (low)~3 (high).
  l.  Task Satisfaction: User’s satisfaction on overall task labeled as 1(low)~5(high).
  m.  Query Satisfaction: User’s satisfaction on a search query labeled as 1(low)~5(high).
  n.  URL Click-Through Rate (CTR): The percentage of clicks a document (URL) receives out of total interactions within a specific task and query.
  o.  URL Click Dwell Time: Time spent on a clicked document.
  p.  Task Dwell Time: Time spent on a specific task across all queries and clicked documents.
  q.  Query Dwell Time: Time spent on a specific query.

You must analyze each task session by considering all the information given above and for each clicked document (URL), provide a usefulness score on an integer scale of 0 to 3 with the following meanings:
3 = Very Useful, very helpful for this query
2 = Fairly Useful, fairly helpful for this query
1 = Somewhat Useful, maybe partly helpful but might contain other irrelevant content
0 = Not Useful at all, should never be shown for this query

Important Instructions:
Consider all the attributes above while deciding on a usefulness score. If certain attributes are unavailable, rely on the available ones to decide. Assign category 1 if the
clicked document is somewhat useful to the task and query but not completely, category 2 if the clicked document presents something very important related to the task and
query but also has some extra information, and category 3 if the clicked document only and entirely refers to the task and document. If none of the above satisfies give it category 0.

You will be provided data in the form of:

{
  "instruction": "Analyze each task session <TASK_ID> by each user <USER> and provide usefulness labels for each clicked document <URL>.",
  "data": {
    "user": "<USER>",
    "tasks": [
      {
        "task_id": "<TASK_ID>",
        "task": "<TASK_DESCRIPTION>",
        "task_dwell_time": "<TASK_DWELL_TIME>",
        "task_sat_score": "<TASK_SAT_SCORE>",
        "queries": [
          {
            "trs_query": "<QUERY_TEXT>",
            "query_dwell_time": "<QUERY_DWELL_TIME>",
            "query_sat_score": "<QUERY_SAT_SCORE>",
            "urls": [
              {
                "id": "<ID>",
                "url": "<URL>",
                "title_en": "<TITLE>",
                "summary_en": "<SUMMARY>",
                "rank": "<RANK>",
                "task_relevance": "<TASK_RELEVANCE>",
                "query_relevance": "<QUERY_RELEVANCE>",
                "CTR": "<CLICK_THROUGH_RATE>",
                "url_dwell_time": "<URL_DWELL_TIME>"
              }
            ]
          }
        ]
      }
    ]
  }
}


For each user <USER> and each task session <TASK_ID>, for each clicked document <URL>, split this problem into steps:
a. Consider the underlying intent of the search and task session.
b. Consider the title and summary ensuring they align with the user’s intent.
c. Consider the rank of the result in relation to the user’s query and task description.
d. Consider task relevance and query relevance which indicate how well the result aligns with the user’s overall goal.
e. Consider user’s query satisfaction score and session satisfaction score, which reflect their satisfaction with the result.
f. Consider user search behavior metrics like CTR, URL click dwell time, query dwell time, and session dwell time as indicators of usefulness.
g. Consider all the attributes above and relative importance of each and decide on a final score <usefulness_i>. Final score must be an integer value only.

Directly output the usefulness score for each unique row <ID> as an integer value in the following JSON format:
{\"<id_1>\": <usefulness_1>, ..., \"<id_n>\": <usefulness_n>}.
DO NOT PROVIDE ADDITIONAL REASONING< EXAMPLES OR CODE. Just provide the output.
"""

In [None]:
#Function to get the session input
def create_task_input(task_session):
    """
    Render a session dict (user -> tasks -> queries -> clicked URLs) as an
    indented plain-text outline for the LLM.

    Every field is looked up with ``.get`` and falls back to the text
    ``Unknown`` when absent. The outline is returned with leading and trailing
    whitespace stripped.
    """
    missing = 'Unknown'
    lines = [f"User: {task_session.get('user', missing)}", "Tasks:"]

    for task in task_session.get('tasks', []):
        lines.extend([
            f"  - Task ID: {task.get('task_id', missing)}",
            f"    Task Description: {task.get('task', missing)}",
            f"    Task Dwell Time: {task.get('task_dwell_time', missing)}",
            f"    Task Satisfaction Score: {task.get('task_sat_score', missing)}",
            "    Queries:",
        ])

        for query in task.get('queries', []):
            lines.extend([
                f"      - Query: {query.get('trs_query', missing)}",
                f"        Query Position: {query.get('query_position', missing)}",
                f"        Query Dwell Time: {query.get('query_dwell_time', missing)}",
                f"        Query Satisfaction Score: {query.get('query_sat_score', missing)}",
                "        Clicked URLs:",
            ])

            for url in query.get('urls', []):
                lines.extend([
                    f"          - ID: {url.get('id', missing)}",
                    f"            URL: {url.get('url', missing)}",
                    f"            Title: {url.get('title_en', missing)}",
                    f"            Summary: {url.get('summary_en', missing)}",
                    f"            Rank: {url.get('rank', missing)}",
                    f"            Task Relevance: {url.get('task_relevance', missing)}",
                    f"            Query Relevance: {url.get('query_relevance', missing)}",
                    f"            URL Dwell Time: {url.get('url_dwell_time', missing)}",
                    f"            Avg URL Dwell Time Per User Per Task: {url.get('avg_url_dwell_time_per_user_task', missing)}",
                    f"            Avg URL Dwell Time Per User: {url.get('avg_url_dwell_time_per_user', missing)}",
                    f"            Click-Through Rate (CTR): {url.get('CTR', missing)}",
                    f"            Total Queries Per User Per Task: {url.get('total_queries', missing)}",
                    f"            Max Clicks Per User Per Query: {url.get('max_clicks_per_user_query', missing)}",
                ])

    return "\n".join(lines).strip()

In [None]:
# Function to fix JSON formatting before parsing
def fix_json_format(response_content):
    """
    Clean up an LLM reply so that it is more likely to parse as one JSON object.

    Steps, in order:
      1. Trim whitespace and remove a leading ```` ```lang ```` fence and a
         trailing ```` ``` ```` fence.
      2. If the text contains both an opening and a later closing brace, keep
         only the span from the first ``{`` to the last ``}``; this drops any
         prose the model wrote around the object.
      3. Merge back-to-back objects (``{...} {...}`` or ``{...}, {...}``) into
         one object, since the expected reply is a single flat mapping.
      4. Add a missing opening or closing brace.

    Args:
        response_content: Raw reply text.

    Returns:
        The cleaned text. It is not guaranteed to be valid JSON; callers still
        need to handle ``json.JSONDecodeError``.
    """
    cleaned = response_content.strip()

    # Remove leading ```json (or bare ```) and trailing ```
    cleaned = re.sub(r"^```[a-zA-Z]*", "", cleaned).strip()
    cleaned = re.sub(r"```$", "", cleaned).strip()

    # Drop explanatory text before or after the object.
    first_brace = cleaned.find("{")
    last_brace = cleaned.rfind("}")
    if 0 <= first_brace < last_brace:
        cleaned = cleaned[first_brace:last_brace + 1]

    # Consecutive objects are merged; merely inserting a comma between them
    # would still not be valid top-level JSON.
    cleaned = re.sub(r"\}\s*,?\s*\{", ", ", cleaned)

    # Ensure JSON starts and ends correctly
    if not cleaned.startswith("{"):
        cleaned = "{" + cleaned
    if not cleaned.endswith("}"):
        cleaned = cleaned + "}"

    return cleaned

In [None]:
#Main function with missed entry checks
def _coerce_usefulness(value):
    """Return ``value`` as an int score, or None when it is not an integral number."""
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float) and value.is_integer():
        return int(value)
    if isinstance(value, str) and value.strip().isdecimal():
        return int(value.strip())
    return None


def process_task_with_retry(task, llm, system_message, retries=3, backoff_factor=2, max_wait=30):
    """
    Score all clicked URLs of one task session with a single LLM call.

    Args:
        task: Session dict accepted by create_task_input(); the driver passes
            ``{"tasks": [<task>]}``.
        llm: Chat model exposing ``invoke(prompt)`` whose result has ``.content``.
        system_message: Prompt text placed before the rendered session.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential wait after a rate-limit error.
        max_wait: Upper bound, in seconds, for that wait.

    Returns:
        Dict ``{"<id>": score}`` restricted to IDs 1..3050 and integer scores
        0..3, or None when the reply is not valid JSON or every attempt
        produced an empty or non-dict reply.

    Raises:
        Exception: Any error that is not a retried rate-limit error.
    """
    # The session wrapper has no 'task_id' of its own, so fall back to the
    # first nested task when building log messages.
    nested_tasks = task.get('tasks') or []
    if 'task_id' in task:
        task_label = task['task_id']
    elif nested_tasks and isinstance(nested_tasks[0], dict):
        task_label = nested_tasks[0].get('task_id', 'Unknown')
    else:
        task_label = 'Unknown'

    for attempt in range(retries):
        try:
            task_input = create_task_input(task)
            full_prompt = create_prompt(system_message, task_input)

            # Invoke the LLM
            response = llm.invoke(full_prompt)

            # Extract the text responses
            response_content = getattr(response, "content", "").strip()
            if not response_content:
                print(f"Warning: Empty LLM response for Task ID {task_label}")
                continue  # Retry

            # Print raw response in log for debugging
            print(f"\n🔍 Raw LLM Response for Task ID {task_label}:\n{response_content}\n")

            # Fix JSON formatting issues
            response_content = fix_json_format(response_content)

            # Attempt JSON parsing
            try:
                parsed_response = json.loads(response_content)
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for Task ID {task_label}: {e}")
                print("Skipping task due to JSON issues.")
                return None  # Skip this task instead of retrying

            # Ensure response is a dictionary `{ "id": usefulness }`
            if not isinstance(parsed_response, dict):
                print(f"Warning: Unexpected JSON structure from LLM for Task ID {task_label}")
                continue  # Retry

            # Filter IDs between 1 and 3050 and usefulness scores between 0 and 3.
            # Scores are coerced first so that a string, null or nested value is
            # dropped instead of raising TypeError in the range comparison.
            filtered_response = {}
            for url_id, usefulness in parsed_response.items():
                key = str(url_id)
                if not (key.isdecimal() and 1 <= int(key) <= 3050):
                    continue
                score = _coerce_usefulness(usefulness)
                if score is not None and 0 <= score <= 3:
                    filtered_response[key] = score

            return filtered_response

        except Exception as e:
            if "Rate limit" in str(e) and attempt < retries - 1:
                wait_time = min(backoff_factor ** attempt, max_wait)
                print(f"Rate limit hit. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed after {attempt + 1} attempts for Task ID {task_label}. Error: {e}")
                raise

    print(f"Task ID {task_label} failed after {retries} attempts.")
    return None

# Load existing `{ "id": usefulness }` dictionary so an interrupted run can resume
if os.path.exists(output_file):
    with open(output_file, 'r', encoding='utf-8') as file:
        output = json.load(file)
else:
    output = {}

#Track already processed IDs
processed_ids = set(output.keys())

# Find missing IDs between 1 and 3050
expected_ids = set(str(i) for i in range(1, 3051))  # Ensure string format matches stored IDs
missing_ids = expected_ids - processed_ids  # IDs that need reprocessing

if missing_ids:
    print(f"⚠️ {len(missing_ids)} IDs are missing. They will be reprocessed.")

# Extract remaining tasks (only those with at least one missing ID).
# Each task is queued exactly once, however many of its queries contain
# missing IDs.
remaining_tasks = []
for user_data in data:
    for task in user_data.get("tasks", []):
        task_url_ids = {
            str(url_data.get("id"))
            for query in task.get("queries", [])
            for url_data in query.get("urls", [])
        }
        if task_url_ids & missing_ids:
            # Carry the user along so create_task_input() can print it.
            # Presumably each top-level entry has a "user" key, as in the
            # prompt's data description - TODO confirm against the dataset.
            remaining_tasks.append({
                "user": user_data.get("user", "Unknown"),
                "tasks": [task],
            })

# Progress bar setup
progress = tqdm(total=len(remaining_tasks), desc="Processing Tasks", unit="task")

start_time = time.time()

# Process each task
for task_session in remaining_tasks:
    # Resolved before the `try` so the handlers below can always reference it.
    task_id = task_session["tasks"][0].get("task_id", "Unknown")

    try:
        task_output = process_task_with_retry(task_session, llm, system_message)

        if task_output is None:
            print(f"Skipping Task {task_id} due to LLM failure.")
        else:
            for task in task_session["tasks"]:
                for query in task.get("queries", []):
                    for url_data in query.get("urls", []):
                        raw_id = url_data.get("id")
                        if raw_id is None:
                            continue
                        url_id = str(raw_id)

                        if url_id in processed_ids:
                            continue

                        # Retrieve usefulness score
                        usefulness_score = task_output.get(url_id)

                        if usefulness_score is not None:
                            output[url_id] = usefulness_score  #Store in `{ "id": usefulness }` format
                            processed_ids.add(url_id)

            # Save progress
            with open(output_file, 'w', encoding='utf-8') as file:
                json.dump(output, file, ensure_ascii=False, indent=4)

    except ValueError as ve:
        print(f"Truncated response detected for Task {task_id}: {ve}")

    except Exception as e:
        print(f"Error processing Task {task_id}: {e}")

    progress.update(1)

progress.close()
end_time = time.time()

# Final save
if output:
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(output, file, ensure_ascii=False, indent=4)

print(f"Experiment completed. Scores saved to '{output_file}'.")
print(f"Total time taken: {(end_time - start_time) / 60:.2f} minutes")