Installing and importing libraries

In [None]:
%pip install -U langchain-aws

In [None]:
from langchain_aws import ChatBedrock
from langchain_core.prompts import ChatPromptTemplate

import json
import time
import os
from tqdm import tqdm
import yaml

In [None]:
# Load credentials
# The project directory defaults to the original author's location, but it can
# be redirected by setting the KDD19_USERSTUDY_DIR environment variable so the
# notebook runs on another machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "KDD19_USERSTUDY_DIR",
    "/Users/mdewan/Documents/Research Projects/Usefulness IR'24/KDD19_UserStudy",
)
secrets_path = os.path.join(PROJECT_DIR, "secrets.yml")

# Later cells read credentials["bedrock"]["access_key"] and
# credentials["bedrock"]["secret_key"] from the loaded mapping.
with open(secrets_path, 'r') as file:
    credentials = yaml.safe_load(file)


Initializing LLMs (Llama via AWS Bedrock)

In [None]:
# LLaMA 3.3 70B (instruct) served through AWS Bedrock.
# NOTE: every model cell in this section assigns to the same name `llm`, so the
# cell that was executed last decides which model the later cells call.
bedrock_credentials = credentials["bedrock"]
llm = ChatBedrock(
    model_id="us.meta.llama3-3-70b-instruct-v1:0",
    region="us-east-1",
    aws_access_key_id=bedrock_credentials["access_key"],
    aws_secret_access_key=bedrock_credentials["secret_key"],
    # Sampling parameters handed to the model as-is.
    model_kwargs=dict(temperature=0, top_p=1),
)

In [None]:
# LLaMA 3.2 3B (instruct) served through AWS Bedrock.
# NOTE: every model cell in this section assigns to the same name `llm`, so the
# cell that was executed last decides which model the later cells call.
bedrock_credentials = credentials["bedrock"]
llm = ChatBedrock(
    model_id="us.meta.llama3-2-3b-instruct-v1:0",
    region="us-east-1",
    aws_access_key_id=bedrock_credentials["access_key"],
    aws_secret_access_key=bedrock_credentials["secret_key"],
    # Sampling parameters handed to the model as-is.
    model_kwargs=dict(temperature=0, top_p=1),
)

In [None]:
# LLaMA 3.1 8B (instruct) served through AWS Bedrock.
# NOTE: every model cell in this section assigns to the same name `llm`, so the
# cell that was executed last decides which model the later cells call.
bedrock_credentials = credentials["bedrock"]
llm = ChatBedrock(
    model_id="us.meta.llama3-1-8b-instruct-v1:0",
    region="us-east-1",
    aws_access_key_id=bedrock_credentials["access_key"],
    aws_secret_access_key=bedrock_credentials["secret_key"],
    # Sampling parameters handed to the model as-is.
    model_kwargs=dict(temperature=0, top_p=1),
)

Baseline Prompt Section

In [None]:
# Load dataset
# Paths are resolved against PROJECT_DIR, which defaults to the original
# author's directory and can be overridden with the KDD19_USERSTUDY_DIR
# environment variable.
PROJECT_DIR = os.environ.get(
    "KDD19_USERSTUDY_DIR",
    "/Users/mdewan/Documents/Research Projects/Usefulness IR'24/KDD19_UserStudy",
)
data_path = os.path.join(PROJECT_DIR, "QREF", "qref_session.json")
# NOTE(review): the output name mentions llama_3.2_3b, but the model actually
# used is whichever `llm` cell ran last - keep the two in sync.
output_file = os.path.join(PROJECT_DIR, "QREF", "qref_session_llama_3.2_3b.json")

with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


In [None]:
# Build the single prompt string that is sent to the model
def create_prompt(system_message, user_message):
    """Return the system instructions and the user payload separated by one blank line."""
    return "{}\n\n{}".format(system_message, user_message)

BASELINE PROMPT (DNA+CoT)

In [None]:
# Baseline system prompt (DNA+CoT).
# The text lists the per-row attributes, defines the 0-3 usefulness scale, shows
# an input layout and requires a bare JSON object {"<id>": <score>} as output.
# NOTE(review): the layout shown below is JSON, whereas create_batched_input()
# sends indented "key: value" lines per row - confirm the mismatch is intended.
# NOTE(review): the step list mentions <TASK_ID>, but the layout only contains
# session_id - confirm which identifier is meant.
system_message = """You are a search quality rater evaluating the usefulness of clicked documents on a web page clicked by users in a session. You have been given the following session information:
  a.  User: User ID
  b.  Session ID: Unique ID for each session.
  c.  Query ID: Unique ID for each query.
  d.  Query: User submitted query.
  e.  ID: Unique ID for each row.
  f.  Clicked Documents: User clicked URL.
  g.  Title: Title of the clicked documents.
  h.  Content: Brief description of the clicked documents on the web page.
  i.  Rank: Rank of clicked documents on the SERP labeled as 1(high)~10(low).
  j.  Task Satisfaction: User’s satisfaction on overall task labeled as 0(low)~4(high).
  k.  Query Satisfaction: User’s satisfaction on a search query labeled as 0(low)~4(high).
  l.  CTR: The percentage of clicks a document (URL) receives out of total interactions within a specific task and query.
  m.  Session Dwell Time (ms): Time spent on a specific task across all queries and clicked documents.
  n.  Query Dwell Time (ms): Time spent on a specific query.


You must analyze each user-session by considering all the information given above and for each clicked document (URL), provide a usefulness score on an integer scale of 0 to 3 with the following meanings:
3 = Very Useful, very helpful for this query
2 = Fairly Useful, fairly helpful for this query
1 = Somewhat Useful, maybe partly helpful but might contain other irrelevant content
0 = Not Useful at all, should never be shown for this query

Important Instructions:
Consider all the attributes above while deciding on a usefulness score. If certain attributes are unavailable, rely on the available ones to decide. Assign category 1 if the
clicked document is somewhat useful to the task and query but not completely, category 2 if the clicked document presents something very important related to the task and
query but also has some extra information, and category 3 if the clicked document only and entirely refers to the task and document. If none of the above satisfies give it category 0.

You will be provided data in the form of:
[
    {
        "user_id": "<USER>",
        "session_id": "<SESSION_ID>",
        "session_dwell_time": "<SESSION_DWELL_TIME>",
        "session_sat_score": "<SESSION_SAT_SCORE>",
        "query_id": "<QUERY_ID>",
        "query_string_en": "<QUERY_TEXT>",
        "query_dwell_time": "<QUERY_DWELL_TIME>",
        "query_sat_score": "<QUERY_SAT_SCORE>",
        "id": "<ID>",
        "url": "<URL>",
        "title_en": "<TITLE>",
        "content_en": "<CONTENT>",
        "rank": "<RANK>",
        "CTR": "<CLICK_THROUGH_RATE>",
      }
]

For each user <USER> and each task session <TASK_ID>, for each clicked document <URL>, split this problem into steps:
a. Consider ALL the attributes and relative importance of each and decide on a final score <usefulness_i>. Final score must be an integer value only.
b. Prioritise ALL user metrics like CTR, query dwell time and session dwell time as indicators of usefulness.
c. Consider user's intent of the query and session ensuring they align with the title and content.
d. Consider the rank of URLs relation to the user’s query.
e. Consider user’s query satisfaction score and session satisfaction score, which reflect their satisfaction with the result.


ONLY PROVIDE OUTPUT IN GIVEN FORMAT. Directly output the usefulness score for unique ID <ID> as an integer value in the following JSON format:
{\"<id_1>\": <usefulness_1>, ..., \"<id_n>\": <usefulness_n>}

GENERATE USEFULNESS SCORE IN CORRECT OUTPUT FORMAT FOR ALL UNIQUE ID. DO NOT PROVIDE ADDITIONAL TEXT, REASONING, EXAMPLE, OR CODE. 
"""

QREF BASELINE ROUND

In [None]:
# Render a batch of rows as the user part of the prompt
def create_batched_input(batch):
    """Format every row of ``batch`` as a block of indented ``label: value`` lines.

    Each block starts with a newline, every line is indented by eight spaces,
    and the block ends with a newline followed by eight spaces. Blocks are
    joined with a blank line and the combined text is stripped of leading and
    trailing whitespace.

    Raises:
        KeyError: if a row lacks one of the expected keys.
    """
    indent = " " * 8
    # (label shown to the model, key read from the row), in output order.
    fields = (
        ("user", "user_id"),
        ("session_id", "session_id"),
        ("session_dwell_time", "session_dwell_time"),
        ("session_sat_score", "session_sat_score"),
        ("query_id", "query_id"),
        ("query_string_en", "query_string_en"),
        ("query_dwell_time", "query_dwell_time"),
        ("query_sat_score", "query_sat_score"),
        ("id", "id"),
        ("url", "url"),
        ("title_en", "title_en"),
        ("content_en", "content_en"),
        ("rank", "rank"),
        ("CTR", "CTR"),
    )

    blocks = []
    for row in batch:
        body = "\n".join(f"{indent}{label}: {row[key]}" for label, key in fields)
        blocks.append(f"\n{body}\n{indent}")
    return "\n\n".join(blocks).strip()

In [None]:
# Function to recover JSON from a truncated response
def recover_json(truncated_response):
    """Best-effort parse of an LLM reply that may be fenced, padded or cut off.

    Strategy, in order:
      1. Remove a surrounding Markdown code fence (``` or ```json).
      2. Parse the leading JSON value and ignore anything that follows it.
      3. If the reply is an object that was cut off, drop the (possibly
         incomplete) tail after the last comma, close the object with "}" and
         parse again, moving to earlier commas until something parses. The
         last pair is always dropped in this case because a value cut in the
         middle (e.g. "1" out of "12") cannot be told apart from a complete one.

    Args:
        truncated_response: Raw reply text.

    Returns:
        The parsed JSON value, or None when nothing could be recovered
        (including when the argument is not a string).
    """
    if not isinstance(truncated_response, str):
        print(f"Error recovering JSON: expected str, got {type(truncated_response).__name__}")
        return None

    text = truncated_response.strip()
    # str.strip('```json') would remove any of the characters `, j, s, o, n from
    # both ends instead of the literal fence, so the fence is removed explicitly.
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()
    if not text:
        return None

    # A complete value followed by stray text: take the value, ignore the rest.
    try:
        value, _ = json.JSONDecoder().raw_decode(text)
        return value
    except json.JSONDecodeError:
        pass

    # A truncated object: no prefix of it is valid JSON on its own, so cut at a
    # comma and add the missing closing brace.
    if text.startswith("{"):
        cut = text.rfind(",")
        while cut > 0:
            try:
                return json.loads(text[:cut].rstrip() + "}")
            except json.JSONDecodeError:
                cut = text.rfind(",", 0, cut)
    return None


In [None]:
# Function to process a batch with retry and truncated response handling
def process_batch_with_retry(batch, retries=3, backoff_factor=2):
    """Score one batch of rows with the LLM and return the parsed JSON reply.

    The prompt is built from the module-level ``system_message`` and the rows
    in ``batch`` and sent to the module-level ``llm``.

    * A reply that does not end with "}" is treated as truncated: first
      ``recover_json`` is tried; if that yields nothing and the batch has at
      least two rows, the batch is split in half, each half is processed
      recursively and the results are merged. A single row that still cannot
      be parsed yields an empty dict.
    * Throttling / rate-limit errors are retried with exponential backoff
      (capped at 30 seconds); any other error, or throttling on the final
      attempt, is re-raised.

    Args:
        batch: List of row dicts accepted by ``create_batched_input``.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential wait between attempts.

    Returns:
        The parsed reply (expected to be ``{"<id>": <score>}``); ``{}`` when
        nothing could be obtained.

    Raises:
        Exception: whatever the LLM call or ``json.loads`` raised once retries
            are exhausted or the error is not a throttling error.
    """
    for attempt in range(retries):
        try:
            batched_user_message = create_batched_input(batch)
            full_prompt = create_prompt(system_message, batched_user_message)

            # invoke() is the supported entry point (predict() is deprecated);
            # chat models return a message object whose text is in `.content`.
            response = llm.invoke(full_prompt)
            response_text = getattr(response, "content", response)
            if not isinstance(response_text, str):
                response_text = str(response_text)
            print(f"Raw LLM response:\n{response_text}")

            # Remove a literal Markdown fence. str.strip('```json') strips a
            # *set of characters*, not the fence itself.
            cleaned_response = response_text.strip()
            if cleaned_response.startswith("```"):
                cleaned_response = cleaned_response[3:]
                if cleaned_response[:4].lower() == "json":
                    cleaned_response = cleaned_response[4:]
            if cleaned_response.endswith("```"):
                cleaned_response = cleaned_response[:-3]
            cleaned_response = cleaned_response.strip()

            if not cleaned_response.endswith("}"):
                print("Warning: Truncated response detected.")
                recovered_response = recover_json(cleaned_response)
                if isinstance(recovered_response, dict) and recovered_response:
                    print("Recovered partial JSON response successfully.")
                    return recovered_response

                if len(batch) < 2:
                    # Nothing left to split; halving a one-row batch would
                    # produce a zero step and make range() raise.
                    print("Unable to recover truncated response and the batch cannot be split further.")
                    return {}

                print("Unable to recover truncated response. Retrying with smaller batch size.")
                half = len(batch) // 2
                merged = {}
                for small_batch in (batch[:half], batch[half:]):
                    partial = process_batch_with_retry(small_batch, retries, backoff_factor)
                    # Keep the scores of the halves instead of discarding them.
                    if isinstance(partial, dict):
                        merged.update(partial)
                return merged

            return json.loads(cleaned_response)

        except Exception as e:
            error_text = str(e).lower()
            # AWS Bedrock reports throttling as "ThrottlingException ... Too many
            # requests", which the plain "Rate limit" check never matched.
            throttled = any(
                marker in error_text
                for marker in ("rate limit", "throttlingexception", "too many requests")
            )
            if throttled and attempt < retries - 1:
                wait_time = min(backoff_factor ** attempt, 30)
                print(f"Rate limit hit. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed after {attempt + 1} attempts. Error: {e}")
                raise

    # Only reachable when retries <= 0; return a value the caller can merge.
    return {}


In [None]:
# Check for partial results
# If a results file already exists it is loaded so an interrupted run can resume.
if os.path.exists(output_file):
    with open(output_file, 'r', encoding='utf-8') as file:
        output = json.load(file)
else:
    output = {}

# Skip already processed rows
# Keys loaded from the JSON file are strings, hence the str() on row['id'].
processed_ids = set(output.keys())
remaining_data = [row for row in data if str(row['id']) not in processed_ids]

# Batch size and configuration
batch_size = 20
# NOTE(review): num_threads is assigned here but not read by the loop below.
num_threads = 1

# Progress bar setup
# Ceiling division: a final, shorter batch still counts as one step.
total_batches = (len(remaining_data) + batch_size - 1) // batch_size
progress = tqdm(total=total_batches, desc="Processing Batches", unit="batch")

start_time = time.time()

for i in range(0, len(remaining_data), batch_size):
    batch = remaining_data[i:i + batch_size]
    try:
        batch_output = process_batch_with_retry(batch)
        # Whatever ids the model returned are merged as-is; rows missing from
        # batch_output stay absent from `output` and are selected again by the
        # resume filter the next time this cell is run.
        output.update(batch_output)

        # Checkpoint after every batch.
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(output, file, ensure_ascii=False, indent=4)

    # json.JSONDecodeError is a subclass of ValueError, so a reply that could
    # not be parsed ends up in this branch.
    except ValueError as ve:
        print(f"Truncated response detected: {ve}")
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(output, file, ensure_ascii=False, indent=4)

    # Any other failure is reported and the loop moves on to the next batch.
    except Exception as e:
        print(f"Error processing batch: {e}")
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(output, file, ensure_ascii=False, indent=4)

    progress.update(1)

progress.close()
end_time = time.time()

# Final save (also creates the file when there was nothing left to process).
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(output, file, ensure_ascii=False, indent=4)

print(f"Experiment completed. Scores saved to '{output_file}'.")
print(f"Total time taken: {(end_time - start_time) / 60:.2f} minutes")


Session Prompt Section

In [None]:
import json
import time
import datetime
import os
import re  # Import regex for JSON cleaning
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage

In [None]:
# Define file paths
# Paths are resolved against PROJECT_DIR, which defaults to the original
# author's directory and can be overridden with the KDD19_USERSTUDY_DIR
# environment variable.
PROJECT_DIR = os.environ.get(
    "KDD19_USERSTUDY_DIR",
    "/Users/mdewan/Documents/Research Projects/Usefulness IR'24/KDD19_UserStudy",
)
data_path = os.path.join(PROJECT_DIR, "Final Files", "filtered_df_1.json")
# NOTE(review): the output name mentions llama_3.2_3B, but the model actually
# used is whichever `llm` cell ran last - keep the two in sync.
output_file = os.path.join(PROJECT_DIR, "Results", "Round 5", "llama_3.2_3B_1.json")

# Load the dataset from JSON file (this rebinds `data` from the baseline section)
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


In [None]:
# Build the full prompt (same definition as in the baseline section; running
# this cell simply rebinds the name)
def create_prompt(system_message, user_message):
    """Return the system instructions and the user payload separated by one blank line."""
    return "{}\n\n{}".format(system_message, user_message)


SESSION PROMPT (DNA+CoT+Personalization)

In [None]:
# Session-level system prompt (DNA+CoT+Personalization).
# Running this cell rebinds `system_message`, replacing the baseline prompt
# defined earlier in the notebook.
# NOTE(review): the layout documented below (session_id / content_en / nested
# "queries" and "urls") is not what create_task_input() sends - that function
# emits an indented outline with Task ID, Summary, Task/Query Relevance and
# URL Dwell Time fields. Confirm that the prompt and the input agree.
system_message = """You are a search quality rater evaluating the usefulness of clicked documents on a web page clicked by users in a session. You have been given the following session information:
  a.  User: User ID
  b.  Session ID: Unique ID for each session.
  c.  Query ID: Unique ID for each query.
  d.  Query: User submitted query.
  e.  ID: Unique ID for each row.
  f.  Clicked Documents: User clicked URL.
  g.  Title: Title of the clicked documents.
  h.  Content: Brief description of the clicked documents on the web page.
  i.  Rank: Rank of clicked documents on the SERP labeled as 1(high)~10(low).
  j.  Task Satisfaction: User’s satisfaction on overall task labeled as 0(low)~4(high).
  k.  Query Satisfaction: User’s satisfaction on a search query labeled as 0(low)~4(high).
  l.  CTR: The percentage of clicks a document (URL) receives out of total interactions within a specific task and query.
  m.  Session Dwell Time (ms): Time spent on a specific task across all queries and clicked documents.
  n.  Query Dwell Time (ms): Time spent on a specific query.


You must analyze each user-session by considering all the information given above and for each clicked document (URL), provide a usefulness score on an integer scale of 0 to 3 with the following meanings:
3 = Very Useful, very helpful for this query
2 = Fairly Useful, fairly helpful for this query
1 = Somewhat Useful, maybe partly helpful but might contain other irrelevant content
0 = Not Useful at all, should never be shown for this query

Important Instructions:
Consider all the attributes above while deciding on a usefulness score. If certain attributes are unavailable, rely on the available ones to decide. Assign category 1 if the
clicked document is somewhat useful to the task and query but not completely, category 2 if the clicked document presents something very important related to the task and
query but also has some extra information, and category 3 if the clicked document only and entirely refers to the task and document. If none of the above satisfies give it category 0.

You will be provided data in the form of:
[
    {
        "user_id": "<USER>",
        "session_id": "<SESSION_ID>",
        "session_dwell_time": "<SESSION_DWELL_TIME>",
        "session_sat_score": "<SESSION_SAT_SCORE>",
        "queries": [
          {
            "query_id": "<QUERY_ID>",
            "query_string_en": "<QUERY_TEXT>",
            "query_dwell_time": "<QUERY_DWELL_TIME>",
            "query_sat_score": "<QUERY_SAT_SCORE>",
            "urls": [
              {
                "id": "<ID>",
                "url": "<URL>",
                "title_en": "<TITLE>",
                "content_en": "<CONTENT>",
                "rank": "<RANK>",
                "CTR": "<CLICK_THROUGH_RATE>",
              }
            ]
          }
        ]
      }
] 

For each user <USER> and each task session <TASK_ID>, for each clicked document <URL>, split this problem into steps:
a. Consider ALL the attributes and relative importance of each and decide on a final score <usefulness_i>. Final score must be an integer value only.
b. Prioritise ALL user metrics like CTR, query dwell time and session dwell time as indicators of usefulness.
c. Consider user's intent of the query and session ensuring they align with the title and content.
d. Consider the rank of URLs relation to the user’s query.
e. Consider user’s query satisfaction score and session satisfaction score, which reflect their satisfaction with the result.


ONLY PROVIDE OUTPUT IN GIVEN FORMAT. Directly output the usefulness score for unique ID <ID> as an integer value in the following JSON format:
{\"<id_1>\": <usefulness_1>, ..., \"<id_n>\": <usefulness_n>}

GENERATE USEFULNESS SCORE IN CORRECT OUTPUT FORMAT FOR ALL UNIQUE ID. DO NOT PROVIDE ADDITIONAL TEXT, REASONING, EXAMPLE, OR CODE. 
"""

SESSION PROMPT ROUND

In [None]:
def create_task_input(task_session):
    """
    Render one task session as an indented plain-text outline for the prompt.

    The outline names the user, then lists every task with its queries and
    each query's clicked URLs. Any field missing from the input is printed as
    'Unknown'. Leading and trailing whitespace of the result is removed.
    """
    lines = [f"User: {task_session.get('user', 'Unknown')}", "Tasks:"]

    for task in task_session.get('tasks', []):
        lines += [
            f"  - Task ID: {task.get('task_id', 'Unknown')}",
            f"    Task Description: {task.get('task', 'Unknown')}",
            f"    Task Dwell Time: {task.get('task_dwell_time', 'Unknown')}",
            f"    Task Satisfaction Score: {task.get('task_sat_score', 'Unknown')}",
            "    Queries:",
        ]

        for query in task.get('queries', []):
            lines += [
                f"      - Query: {query.get('trs_query', 'Unknown')}",
                f"        Query Position: {query.get('query_position', 'Unknown')}",
                f"        Query Dwell Time: {query.get('query_dwell_time', 'Unknown')}",
                f"        Query Satisfaction Score: {query.get('query_sat_score', 'Unknown')}",
                "        Clicked URLs:",
            ]

            for url in query.get('urls', []):
                lines += [
                    f"          - ID: {url.get('id', 'Unknown')}",
                    f"            URL: {url.get('url', 'Unknown')}",
                    f"            Title: {url.get('title_en', 'Unknown')}",
                    f"            Summary: {url.get('summary_en', 'Unknown')}",
                    f"            Rank: {url.get('rank', 'Unknown')}",
                    f"            Task Relevance: {url.get('task_relevance', 'Unknown')}",
                    f"            Query Relevance: {url.get('query_relevance', 'Unknown')}",
                    f"            URL Dwell Time: {url.get('url_dwell_time', 'Unknown')}",
                    f"            Click-Through Rate (CTR): {url.get('CTR', 'Unknown')}",
                ]

    # Every original line ended with "\n"; the trailing one is whitespace and is
    # removed by strip() either way.
    return "\n".join(lines).strip()


In [None]:
# Function to fix JSON formatting before parsing
def fix_json_format(response_content):
    """
    Clean an LLM reply so that it can be handed to ``json.loads``.

    Steps:
      1. Strip whitespace and a surrounding Markdown fence (``` / ```json).
      2. Build the legacy repair: insert ", " between back-to-back objects and
         add a missing leading "{" / trailing "}". If that parses, return it
         unchanged so replies that already worked keep their exact result.
      3. Otherwise cut out the span from the first "{" to the last "}" (drops
         prose the model wrapped around the object). If that still does not
         parse, merge consecutive objects ("{..} {..}" -> "{.., ..}").
      4. If nothing parses, return the legacy repair so the caller's
         ``json.JSONDecodeError`` handling behaves as before.

    Args:
        response_content: Raw reply text.

    Returns:
        The cleaned text; not guaranteed to be valid JSON.
    """
    def _parses(candidate):
        # json.JSONDecodeError is a ValueError subclass.
        try:
            json.loads(candidate)
        except ValueError:
            return False
        return True

    text = response_content.strip()

    # Remove a leading ```json / ``` and a trailing ```
    text = re.sub(r"^```[a-zA-Z]*", "", text).strip()
    text = re.sub(r"```$", "", text).strip()

    # Legacy repair, kept so that previously accepted replies are unaffected.
    legacy = re.sub(r"}\s*{", "}, {", text)
    if not legacy.startswith("{"):
        legacy = "{" + legacy
    if not legacy.endswith("}"):
        legacy = legacy + "}"
    if _parses(legacy):
        return legacy

    # The model added text before/after the object, or emitted several objects.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        embedded = text[start:end + 1]
        if _parses(embedded):
            return embedded
        merged = re.sub(r"}\s*,?\s*{", ", ", embedded)
        if _parses(merged):
            return merged

    return legacy

In [None]:
def process_task_with_retry(task, llm, system_message, retries=3, backoff_factor=2, max_wait=30):
    """
    Send one task to the LLM and return its parsed ``{"<id>": <score>}`` reply.

    Args:
        task: Task payload accepted by ``create_task_input``. The driver loop
            passes a wrapper of the form ``{"tasks": [task]}``.
        llm: Chat model exposing ``invoke``.
        system_message: Instruction text placed before the task input.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential wait after a throttling error.
        max_wait: Upper bound, in seconds, for a single wait.

    Returns:
        The parsed dict, or None when the reply could not be parsed as JSON or
        every attempt produced an empty / non-dict reply.

    Raises:
        Exception: the LLM error, when it is not a throttling error or the
            last attempt was throttled as well.
    """
    # The id used in log messages. A top-level 'task_id' wins (previous
    # behaviour); otherwise look inside the {"tasks": [...]} wrapper, which is
    # what the driver passes - without this every message said "Unknown".
    task_id = task.get('task_id')
    if task_id is None:
        nested_tasks = task.get('tasks') or [{}]
        first_task = nested_tasks[0]
        task_id = first_task.get('task_id', 'Unknown') if isinstance(first_task, dict) else 'Unknown'

    for attempt in range(retries):
        try:
            task_input = create_task_input(task)
            full_prompt = create_prompt(system_message, task_input)

            # Invoke the LLM
            response = llm.invoke(full_prompt)

            # Extract the text response safely
            response_content = getattr(response, "content", "").strip()
            if not response_content:
                print(f"Warning: Empty LLM response for Task ID {task_id}")
                continue

            # Log raw response for debugging
            print(f"\n🔍 Raw LLM Response for Task ID {task_id}:\n{response_content}\n")

            # Fix JSON formatting issues
            response_content = fix_json_format(response_content)

            # Attempt JSON parsing
            try:
                parsed_response = json.loads(response_content)
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for Task ID {task_id}: {e}")
                print("Skipping task due to JSON issues.")
                return None  # Skip this task instead of retrying

            # Ensure response is a dictionary `{ "id": usefulness }`
            if not isinstance(parsed_response, dict):
                print(f"Warning: Unexpected JSON structure from LLM for Task ID {task_id}")
                continue  # Retry

            return parsed_response

        except Exception as e:
            error_text = str(e).lower()
            # AWS Bedrock reports throttling as "ThrottlingException ... Too many
            # requests", which the plain "Rate limit" check never matched.
            throttled = any(
                marker in error_text
                for marker in ("rate limit", "throttlingexception", "too many requests")
            )
            if throttled and attempt < retries - 1:
                wait_time = min(backoff_factor ** attempt, max_wait)
                print(f"⏳ Rate limit hit. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed after {attempt + 1} attempts for Task ID {task_id}. Error: {e}")
                raise  # bare raise keeps the original traceback

    print(f"Task ID {task_id} failed after {retries} attempts.")
    return None


In [None]:
# Load existing `{ "id": usefulness }` dictionary so an interrupted run can resume
if os.path.exists(output_file):
    with open(output_file, 'r', encoding='utf-8') as file:
        output = json.load(file)
else:
    output = {}

# Track already processed IDs.
# Keys of a JSON object are always strings, so every id is normalised with
# str() before it is compared with, or added to, this set. Comparing raw
# (e.g. integer) ids against string keys would never match and would make
# every task look unprocessed on resume.
processed_ids = set(output.keys())

# Extract remaining tasks: a task is kept when at least one of its URLs has no score yet.
# NOTE(review): the wrapper below carries no 'user' key, so create_task_input()
# prints "User: Unknown" for every task - confirm whether the user should be passed on.
remaining_tasks = []
for user_data in data:
    for task in user_data.get("tasks", []):
        all_urls = [str(url["id"]) for query in task.get("queries", []) for url in query.get("urls", [])]
        unprocessed_urls = [url for url in all_urls if url not in processed_ids]

        if unprocessed_urls:
            remaining_tasks.append({"tasks": [task]})

# Bar setup
progress = tqdm(total=len(remaining_tasks), desc="Processing Tasks", unit="task")

start_time = time.time()

# Process each task
for task_session in remaining_tasks:
    # Resolved before the try block so the handlers below can always name the
    # task (previously a failure here left task_id unbound or stale).
    task_id = task_session["tasks"][0].get("task_id", "Unknown")
    try:
        task_output = process_task_with_retry(task_session, llm, system_message)

        if task_output is None:
            print(f"Skipping Task {task_id} due to LLM failure.")
            progress.update(1)
            continue

        # Only ids that belong to this task are copied from the model's reply.
        for task in task_session["tasks"]:
            for query in task.get("queries", []):
                for url_data in query.get("urls", []):
                    url_id = url_data.get("id")

                    # Skip missing ids only; a legitimate id of 0 is kept.
                    if url_id is None or url_id == "":
                        continue

                    url_key = str(url_id)
                    if url_key in processed_ids:
                        continue

                    usefulness_score = task_output.get(url_key)

                    if usefulness_score is not None:
                        output[url_key] = usefulness_score
                        processed_ids.add(url_key)

        # Checkpoint after every task.
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(output, file, ensure_ascii=False, indent=4)

    except ValueError as ve:
        print(f"Truncated response detected for Task {task_id}: {ve}")

    except Exception as e:
        print(f"Error processing Task {task_id}: {e}")

    progress.update(1)

progress.close()
end_time = time.time()

# Final save (skipped when there is nothing to write)
if output:
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(output, file, ensure_ascii=False, indent=4)

print(f"Experiment completed. Scores saved to '{output_file}'.")
print(f"Total time taken: {(end_time - start_time) / 60:.2f} minutes")
