In [1]:
from pyserini.search.lucene import LuceneSearcher
import json

# Load the prebuilt index named 'msmarco-v1-passage'; `searcher` is used by
# get_doc_content to look documents up by their id.
searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage')

Nov 05, 2024 4:54:37 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


# API keys: remember to delete them before sharing this notebook

In [2]:
import os

# Credentials are read from environment variables so that no secret has to be
# pasted into (and accidentally committed with) the notebook. When a variable is
# not set, the value falls back to the empty string, exactly as before.

# API key for OpenAI.
API_KEY = os.environ.get('OPENAI_API_KEY', "")
os.environ['OPENAI_API_KEY'] = API_KEY

import replicate

from replicate.client import Client

# API token for Replicate.
replicate_api = Client(api_token=os.environ.get('REPLICATE_API_TOKEN', ""))

# API key for Gemini.
API_KEY_GEMINI = os.environ.get('GOOGLE_API_KEY', "")
import google.generativeai as genai
import os

genai.configure(api_key=API_KEY_GEMINI)

In [3]:
import pandas as pd

# Exploratory relevance-distribution analysis, disabled by wrapping it in a string literal.
# NOTE(review): nothing inside the string runs, but as the last expression of the cell the
# string itself is still evaluated and echoed as the cell output.
"""
# Path to the uploaded file
file_path = './data/2019qrels-pass.txt'

# Load the file into a DataFrame, assuming the separator is a space
df = pd.read_csv(file_path, sep=' ', header=None, names=['Topic', 'Q0', 'DocID', 'Relevance'])

# Drop the 'Q0' column as it's not needed for this analysis
df.drop('Q0', axis=1, inplace=True)

# Group by 'Topic' and 'Relevance' to get the count of each relevance score per topic
# Unstack to pivot the 'Relevance' values into separate columns, filling missing values with 0
relevance_distribution = df.groupby(['Topic', 'Relevance']).size().unstack(fill_value=0)

# Display the first few rows of the distribution DataFrame
# Calculate the proportion of each relevance score within each topic
relevance_proportions = relevance_distribution.div(relevance_distribution.sum(axis=1), axis=0)
"""

"\n# Path to the uploaded file\nfile_path = './data/2019qrels-pass.txt'\n\n# Load the file into a DataFrame, assuming the separator is a space\ndf = pd.read_csv(file_path, sep=' ', header=None, names=['Topic', 'Q0', 'DocID', 'Relevance'])\n\n# Drop the 'Q0' column as it's not needed for this analysis\ndf.drop('Q0', axis=1, inplace=True)\n\n# Group by 'Topic' and 'Relevance' to get the count of each relevance score per topic\n# Unstack to pivot the 'Relevance' values into separate columns, filling missing values with 0\nrelevance_distribution = df.groupby(['Topic', 'Relevance']).size().unstack(fill_value=0)\n\n# Display the first few rows of the distribution DataFrame\n# Calculate the proportion of each relevance score within each topic\nrelevance_proportions = relevance_distribution.div(relevance_distribution.sum(axis=1), axis=0)\n"

In [4]:
import numpy as np

In [5]:
# Instruction text shared by every judge backend: it is sent as the system prompt
# to the GPT and Llama backends and is prepended to the user prompt for Gemini.
# It defines the 0-3 relevance scale and the required JSON output format.
# A trailing backslash continues the string literal on the next line without
# inserting a newline; real line breaks in the prompt are the explicit "\n".
ROLE_DESCRIPTION_HEAD = "\
Given a query and a passage, you must provide a score on an integer scale of 0 to 3 with the following meanings: \n\
0 = represent that the passage has nothing to do with the query, \n\
1 = represents that the passage seems related to the query but does not answer it, \n\
2 = represents that the passage has some answer for the query, but the answer may be a bit unclear, \
or hidden amongst extraneous information and \n\
3 = represents that the passage is dedicated to the query and contains the exact answer. \n\n\
Important Instruction: Assign category 1 if the passage is somewhat related to the topic but not completely, \
category 2 if passage presents something very important related to the entire topic but also has some extra information and \
category 3 if the passage only and entirely refers to the topic. If none of the above satisfies give it category 0.\n\n\
Next, I will provide you with a batch of documents in the form of {<id_1>: <content_1>, ..., <id_n>: <content_n>}. \
You need to assess their relevance scores and output the result. Your output MUST be in the format of a JSON string \
like {\"<id_1>\": <relevance_1>, ..., \"<id_n>\": <relevance_n>}.  Do NOT provide any explain or reasoning. \
"

In [6]:
# Closing instructions appended after the document batch by create_rel_judge_prompt.
# A trailing backslash continues the string literal without inserting a newline.
# NOTE(review): the line ending in "any other content.\" has neither a space nor a "\n"
# before the continuation, so the prompt reads "...other content.Your output MUST..." --
# confirm whether that is intended before changing the prompt text.
ROLE_DESCRIPTION_END = "\n\
For each passage <passage_i>, split the problem into steps: \
Consider the underlying intent of the search. \n\
Measure how well the content matches a likely intent of the query (M). \n\
Decide on a final score <relevance_i>. Final score MUST be an integer value ONLY.\n\
Directly output the relevance score in the form of JSON, do NOT generate any other content.\
Your output MUST be in the form of {\"<id_1>\": <relevance_1>, ..., \"<id_n>\": <relevance_n>}. Do NOT provide any explain or reasoning.\n\n\
"

In [7]:
def get_doc_content(docid):
    """Return the text of a document, trying the search index first and then disk.

    Args:
        docid: Document id. It is passed to ``searcher.doc``; when the index has no
            such document it is treated as a path relative to ``AUG_DOCS_DIR``.

    Returns:
        The value of the ``contents`` field of the indexed document (``None`` when
        the field is absent), or the full text of the file on disk, or ``None``
        when that file does not exist.
    """
    indexed_doc = searcher.doc(docid)
    if indexed_doc is not None:
        # Indexed documents expose a JSON payload; the passage text is under 'contents'.
        payload = json.loads(indexed_doc.raw())
        return payload.get('contents', None)

    # Not in the index: fall back to a text file under the augmented-docs directory.
    augmented_path = os.path.join(AUG_DOCS_DIR, docid)
    try:
        with open(augmented_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        print("No augmented doc file")
        return None
    

def create_rel_judge_prompt(query, example_doc_list):
    """Build the user prompt for one batch of documents to be judged.

    The prompt is ``<question>`` + query, followed by a JSON-like object that maps
    each document id to its text (one entry per line), followed by
    ``ROLE_DESCRIPTION_END``.

    Args:
        query: The query text.
        example_doc_list: Sequence of dicts, each with a ``"docid"`` key.

    Returns:
        The prompt string.

    Notes:
        A document whose content lookup raises is reported on stdout and left out
        of the batch. Entries are joined after collection, so a failure on the last
        document no longer leaves a dangling comma before the closing brace.
        Document text is inserted verbatim (no JSON escaping), as before.
    """
    rendered_entries = []
    for example_doc in example_doc_list:
        docid = example_doc["docid"]
        try:
            doc_content = get_doc_content(docid)
            rendered_entries.append(f"\"{docid}\": \"{doc_content}\"")
        except Exception as e:
            # Best effort: report the failing document and keep building the batch.
            print(e)
            print(docid)

    example_prompt = "{"
    if rendered_entries:
        example_prompt += ",\n".join(rendered_entries) + "\n"
    example_prompt += '}'

    question_prompt = "<question>" + query + "\n"
    return question_prompt + example_prompt + ROLE_DESCRIPTION_END

In [8]:
import openai
from openai import OpenAI
import os

In [9]:
def get_query_map(file_path, is_need_print=False):
    """Read a ``<query_id><TAB><query>`` file into a dictionary.

    Args:
        file_path: Path of the tab-separated query file (read as UTF-8).
        is_need_print: When true, print the number of parsed queries and the
            first five entries.

    Returns:
        Dict mapping query id (str) to query text (str).

    Raises:
        ValueError: If a non-blank line has no tab separating id and query.
    """
    queries_dict = {}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            record = line.strip()
            if not record:
                # Blank lines (e.g. at the end of the file) carry no query.
                continue
            # Split on the first tab only so a tab inside the query text is kept.
            query_id, separator, query = record.partition('\t')
            if not separator:
                raise ValueError(
                    f"Line {line_number} of {file_path} is not '<id><TAB><query>': {record!r}"
                )
            queries_dict[query_id] = query

    # (Optional) Print the size of the dictionary and a few example entries
    if is_need_print:
        print(f'Total queries parsed: {len(queries_dict)}')
        for query_id, query in list(queries_dict.items())[:5]:
            print(f'{query_id}: {query}')
    return queries_dict

# Load the queries from msmarco-test2019-queries.tsv; passing True also prints the
# number of parsed queries and the first five entries.
query_map_mm2019 = get_query_map('./data/msmarco-test2019-queries.tsv', True)

Total queries parsed: 200
1108939: what slows down the flow of blood
1112389: what is the county for grand rapids, mn
792752: what is ruclip
1119729: what do you do when you have a nosebleed from having your nose
1105095: where is sugar lake lodge located


In [10]:
import random

def get_ground_truth_xy_list_by_rule(file_path, topic_id, sample_rule=[], black_list=[], my_seed=114514):
    """Pick one judged document per entry of ``sample_rule`` for a topic.

    Args:
        file_path: Qrels file with whitespace-separated columns
            ``topic  <ignored>  docid  relevance``.
        topic_id: Topic to select (compared both as given and as a string).
        sample_rule: Sequence of relevance scores; one document with that score is
            picked per entry, in order, without picking the same entry twice.
        black_list: Document ids that must never be picked.
        my_seed: Seed applied to the global ``random`` module before each pick.

    Returns:
        List of ``{"docid": ..., "qrel": ...}`` dicts, one per entry of ``sample_rule``.

    Raises:
        ValueError: If the topic has no document for a requested score, or not
            enough of them.
    """
    # relevance score -> doc ids of this topic, in file order
    topic_data = {}

    with open(file_path, 'r') as file:
        for line in file:
            # split() accepts spaces or tabs (and runs of them); blank lines are skipped.
            parts = line.split()
            if not parts:
                continue
            current_topic_id, doc_id, relevance_score = int(parts[0]), parts[2], int(parts[3])

            if (current_topic_id == topic_id or str(current_topic_id) == str(topic_id)) and doc_id not in black_list:
                topic_data.setdefault(relevance_score, []).append(doc_id)

    result_docs = []
    for score in sample_rule:
        if score not in topic_data:
            raise ValueError(f"No documents available for relevance score {score}")

        # Shuffle a copy so the remaining pool keeps its file order for later picks.
        available_docs = list(topic_data[score])
        random.seed(my_seed)  # re-seeded for every pick, which keeps selection reproducible
        random.shuffle(available_docs)

        if not available_docs:
            raise ValueError(f"No more documents found for topic {topic_id} with relevance score {score}")

        selected_doc_id = available_docs[0]
        result_docs.append({"docid": selected_doc_id, "qrel": score})
        topic_data[score].remove(selected_doc_id)  # never pick the same entry twice

    return result_docs

In [11]:
import random

def get_ground_truth_xy_list_random(file_path, topic_id, sample_size, my_seed=114514):
    """Draw ``sample_size`` judged documents of a topic uniformly at random.

    Args:
        file_path: Qrels file with whitespace-separated columns
            ``topic  <ignored>  docid  relevance``.
        topic_id: Topic to select (compared both as given and as a string).
        sample_size: Number of documents to draw, without replacement.
        my_seed: Seed applied to the global ``random`` module before sampling.

    Returns:
        List of ``{"docid": ..., "qrel": ...}`` dicts of length ``sample_size``.

    Raises:
        ValueError: If the topic has fewer than ``sample_size`` judged documents.
    """
    # relevance score -> doc ids of this topic, in file order
    topic_data = {}
    with open(file_path, 'r') as file:
        for line in file:
            # split() accepts spaces or tabs (and runs of them); blank lines are skipped.
            parts = line.split()
            if not parts:
                continue
            current_topic_id, doc_id, relevance_score = int(parts[0]), parts[2], int(parts[3])
            if current_topic_id == topic_id or str(current_topic_id) == str(topic_id):
                topic_data.setdefault(relevance_score, []).append(doc_id)

    # Flatten score by score (in order of first appearance) so that the sampling
    # population, and therefore the seeded sample, is stable.
    all_docs = [
        {"docid": doc, "qrel": score}
        for score, docs in topic_data.items()
        for doc in docs
    ]

    if len(all_docs) < sample_size:
        raise ValueError(f"Not enough documents to sample: required {sample_size}, available {len(all_docs)}")

    random.seed(my_seed)
    selected_docs = random.sample(all_docs, sample_size)

    return selected_docs
    
    

In [12]:
def get_all_doc_id_by_topic(file_path, topic_id):
    """Return every judged document of a topic together with its relevance score.

    Args:
        file_path: Qrels file with whitespace-separated columns
            ``topic  <ignored>  docid  relevance``.
        topic_id: Topic to select (compared both as given and as a string).

    Returns:
        Dict mapping doc id (str) to relevance score (int). If a doc id occurs
        more than once for the topic, the last occurrence wins.
    """
    result_docs = {}

    with open(file_path, 'r') as file:
        for line in file:
            # split() accepts spaces or tabs (and runs of them); blank lines are skipped.
            parts = line.split()
            if not parts:
                continue
            current_topic_id, doc_id, relevance_score = int(parts[0]), parts[2], int(parts[3])

            # Only keep entries that belong to the requested topic.
            if current_topic_id == topic_id or str(current_topic_id) == str(topic_id):
                result_docs[doc_id] = relevance_score

    return result_docs

In [13]:
def get_gpt_judge_response(batch_prompt, gpt_model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model to judge one batch and return its raw text reply.

    Args:
        batch_prompt: User prompt holding the query and the document batch.
        gpt_model: Name of the OpenAI chat model.

    Returns:
        The text content of the first choice of the completion.

    Raises:
        Exception: Any error raised by the request is printed and re-raised.
    """
    # The client picks the key up from the environment, so export it before each call.
    os.environ['OPENAI_API_KEY'] = API_KEY
    client = OpenAI()
    try:
        conversation = [
            {"role": "system", "content": ROLE_DESCRIPTION_HEAD},
            {"role": "user", "content": batch_prompt},
        ]
        request_options = {
            "model": gpt_model,
            "messages": conversation,
            "temperature": 0,
            "max_tokens": 4096,
            "top_p": 1,
            "frequency_penalty": 0.5,
            "presence_penalty": 0,
        }
        completion = client.chat.completions.create(**request_options)
        return completion.choices[0].message.content
    except Exception as e:
        # Report the failure, then let the caller decide whether to retry.
        print(f"Request Failed: {e}")
        raise

In [14]:
def get_llama_chat_judge_response(batch_prompt, llama_model="llama-2-70b-chat"):
    """Ask a Llama model through the Replicate client and return its reply as one string.

    Args:
        batch_prompt: User prompt holding the query and the document batch.
        llama_model: Model name; it is prefixed with ``"meta/"`` for the request.

    Returns:
        The pieces yielded by the client, concatenated into a single string.

    Raises:
        Exception: Any error raised by the request is printed and re-raised.
    """
    generation_input = dict(
        top_p=1,
        system_prompt=ROLE_DESCRIPTION_HEAD,
        prompt=batch_prompt,
        temperature=0,
        max_new_tokens=128,
        min_new_tokens=-1,
        length_penalty=0.01,
    )

    try:
        pieces = replicate_api.run("meta/" + llama_model, input=generation_input)
        return "".join(pieces)
    except Exception as e:
        # Report the failure, then let the caller decide whether to retry.
        print(f"Request Failed: {e}")
        raise

In [15]:
def get_gemini_judge_response(batch_prompt, gemini_model="gemini-1.5-pro"):
    """Ask a Gemini model to judge one batch and return its reply as a JSON-like string.

    Args:
        batch_prompt: User prompt holding the query and the document batch.
        gemini_model: Name of the Gemini model.

    Returns:
        The reply text with a closing ``}`` appended. Generation is configured to
        stop at ``}``, so the brace is added back here.

    Raises:
        Exception: Any error raised by the request is printed and re-raised.
    """
    model_handle = genai.GenerativeModel(gemini_model)
    try:
        # There is no separate system prompt here: the instructions are prepended.
        full_prompt = ROLE_DESCRIPTION_HEAD + batch_prompt
        decoding = genai.types.GenerationConfig(
            candidate_count=1,  # only one candidate for now
            stop_sequences=["}"],
            max_output_tokens=1024,
            temperature=0,
        )
        reply = model_handle.generate_content(full_prompt, generation_config=decoding)
        return reply.text + "}"
    except Exception as e:
        # Report the failure, then let the caller decide whether to retry.
        print(f"Request Failed: {e}")
        raise

In [16]:
import re
# Earlier regex-based parser, disabled by wrapping it in a string literal: the
# function inside is never defined, so nothing in it can be called.
"""
def extract_key_value(text):
    # Define a pattern to match the key-value pairs
    pattern = re.compile(r'<relevance_(\d+)>(\d+(\.\d+)?)')
    
    # Find all matches in the text
    matches = pattern.findall(text)
    print(len(matches))
    # Convert matches to the desired dictionary format
    result = {f"{match[0]}": int(match[1]) for match in matches}
    
    return result
"""

def fix_string_format(str_dict):
    """Normalise a model reply into a JSON object string of ``"key": number`` pairs.

    Every ``key: number`` pair found in the reply is kept, whether or not the key
    was quoted; everything else (prose, code fences, explanations) is dropped.

    Args:
        str_dict: Raw reply text, expected to contain something like
            ``{"<id_1>": 2, '<id_2>': 0}``.

    Returns:
        A string of the form ``{ "k1": v1, "k2": v2 }`` that ``json.loads`` accepts
        (``{  }`` when no pair is found).
    """
    # Accept Python-style single quotes by turning them into JSON double quotes.
    normalized = str_dict.replace("'", "\"")

    # Key: any run of characters without quotes or colons (so paths and numbers work).
    # Value: an integer or a complete decimal; a dangling "2." is read as 2 so the
    # result stays valid JSON.
    pattern = r'\"?([^\":]+)\"?\s*:\s*(\d+(?:\.\d+)?)'

    fixed_items = []
    for raw_key, value in re.findall(pattern, normalized):
        # For unquoted keys the match also swallows the preceding "{" or ", ";
        # strip that punctuation so the key is just the id.
        key = raw_key.strip().lstrip('{,').strip()
        if not key:
            continue
        # json.dumps quotes the key and escapes characters such as backslashes.
        fixed_items.append(f"{json.dumps(key, ensure_ascii=False)}: {value}")

    return "{ " + ", ".join(fixed_items) + " }"


In [17]:
def get_llm_judge_response(batch_prompt, llm_model):
    """Send the prompt to the backend that serves ``llm_model`` and return its reply.

    Args:
        batch_prompt: User prompt holding the query and the document batch.
        llm_model: One of the supported GPT, Llama or Gemini model names.

    Returns:
        The raw reply text of the selected backend.

    Raises:
        ValueError: If ``llm_model`` is not one of the supported names (previously
            the function fell through and returned ``None``).
    """
    if llm_model in ("gpt-3.5-turbo", "gpt-4o", "gpt-4o-mini"):
        return get_gpt_judge_response(batch_prompt, llm_model)
    if llm_model in ("llama-2-13b-chat", "llama-2-70b-chat", "meta-llama-3-70b-instruct"):
        return get_llama_chat_judge_response(batch_prompt, llm_model)
    if llm_model == "gemini-1.5-pro":
        return get_gemini_judge_response(batch_prompt, llm_model)
    raise ValueError(f"Unsupported llm_model: {llm_model!r}")

In [18]:
import os
import json

def read_augmented_docs(root_dir):
    """Index the augmented documents stored as ``<root_dir>/<collection>/<topic>/<score>/<file>``.

    Args:
        root_dir: Directory that contains one sub-directory per collection. Any
            spelling of the path works (relative, absolute, with or without a
            leading ``./``) because directory levels are counted relative to it.

    Returns:
        Nested dict ``{collection: {topic: {score: [doc_id, ...]}}}`` where all keys
        are directory names (strings) and each doc id is
        ``"<collection>/<topic>/<score>/<file>"``. ``.DS_Store`` files are ignored.
        Files in directories nested below the score level are listed under that
        score, as before.
    """
    result = {}
    for current_dir, _dirs, files in os.walk(root_dir):
        # Count levels relative to root_dir instead of relying on fixed positions
        # in the full path, which only matched a root of exactly './gen_docs'.
        relative_parts = os.path.relpath(current_dir, root_dir).split(os.sep)
        if len(relative_parts) < 3:
            continue
        collection, topic, score = relative_parts[:3]

        doc_ids = result.setdefault(collection, {}).setdefault(topic, {}).setdefault(score, [])
        for file in files:
            if file == '.DS_Store':
                continue  # macOS folder metadata, not a document
            doc_ids.append(f"{collection}/{topic}/{score}/{file}")

    return result

# 使用示例
#root_directory = './gen_docs'
#output = read_augmented_docs(root_directory)
#print(output)

def get_augmented_ground_truth_xy_list(topic_id, head_rule, augmented_rel_map, user_seed=None):
    """Pick one augmented document per score in ``head_rule`` without repeats.

    Args:
        topic_id: Key into ``augmented_rel_map``.
        head_rule: Sequence of relevance scores; one document is picked per entry.
        augmented_rel_map: Mapping ``{topic: {str(score): [doc_id, ...]}}``.
        user_seed: When not ``None``, seeds the global ``random`` module first so
            the picks can be reproduced.

    Returns:
        List of ``{"docid": ..., "qrel": ...}`` dicts, in the order of ``head_rule``.

    Raises:
        ValueError: If no unused document is left for a requested score.
    """
    if user_seed is not None:
        random.seed(user_seed)

    already_used = set()

    def _pick_unused(score):
        # Candidates are the documents stored under this score that were not picked yet.
        candidates = [
            doc_id
            for doc_id in augmented_rel_map[topic_id][str(score)]
            if doc_id not in already_used
        ]
        if not candidates:
            raise ValueError(f"No more unique documents available for score {score} and topic {topic_id}")
        chosen = random.choice(candidates)
        already_used.add(chosen)
        return chosen

    picked = []
    for score in head_rule:
        picked.append({"docid": _pick_unused(score), "qrel": score})
    return picked
    
    

In [19]:
import time
import random
from tqdm import tqdm
import re

def judge_docs_by_topic(qrel_file_path, query_map, augmented_doc_map, topic_id, head_rule, tail_length, turn_number, llm_model="gpt-4o-mini", user_seed=None):
    """Run one judging turn for a topic, retrying until the reply can be used.

    Each attempt builds a batch of "head" documents (augmented documents picked
    according to ``head_rule``) followed by "tail" documents (sampled at random from
    the qrels file), asks the model to score the batch and parses the reply.

    Args:
        qrel_file_path: Qrels file the tail documents are sampled from.
        query_map: Mapping from topic id to query text.
        augmented_doc_map: ``{topic: {str(score): [doc_id, ...]}}`` for the head documents.
        topic_id: Topic to judge.
        head_rule: Relevance scores of the head documents to pick.
        tail_length: Number of tail documents to sample.
        turn_number: Index of the turn; used to derive the seed.
        llm_model: Model name handed to ``get_llm_judge_response``.
        user_seed: Fixed seed for every attempt. When ``None`` the seed is
            ``114 * turn_number + 514 + retry``, so each retry draws a new batch.
            A seed of ``0`` is honoured (it used to be treated as "no seed").

    Returns:
        ``(value_dict, flag_success)``: the parsed ``{doc_id: score}`` dict of the
        last attempt that could be parsed (``{}`` if none), and whether an attempt
        returned exactly one score per document within ``RETRY_MAX`` attempts.
    """
    flag_success = False
    retry = 0
    value_dict = {}

    while not flag_success and retry < RETRY_MAX:
        if user_seed is not None:
            myseed = user_seed
        else:
            myseed = 114 * turn_number + 514 + retry
        list_for_judge_tail = get_ground_truth_xy_list_random(qrel_file_path, topic_id, tail_length, my_seed=myseed)
        list_for_judge_head = get_augmented_ground_truth_xy_list(topic_id, head_rule, augmented_doc_map, user_seed=myseed)
        # Head documents always come first in the prompt.
        list_for_judge_all = list_for_judge_head + list_for_judge_tail
        query = query_map[topic_id]

        rel_judge_list_prompt = create_rel_judge_prompt(query, list_for_judge_all)
        try:
            result = get_llm_judge_response(rel_judge_list_prompt, llm_model)
            try:
                # Repair the reply's formatting, then parse it into a dict.
                fixed_str = fix_string_format(result)
                value_dict = json.loads(fixed_str)
                # Only the number of scores is checked, not the ids themselves.
                if len(list_for_judge_all) == len(value_dict.keys()):
                    flag_success = True
                else:
                    print(f"Unmatched length found: {len(list_for_judge_all)} and {len(value_dict.keys())} topic {topic_id} turn {turn_number} retry{retry}")
            except Exception as e:
                print(f"Error when parsing result: topic {topic_id} turn {turn_number} retry {retry}")
                print(e)
                print(result)
        except Exception as e:
            print(f"Error when requesting api. topic:{topic_id} turn {turn_number} retry {retry}")
            print(e)
        retry += 1
    if not flag_success:
        print(f"Turn failed: topic:{topic_id} turn {turn_number}")

    return value_dict, flag_success

In [20]:
# Disabled toggle for an "explain" variant, kept as a string literal; it is never executed.
'''
IS_WITH_EXPLAIN = False
with_explain = ""
if not IS_WITH_EXPLAIN:
    with_explain = "_no_explain"
'''
# Topic ids judged per collection. The collection keys ("TRDL19", "TRDL20") are shared
# by QREL_DIR_MAP and QUERY_DIR_MAP below, and are also looked up in the map returned
# by read_augmented_docs.
# NOTE(review): the "TRDL20" list is empty, so running that collection judges nothing.
TOPIC_LIST_MAP = {
        "TRDL19":[
        "47923",
        "87452",
        "130510",
        "168216",
        "183378",
        "264014",
        "359349",
        "443396",
        "451602",
        "527433",
        "833860",
        "915593",
        "1106007",
        "1110199",
        "1112341",
        "1114646",
        "1114819",
        "1117099",
        "1124210",
        "1133167"
    ],
    "TRDL20":[
        
    ]
}

# Qrels file (relevance judgments) per collection.
QREL_DIR_MAP = {
    "TRDL19": './data/2019qrels-pass.txt',
    "TRDL20": './data/2020qrels-pass.txt'
}

# Tab-separated query file per collection.
QUERY_DIR_MAP = {
    "TRDL19":'./data/msmarco-test2019-queries.tsv',
    "TRDL20": './data/msmarco-test2020-queries.tsv'
}

# Root directory of the augmented documents; get_doc_content resolves ids that are
# missing from the index against this directory.
AUG_DOCS_DIR = './gen_docs'

In [21]:
import os
import json

def save_result(collection, gpt_model, struct_str, result, save_dir = './batch/unknown_collection/random/', custom_str=None):
    """Write ``result`` as JSON to ``<save_dir>/PredictScore_<model>_<struct>[_<custom>].json``.

    Args:
        collection: Collection name; only used to build the fallback directory
            ``./batch/<collection>/random/`` when ``save_dir`` is empty or ``None``.
        gpt_model: Model name, part of the file name.
        struct_str: Description of the run setup, part of the file name.
        result: JSON-serialisable object to store.
        save_dir: Target directory, with or without a trailing separator. It is
            created when missing.
        custom_str: Optional suffix appended to the file name after an underscore.
    """
    # An empty save_dir used to leave `directory` undefined and crash.
    directory = save_dir if save_dir else f'./batch/{collection}/random/'

    file_name = f'PredictScore_{gpt_model}_{struct_str}'
    if custom_str:
        file_name += f'_{custom_str}'
    # os.path.join keeps the file inside the directory even without a trailing slash.
    file_path = os.path.join(directory, file_name + '.json')

    # Create the directory if it does not exist yet.
    os.makedirs(directory, exist_ok=True)

    with open(file_path, 'w') as file:
        json.dump(result, file)

In [22]:
def load_result(collection, gpt_model, gt_str, custom_str=None):
    """Load a stored result from ``./batch/<collection>/new/``.

    The file read is ``predict_score_by_<gpt_model>_<gt_str>_<custom_str>.json``;
    with the default ``custom_str=None`` the name therefore ends in ``_None.json``.

    NOTE(review): save_result in this notebook writes ``PredictScore_...`` files,
    not ``predict_score_by_...`` files under ``new/`` -- confirm which files this
    loader is meant to read.

    Returns:
        The decoded JSON content of the file.
    """
    json_path = (
        f'./batch/{collection}/new/'
        f'predict_score_by_{gpt_model}_{gt_str}_{custom_str}.json'
    )
    with open(json_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
    

In [23]:
# Number of judging turns run_min_threshold_judge performs per topic.
JUDGE_LOOP_MAX = 50
# Maximum number of attempts judge_docs_by_topic makes for a single turn.
RETRY_MAX = 5

from tqdm import tqdm
def run_min_threshold_judge(collection, head_length, tail_length, low_val=0, high_val=3, gpt_model="gpt-4o-mini", save_dir="" ,custom_str = None):
    """Judge every topic of a collection twice per turn and save all scores.

    In each turn the head documents are picked once with every head score set to
    ``high_val`` and once with every head score set to ``low_val``; both batches
    are sent to the model together with ``tail_length`` sampled qrels documents.

    Args:
        collection: Key into ``TOPIC_LIST_MAP``, ``QUERY_DIR_MAP``, ``QREL_DIR_MAP``
            and the map returned by ``read_augmented_docs``.
        head_length: Number of head (augmented) documents per prompt.
        tail_length: Number of tail (qrels) documents per prompt.
        low_val: Relevance score used for the "low" head rule.
        high_val: Relevance score used for the "high" head rule.
        gpt_model: Name of the judging model.
        save_dir: Output directory; defaults to ``./batch/<collection>/random/``.
        custom_str: Optional text appended to the saved file's name.

    Returns:
        ``(result_map, failed_turn)``: results keyed by ``"<topic>_<turn>"`` and the
        list of ``(topic, turn)`` pairs where either judgment did not succeed.
    """
    query_map = get_query_map(QUERY_DIR_MAP[collection])
    qrel_file_dir = QREL_DIR_MAP[collection]
    aug_docs_map = read_augmented_docs(AUG_DOCS_DIR )
    current_aug_docs_map = aug_docs_map[collection]
    result_map = {}
    failed_turn = []

    # The replace rate below is 1, so every head position is overwritten and the
    # rules always equal these lists. Defining them up front keeps the file name
    # well-defined even when the collection has no topics.
    head_rule_high = [high_val] * head_length
    head_rule_low = [low_val] * head_length

    topics = TOPIC_LIST_MAP[collection]
    # One progress-bar step per (topic, turn) pair.
    total_iterations = len(topics) * JUDGE_LOOP_MAX
    with tqdm(total=total_iterations) as pbar:
        for topic in topics:
            for turn in range(JUDGE_LOOP_MAX):
                # Use the head_length argument, not the notebook-level HEAD_LEN global.
                head_rule = generate_random_list(head_length, 1, 2)
                head_rule_high = get_replaced_list(head_rule, high_val, 1)
                head_rule_low = get_replaced_list(head_rule, low_val, 1)
                key = f"{topic}_{turn}"
                score_by_gpt_high_threshold, flag_high = judge_docs_by_topic(
                    qrel_file_dir, query_map, current_aug_docs_map, topic,
                    head_rule_high, tail_length, turn, gpt_model)
                score_by_gpt_low_threshold, flag_low = judge_docs_by_topic(
                    qrel_file_dir, query_map, current_aug_docs_map, topic,
                    head_rule_low, tail_length, turn, gpt_model)
                result_map[key] = {
                    "model": gpt_model,
                    "high_rule": head_rule_high,
                    "low_rule": head_rule_low,
                    "head_length": head_length,
                    "tail_length": tail_length,
                    'high_predict': score_by_gpt_high_threshold,
                    'low_predict': score_by_gpt_low_threshold
                }
                if not (flag_high and flag_low):
                    failed_turn.append((topic, turn))
                pbar.update(1)

    high_head_str = ''.join(str(num) for num in head_rule_high)
    low_head_str = ''.join(str(num) for num in head_rule_low)
    save_name_str = f'hr_{high_head_str}_lr_{low_head_str}_tl_{tail_length}_turn{JUDGE_LOOP_MAX}'
    if custom_str:
        save_name_str  += custom_str
    if not save_dir:
        save_dir = f'./batch/{collection}/random/'
    save_result(collection, gpt_model, save_name_str, result_map, save_dir)
    return result_map, failed_turn
        

In [24]:
def check_length(result_map, l):
    """Print every score dictionary in ``result_map`` that does not hold ``l`` entries.

    Args:
        result_map: Mapping ``{key: {field: value}}``, e.g. the result of
            ``run_min_threshold_judge``. Only fields whose value is a dict (the
            predicted scores) are checked; metadata such as the model name, the
            rules or the integer lengths is skipped (calling ``len`` on the
            integers used to raise ``TypeError``).
        l: Expected number of entries per score dictionary.
    """
    for topic, thresholds in result_map.items():
        for threshold, value_dict in thresholds.items():
            if not isinstance(value_dict, dict):
                continue
            if len(value_dict) != l:
                print(f'Topic: {topic}, Threshold: {threshold}')

In [25]:

import random

def generate_random_list(list_len,val_min,val_max):
    """Return ``list_len`` random integers, each drawn from ``[val_min, val_max]`` inclusive.

    Values come from the global ``random`` module, one ``randint`` call per element.
    """
    values = []
    for _ in range(list_len):
        values.append(random.randint(val_min, val_max))
    return values


def get_replaced_list(original_list, target_value, replace_rate=0.5):
    """Return a copy of ``original_list`` with a random share of items set to ``target_value``.

    Args:
        original_list: Source list; it is not modified.
        target_value: Value written into the selected positions.
        replace_rate: Fraction of positions to overwrite; the number of positions is
            ``int(len(original_list) * replace_rate)``.

    Returns:
        The modified copy.
    """
    size = len(original_list)
    how_many = int(size * replace_rate)

    # Draw the positions to overwrite (without repetition) from the global RNG.
    chosen_positions = random.sample(range(size), how_many)

    # Work on a copy so the caller's list stays untouched.
    result = original_list[:]
    for position in chosen_positions:
        result[position] = target_value
    return result



In [27]:
# Run 1: 1 augmented head document + 4 sampled qrels (tail) documents per prompt,
# judged by meta-llama-3-70b-instruct; 0 and 3 are the low / high head scores.
# NOTE(review): the recorded output of this cell shows a KeyboardInterrupt, so this
# run did not complete in the saved session.
HEAD_LEN = 1
TAIL_LEN = 4

result_map_1, failed_turn_1 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "meta-llama-3-70b-instruct")

  0%|          | 0/1000 [00:01<?, ?it/s]

KeyboardInterrupt



In [79]:
# Run 2: 2 augmented head documents + 8 sampled qrels (tail) documents per prompt,
# judged by meta-llama-3-70b-instruct; 0 and 3 are the low / high head scores.
HEAD_LEN = 2
TAIL_LEN = 8

result_map_2, failed_turn_2 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "meta-llama-3-70b-instruct")

100%|██████████| 1000/1000 [1:56:11<00:00,  6.97s/it]


In [80]:
# Run 3: 1 augmented head document + 9 sampled qrels (tail) documents per prompt,
# judged by meta-llama-3-70b-instruct; 0 and 3 are the low / high head scores.
HEAD_LEN = 1
TAIL_LEN = 9

result_map_3, failed_turn_3 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "meta-llama-3-70b-instruct")

 50%|█████     | 504/1000 [48:35<49:18,  5.96s/it]  

Unmatched length found: 10 and 9 topic 833860 turn 4 retry0


100%|██████████| 1000/1000 [1:38:51<00:00,  5.93s/it]


In [81]:
# Run 4: 1 augmented head document + 4 sampled qrels (tail) documents per prompt,
# judged by gpt-4o-mini; 0 and 3 are the low / high head scores.
HEAD_LEN = 1
TAIL_LEN = 4

result_map_4, failed_turn_4 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gpt-4o-mini")

100%|██████████| 1000/1000 [1:23:46<00:00,  5.03s/it]


In [82]:
# Run 5: 2 augmented head documents + 8 sampled qrels (tail) documents per prompt,
# judged by gpt-4o-mini; 0 and 3 are the low / high head scores.
HEAD_LEN = 2
TAIL_LEN = 8

result_map_5, failed_turn_5 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gpt-4o-mini")

 28%|██▊       | 277/1000 [27:46<1:13:11,  6.07s/it]

Unmatched length found: 10 and 9 topic 264014 turn 27 retry0


 51%|█████     | 511/1000 [51:43<43:55,  5.39s/it]  

Unmatched length found: 10 and 9 topic 833860 turn 11 retry0


 53%|█████▎    | 531/1000 [53:38<44:00,  5.63s/it]

Unmatched length found: 10 and 9 topic 833860 turn 31 retry0
Unmatched length found: 10 and 9 topic 833860 turn 31 retry0


 63%|██████▎   | 628/1000 [1:03:08<35:04,  5.66s/it]

Unmatched length found: 10 and 9 topic 1106007 turn 28 retry0


 68%|██████▊   | 675/1000 [1:07:29<31:30,  5.82s/it]

Unmatched length found: 10 and 9 topic 1110199 turn 25 retry0
Unmatched length found: 10 and 9 topic 1110199 turn 25 retry0


 71%|███████   | 710/1000 [1:10:58<26:26,  5.47s/it]

Unmatched length found: 10 and 9 topic 1112341 turn 10 retry0


 85%|████████▍ | 846/1000 [1:23:47<17:24,  6.78s/it]

Unmatched length found: 10 and 9 topic 1114819 turn 46 retry0
Unmatched length found: 10 and 9 topic 1114819 turn 46 retry0


100%|██████████| 1000/1000 [1:38:42<00:00,  5.92s/it]


In [83]:
# Run 6: 1 augmented head document + 9 sampled qrels (tail) documents per prompt,
# judged by gpt-4o-mini; 0 and 3 are the low / high head scores.
HEAD_LEN = 1
TAIL_LEN = 9

result_map_6, failed_turn_6 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gpt-4o-mini")

 29%|██▉       | 291/1000 [29:20<1:11:22,  6.04s/it]

Unmatched length found: 10 and 9 topic 264014 turn 41 retry0


 34%|███▎      | 337/1000 [34:08<1:24:02,  7.61s/it]

Unmatched length found: 10 and 9 topic 359349 turn 37 retry0


 51%|█████     | 511/1000 [53:15<1:11:23,  8.76s/it]

Unmatched length found: 10 and 9 topic 833860 turn 11 retry0
Unmatched length found: 10 and 9 topic 833860 turn 11 retry0


 53%|█████▎    | 531/1000 [55:34<56:48,  7.27s/it]  

Unmatched length found: 10 and 9 topic 833860 turn 31 retry0
Unmatched length found: 10 and 9 topic 833860 turn 31 retry0


 64%|██████▎   | 636/1000 [1:07:49<40:16,  6.64s/it]

Unmatched length found: 10 and 9 topic 1106007 turn 36 retry0


 68%|██████▊   | 675/1000 [1:11:56<32:59,  6.09s/it]

Unmatched length found: 10 and 9 topic 1110199 turn 25 retry0
Unmatched length found: 10 and 9 topic 1110199 turn 25 retry0


 73%|███████▎  | 730/1000 [1:18:09<28:44,  6.39s/it]

Unmatched length found: 10 and 9 topic 1112341 turn 30 retry0
Unmatched length found: 10 and 9 topic 1112341 turn 30 retry0


 85%|████████▍ | 846/1000 [1:31:09<16:53,  6.58s/it]

Unmatched length found: 10 and 9 topic 1114819 turn 46 retry0
Unmatched length found: 10 and 9 topic 1114819 turn 46 retry0


 99%|█████████▉| 988/1000 [1:47:32<01:20,  6.70s/it]

Unmatched length found: 10 and 9 topic 1133167 turn 38 retry0
Unmatched length found: 10 and 9 topic 1133167 turn 38 retry0


100%|██████████| 1000/1000 [1:48:59<00:00,  6.54s/it]


In [86]:
# Run 7: 1 augmented head document + 4 sampled qrels (tail) documents per prompt,
# judged by gpt-4o; 0 and 3 are the low / high head scores.
HEAD_LEN = 1
TAIL_LEN = 4

result_map_7, failed_turn_7 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gpt-4o")

100%|██████████| 1000/1000 [1:46:55<00:00,  6.42s/it] 


In [87]:
# Run 8: 1 augmented head document + 9 sampled qrels (tail) documents per prompt,
# judged by gpt-4o; 0 and 3 are the low / high head scores.
HEAD_LEN = 1
TAIL_LEN = 9

result_map_8, failed_turn_8 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gpt-4o")

100%|██████████| 1000/1000 [1:32:18<00:00,  5.54s/it]


In [88]:
# Run 9: 2 augmented head documents + 8 sampled qrels (tail) documents per prompt,
# judged by gpt-4o; 0 and 3 are the low / high head scores.
HEAD_LEN = 2
TAIL_LEN = 8

result_map_9, failed_turn_9 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gpt-4o")

100%|██████████| 1000/1000 [1:35:22<00:00,  5.72s/it]


In [27]:
# Run 10: 1 augmented head document + 4 sampled qrels (tail) documents per prompt,
# judged by gemini-1.5-pro; 0 and 3 are the low / high head scores.
HEAD_LEN = 1
TAIL_LEN = 4

result_map_10, failed_turn_10 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gemini-1.5-pro")

100%|██████████| 1000/1000 [1:54:15<00:00,  6.86s/it]  


In [28]:
# Run 11: 2 augmented head documents + 8 sampled qrels (tail) documents per prompt,
# judged by gemini-1.5-pro; 0 and 3 are the low / high head scores.
HEAD_LEN = 2
TAIL_LEN = 8

result_map_11, failed_turn_11 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gemini-1.5-pro")

100%|██████████| 1000/1000 [2:11:29<00:00,  7.89s/it] 


In [29]:
# Run 12: 1 augmented head document + 9 sampled qrels (tail) documents per prompt,
# judged by gemini-1.5-pro; 0 and 3 are the low / high head scores.
HEAD_LEN = 1
TAIL_LEN = 9

result_map_12, failed_turn_12 = run_min_threshold_judge("TRDL19", HEAD_LEN, TAIL_LEN , 0, 3, "gemini-1.5-pro")

100%|██████████| 1000/1000 [2:03:25<00:00,  7.41s/it] 
