In [1]:
import json
import random
import re
import time
import os
from typing import List, Dict, Any, Optional, Annotated, Callable, Tuple
from difflib import SequenceMatcher # Option 1 for basic similarity
import heapq # For finding top N similar items efficiently
import numpy as np # For calculating mean similarity
from rouge_score import rouge_scorer
from openai import OpenAI, APIError
from function_schema import get_function_schema

In [2]:
# --- Friendli AI Client Setup ---
# The API token is read from the FRIENDLI_TOKEN environment variable. When it
# is missing, a visible placeholder is substituted so the client can still be
# constructed; the warning below then tells the reader that calls will
# likely fail.
token = os.environ.get("FRIENDLI_TOKEN")
if not token:
    print(
        "Error: FRIENDLI_TOKEN environment variable not set.",
        "Please set the environment variable or replace '<YOUR_FRIENDLI_TOKEN>' in the code.",
        sep="\n",
    )
    token = "<YOUR_FRIENDLI_TOKEN>"

if token == "<YOUR_FRIENDLI_TOKEN>":
    print("Warning: Using placeholder Friendli token. LLM calls will likely fail.")

# Model requested on every chat-completion call in this notebook.
LLM_MODEL_NAME = "deepseek-r1"

# OpenAI-compatible client pointed at the Friendli serverless endpoint.
client = OpenAI(api_key=token, base_url="https://api.friendli.ai/serverless/v1")

In [3]:
def get_weather(
    city: Annotated[str, "The city to get the weather for"],
    unit: Annotated[Optional[str], "The unit to return the temperature in"] = "celcius",
) -> str:
    """Returns the weather for the given city."""
    # Demo stub: the reading is fixed and `unit` does not influence the text.
    # The docstring above is left untouched because it is presumably surfaced
    # in the generated tool schema -- TODO confirm against function_schema.
    report = "Weather for {} is 20°C".format(city)
    return report

def get_news(
    topic: Annotated[str, "The topic to get news for"],
    source: Annotated[Optional[str], "The source to get news from"] = None,
) -> str:
    """Returns the news for the given topic."""
    # Demo stub: a missing or empty source is reported as 'all sources'.
    origin = source or 'all sources'
    return f"News for {topic} from {origin}"

def get_current_location() -> str:
    """Returns the current location of the user."""
    # Demo stub: always reports the same fixed location.
    location = "Current location is Seoul, South Korea"
    return location

# Demo tool set for this cell. Note: `tools`, `tool_schemas` and
# `tool_schemas_json` are assigned again by the calendar / web-search cell
# further down, so after a full run those later values are the ones in effect.
tools = [get_weather, get_news, get_current_location]

# One schema per callable, plus a pretty-printed JSON dump of the whole list.
tool_schemas = list(map(get_function_schema, tools))
tool_schemas_json = json.dumps(tool_schemas, indent=2)


In [4]:
import json
from typing import Annotated, Optional, List, Callable, Dict, Any

# Tool function definitions

def create_calendar_event(
    summary: Annotated[str, "Title of the event to be added (default: 'New Event')"],
    start_time: Annotated[str, "Start date and time of the event (format: 'yyyy-MM-dd HH:mm')"],
    end_time: Annotated[str, "End date and time of the event (format: 'yyyy-MM-dd HH:mm')"]
) -> None:
    """Creates a new calendar event."""
    # Implementation intentionally omitted (stub): the body is a bare `pass`,
    # so calling this function does nothing and returns None.
    # A real implementation would call a calendar API to create the event.
    # e.g. google_calendar.create_event(summary=summary, start=start_time, end=end_time)
    pass

def fetch_calendar_events(
    start_date: Annotated[str, "Start date of the search range (format: yyyy-MM-dd)"],
    end_date: Annotated[str, "End date of the search range (format: yyyy-MM-dd)"]
) -> str:
    """
    Retrieves calendar events within a specified date range.
    Requires authorization first. If not authorized, should call authorize_calendar_access.
    Returns a JSON string representing the events or an error message.
    """
    # Implementation intentionally omitted (stub).
    # NOTE(review): as written the body is a bare `pass`, so the function
    # returns None even though it is annotated `-> str`.
    # A real implementation would check the authorization state and then call the calendar API:
    # is_authorized = check_auth()
    # if not is_authorized:
    #     return json.dumps({"message": "You need to authorize the assistant to access your calendar."})
    # try:
    #     events = calendar_api.fetch_events(start_date, end_date)
    #     return json.dumps(events)
    # except Exception as e:
    #     return json.dumps({"message": f"Error fetching calendar events: {e}"})
    pass

def authorize_calendar_access() -> None:
    """
    Initiates the authorization process for calendar access.
    Must be called first before using calendar-related tools like fetch_calendar_events or create_calendar_event if not already authorized.
    """
    # Implementation intentionally omitted (stub): the body is a bare `pass`.
    # A real implementation would start an OAuth flow (or similar) to obtain user authentication/authorization.
    # print("Please visit <authorization_url> to authorize calendar access.")
    pass

def web_search(
    query: Annotated[str, "The query to search for on the web."]
) -> str:
    """
    Searches the web (DuckDuckGo) for the given query.
    Returns a JSON string containing search results.
    """
    # Implementation intentionally omitted (stub).
    # NOTE(review): as written the body is a bare `pass`, so the function
    # returns None even though it is annotated `-> str`.
    # A real implementation would call a web-search library or API:
    # results = duckduckgo_search(query)
    # return json.dumps(results)
    pass


# Calendar / web-search tool set. These assignments replace any earlier
# values bound to `tools`, `tool_schemas` and `tool_schemas_json`.
tools = [create_calendar_event, fetch_calendar_events, authorize_calendar_access, web_search]

# One schema per callable, plus the pretty-printed JSON dump that is later
# passed to the query generator as its tool description.
tool_schemas = list(map(get_function_schema, tools))
tool_schemas_json = json.dumps(tool_schemas, indent=2)

In [5]:
# --- Configuration ---
# JSON file the accepted query objects are loaded from at start-up and saved to after every turn.
QUERIES_FILENAME = "diverse_queries_with_scores_v4.json" # New filename for this version
# Number of generation turns run by the main loop.
NUM_GENERATION_TURNS = 3
# Number of newly accepted queries each turn tries to reach.
QUERIES_TO_GENERATE_PER_TURN = 10
# Upper bound on how many queries are requested from the LLM in a single call.
REQUEST_BATCH_SIZE_PER_TURN = 15
# Maximum number of LLM attempts within one turn before giving up on that turn's target.
MAX_ATTEMPTS_PER_TURN = 5
# A candidate is skipped when its highest cosine similarity to an existing/accepted query exceeds this value.
SIMILARITY_THRESHOLD = 0.8
# How many of the most similar existing queries are recorded per accepted query.
TOP_N_SIMILAR = 10 # Alpaca stores top 10


In [6]:

# --- Helper Functions ---

def load_queries_with_scores(filename: str) -> List[Dict[str, Any]]:
    """Read back previously generated query objects from a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed list when the file exists and contains a JSON list whose
        items are all dicts carrying a 'q' key. In every other case (file
        missing, unreadable or invalid JSON, unexpected shape) a message is
        printed and an empty list is returned.
    """
    if not os.path.exists(filename):
        print(f"No existing query file found ({filename}). Starting fresh.")
        return []

    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except (json.JSONDecodeError, IOError) as e:
        print(f"Error loading {filename}: {e}. Starting fresh.")
        return []

    def _is_query_object(entry: Any) -> bool:
        # A usable entry is a dict that exposes the query text under 'q'.
        return isinstance(entry, dict) and 'q' in entry

    if isinstance(payload, list) and all(map(_is_query_object, payload)):
        print(f"Loaded {len(payload)} query objects from {filename}")
        return payload

    print(f"Warning: Invalid format found in {filename}. Starting fresh.")
    return []

def save_queries_with_scores(query_objects: List[Dict[str, Any]], filename: str):
    """Saves the list of query objects to a JSON file without clobbering it on failure.

    The data is serialized in memory first and written to a sibling temporary
    file, which then replaces ``filename`` in one step. A failure during
    serialization or writing therefore leaves any existing file intact
    instead of truncating it.

    Args:
        query_objects: JSON-serializable list of query dictionaries.
        filename: Destination path of the JSON file.

    Raises:
        TypeError / ValueError: if ``query_objects`` cannot be serialized
            (raised before the destination file is touched).
    """
    # Serialize before opening anything: an unserializable value must not
    # wipe out results accumulated by earlier turns or sessions.
    # Use indent=4 like Alpaca's output
    serialized = json.dumps(query_objects, indent=4, ensure_ascii=False)
    temp_filename = f"{filename}.tmp"
    try:
        with open(temp_filename, 'w', encoding='utf-8') as f:
            f.write(serialized)
        # Swap the finished file into place in a single step.
        os.replace(temp_filename, filename)
        print(f"Saved {len(query_objects)} query objects to {filename}")
    except IOError as e:
        print(f"Error saving queries to {filename}: {e}")
        # Best-effort cleanup of a half-written temporary file.
        if os.path.exists(temp_filename):
            try:
                os.remove(temp_filename)
            except OSError:
                pass

# Tool names whose appearance (together with words like "parameter") marks a
# line as a description of the tools rather than a user query. Covers both
# tool sets defined in this notebook.
_DEFAULT_TOOL_NAMES = (
    "get_weather",
    "get_news",
    "get_current_location",
    "create_calendar_event",
    "fetch_calendar_events",
    "authorize_calendar_access",
    "web_search",
)

# Line openings typical of LLM reasoning / meta-commentary rather than queries.
_META_PREFIXES = ("- ", "* ", "Okay,", "First,", "Next,", "Now,", "Let me", "Wait,", "Also,", "###", "//", "```", "Queries", "That's", "This should", "Avoid", "Check for", "Example", "User Query:", "Generated Query:")


def is_valid_query_line(q_text: str, tool_names: Optional[List[str]] = None) -> bool:
    """Checks if a single line looks like a valid user query.

    Args:
        q_text: One line of LLM output.
        tool_names: Optional tool names used to spot lines that describe the
            tools themselves. Defaults to the names of every tool defined in
            this notebook.

    Returns:
        False for empty lines, bullet/numbered/heading-style lines, lines that
        start with common reasoning phrases, lines ending in ':' or containing
        an arrow, path-like tokens, and lines that mention a tool name together
        with "parameter", "require" or "tool". True otherwise.
    """
    q_text = q_text.strip()
    if not q_text:
        return False
    # More robust filtering based on common LLM reasoning/meta-commentary patterns
    if q_text.startswith(_META_PREFIXES):
        return False
    if q_text.endswith(":") or "→" in q_text:
        return False
    # Numbered list items ("1. ...")
    if re.match(r"^\d+\.", q_text):
        return False
    # A single token without any ASCII letter (e.g. stray punctuation or digits)
    if len(q_text.split()) <= 1 and not re.search(r'[a-zA-Z]', q_text):
        return False
    # Path- or URL-like token without spaces
    if "/" in q_text and "." in q_text and " " not in q_text:
        return False
    # Filter lines that are likely descriptions of tools or parameters
    lowered = q_text.lower()
    names = _DEFAULT_TOOL_NAMES if tool_names is None else tool_names
    if any(name.lower() in lowered for name in names):
        if "parameter" in lowered or "require" in lowered or "tool" in lowered:
            return False
    return True

def remove_think_blocks(text: str) -> str:
    """Removes <think>...</think> reasoning from the text, including unbalanced tags.

    Handles three shapes of output (tags are matched case-insensitively and
    may span multiple lines):
      * balanced ``<think>...</think>`` blocks are removed;
      * a closing ``</think>`` with no opening tag: everything up to and
        including it is treated as reasoning and removed;
      * an opening ``<think>`` that is never closed (e.g. a response cut off
        by the token limit): everything from the tag to the end is removed.

    Args:
        text: Raw LLM output. ``None`` or an empty string yields ``""``.

    Returns:
        The text with reasoning content stripped.
    """
    if not text:
        return ""
    flags = re.DOTALL | re.IGNORECASE
    # 1) Well-formed blocks.
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=flags)
    # 2) Dangling closing tag: the reasoning precedes it.
    cleaned = re.sub(r"\A.*</think>", "", cleaned, flags=flags)
    # 3) Dangling opening tag: the reasoning runs to the end of the text.
    cleaned = re.sub(r"<think>.*\Z", "", cleaned, flags=flags)
    return cleaned


In [7]:

# --- Step 1: Generate Diverse User Queries ('q') ---

def generate_candidate_qs_with_llm(
    tool_schemas_str: str,
    num_to_generate: int,
    existing_qs_list: List[str],
    temperature: float = 0.8,
    max_tokens: int = 4096,
) -> List[str]:
    """Generates candidate 'q' strings using the LLM, removing think blocks.

    Args:
        tool_schemas_str: JSON text describing the available tools; embedded
            verbatim in the user prompt.
        num_to_generate: Number of queries the LLM is asked to produce.
        existing_qs_list: Already accepted queries. Up to five are sampled at
            random and shown to the LLM as examples to avoid.
        temperature: Sampling temperature forwarded to the chat-completion call.
        max_tokens: Completion token limit forwarded to the chat-completion call.

    Returns:
        The response lines that pass ``is_valid_query_line`` (stripped). An
        empty list is returned when the API call fails or the response carries
        no text.
    """
    system_prompt = """Your ONLY task is to generate realistic, diverse user queries or requests ('q') suitable for an AI assistant with access to specific tools. These queries should be answerable using the provided tools. Vary the complexity, phrasing (questions, commands), and the tools potentially required.

**CRITICAL INSTRUCTIONS:**
1.  Output ONLY the raw user queries.
2.  Each query MUST be on a new line.
3.  **ABSOLUTELY DO NOT** include:
    * Explanations, comments, or justifications.
    * Thinking processes, reasoning steps (including anything like `<think>...</think>`).
    * Numbered lists, bullet points, or any formatting other than one query per line.
    * Any text before the first query or after the last query.
"""

    examples_prompt = ""
    if existing_qs_list:
        sample_existing = random.sample(existing_qs_list, min(len(existing_qs_list), 5))
        examples_prompt = "Critically, avoid generating queries too similar to these examples:\n- " + "\n- ".join(sample_existing) + "\n\n"

    user_prompt = f"""Based on the following available tools: {tool_schemas_str}
Generate exactly {num_to_generate} diverse user queries ('q'). Remember to vary the required tools, complexity, and phrasing. {examples_prompt}Output ONLY the queries, one per line:"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    try:
        print(f"--- Calling LLM ({LLM_MODEL_NAME}) to generate ~{num_to_generate} queries ---")
        completion = client.chat.completions.create(
            model=LLM_MODEL_NAME, messages=messages, temperature=temperature, max_tokens=max_tokens,
        )
        raw_llm_output = completion.choices[0].message.content
        print("--- LLM Response Received ---")
    except APIError as e:
        print(f"LLM API Error: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during LLM call: {e}")
        return []

    # The message content is optional in the API response; without this guard
    # a None value would raise a TypeError in the regex clean-up below.
    if not raw_llm_output:
        print("LLM returned an empty response.")
        return []

    cleaned_output = remove_think_blocks(raw_llm_output)
    candidate_qs = []
    raw_lines = cleaned_output.split('\n')
    print(f"--- Lines after removing <think> blocks: {len(raw_lines)} ---") # Debugging
    for line in raw_lines:
        clean_line = line.strip()
        if is_valid_query_line(clean_line):
            candidate_qs.append(clean_line)
        elif clean_line:
            # Non-empty line rejected by the validity filter; log it for inspection.
            print(f"Filtered out invalid line: '{clean_line}'")

    print(f"--- Parsed {len(candidate_qs)} potentially valid candidate queries ---")
    return candidate_qs





In [8]:
# Smoke test of the generator: request five queries with no prior examples.
candidate_qs_strings = generate_candidate_qs_with_llm(
    tool_schemas_str=tool_schemas_json,
    existing_qs_list=[],
    num_to_generate=5,
)

# Show the parsed candidates (debugging aid).
print(candidate_qs_strings)



--- Calling LLM (deepseek-r1) to generate ~5 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 7 ---
--- Parsed 5 potentially valid candidate queries ---
['Set up a meeting with the marketing team on March 12th from 2 PM to 3:30 PM.', 'Show me all events between March 10 and March 15.', "Add a calendar entry for tomorrow's dentist appointment at 9 AM lasting one hour.", 'Search the web for recent breakthroughs in renewable energy storage.', "Check if I'm free between 10 AM and noon on March 20th."]


In [9]:
# Standalone demo: embed two reference sentences and one candidate sentence,
# then print the candidate's cosine similarity to each reference.
# Install the library: pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
import torch # used internally by sentence-transformers

# Load a pretrained Korean model (various models can be used)
# Examples: 'jhgan/ko-sbert-sts', 'snunlp/KR-SBERT-V40K-klueNLI-augSTS', etc.
# Model list: https://huggingface.co/models?language=ko&library=sentence-transformers
print("Loading Korean Sentence Transformer model...")
# model = SentenceTransformer('jhgan/ko-sbert-sts')
model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')
print("Model loaded.")

# Texts to compare
# NOTE(review): despite the `_ko` suffix these sample sentences are English.
references_ko = [
    "The cat was found under the bed",
    "The cat was under the bed"
]
candidate_ko = "The cat likes to eat Churu"

# Compute sentence embeddings
# Note: a GPU is used automatically when available, which speeds this up
print("Encoding sentences...")
ref_embeddings = model.encode(references_ko, convert_to_tensor=True)
cand_embedding = model.encode(candidate_ko, convert_to_tensor=True)
print("Encoding complete.")

# Compute cosine similarity
# Similarity between the candidate sentence and each reference sentence
cosine_scores = util.pytorch_cos_sim(cand_embedding, ref_embeddings)

# Print the results
print("\n--- Sentence Embedding Cosine Similarity ---")
for i, score in enumerate(cosine_scores[0]): # only one candidate embedding, so index [0]
    print(f"Candidate vs Reference {i+1}: {score.item():.4f}")

# Pick the highest similarity among the reference sentences
max_similarity = torch.max(cosine_scores[0]).item()
print(f"\nMaximum Similarity Score: {max_similarity:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


Loading Korean Sentence Transformer model...
Model loaded.
Encoding sentences...
Encoding complete.

--- Sentence Embedding Cosine Similarity ---
Candidate vs Reference 1: 0.7844
Candidate vs Reference 2: 0.7780

Maximum Similarity Score: 0.7844


In [10]:
# -*- coding: utf-8 -*-
import torch
from sentence_transformers import SentenceTransformer, util
from typing import List, Dict, Any
import numpy as np
import heapq # top_n_similar 계산을 위해 유지

# --- Model loading (ideally performed once, at application start) ---
# Load a pretrained Korean model
# Example models: 'jhgan/ko-sbert-sts', 'snunlp/KR-SBERT-V40K-klueNLI-augSTS', etc.
# On success `model` holds the loaded SentenceTransformer; on any failure it
# is set to None, which the cells below check before running.
print("Loading Korean Sentence Transformer model...")
# model_name = 'jhgan/ko-sbert-sts'
model_name = 'snunlp/KR-SBERT-V40K-klueNLI-augSTS'
try:
    # A GPU is used automatically when available
    model = SentenceTransformer(model_name)
    print(f"Model '{model_name}' loaded successfully.")
    # Move the model to the GPU (when available)
    if torch.cuda.is_available():
        model = model.to(torch.device("cuda"))
        print("Model moved to GPU.")
    else:
        print("GPU not available, using CPU.")
except Exception as e:
    print(f"Error loading Sentence Transformer model: {e}")
    model = None # set to None when loading fails

# --- Improved filtering function ---
def filter_and_score_qs_sentence_transformer(
    candidate_qs: List[str],
    existing_query_objects: List[Dict[str, Any]],
    model: SentenceTransformer, # the already-loaded model is passed in by the caller
    similarity_threshold: float = 0.8,
    top_n_similar: int = 10
) -> List[Dict[str, Any]]:
    """
    Filters candidate questions based on maximum semantic similarity using Sentence Transformers
    and scores the accepted ones.

    Candidates are processed in order. Each accepted candidate is added to the
    comparison pool immediately, so later candidates in the same batch are
    also compared against it.

    Args:
        candidate_qs: List of new candidate question strings.
        existing_query_objects: List of dictionaries, each representing an existing query
                                (must contain at least a 'q' key with the query string).
        model: The pre-loaded Sentence Transformer model (None disables filtering).
        similarity_threshold: A candidate is rejected when its highest cosine similarity
                              to any existing/accepted query is strictly greater than this value.
        top_n_similar: The number of most similar existing questions to record for accepted candidates.

    Returns:
        List of dictionaries, one per accepted candidate, with the keys 'q',
        'max_similarity_score_against_all', 'avg_similarity_score' and
        'most_similar_instructions' (query text -> rounded score, scores of
        0.01 or lower omitted). Empty when the model is None or no candidates
        are given.
    """
    # Explicit None check: the "not loaded" state is represented by None.
    if model is None:
        print("Error: Sentence Transformer model is not loaded. Cannot perform filtering.")
        return []

    if not candidate_qs:
        print("No candidate questions provided.")
        return []

    newly_accepted_query_objects = []
    existing_qs_list = [obj['q'] for obj in existing_query_objects]

    # --- Embedding computation ---
    # Existing queries are only encoded when there are any.
    existing_embeddings = None
    if existing_qs_list:
        print(f"Encoding {len(existing_qs_list)} existing queries...")
        existing_embeddings = model.encode(existing_qs_list, convert_to_tensor=True, show_progress_bar=True)
        print("Existing queries encoded.")

    # All candidates are encoded once, up front.
    print(f"Encoding {len(candidate_qs)} candidate queries...")
    candidate_embeddings = model.encode(candidate_qs, convert_to_tensor=True, show_progress_bar=True)
    print("Candidate queries encoded.")

    # Comparison pool (texts + embeddings); it grows as candidates are accepted.
    all_qs_strings_for_comparison = list(existing_qs_list)
    # Lower-cased texts of the pool for constant-time exact-duplicate checks.
    seen_lowercase = {q.lower() for q in existing_qs_list}
    # No copy is needed: the pool is only ever extended via torch.cat, which
    # builds a new tensor and leaves existing_embeddings untouched.
    all_embeddings_for_comparison = existing_embeddings

    print(f"\n--- Filtering {len(candidate_qs)} candidates for diversity against {len(all_qs_strings_for_comparison)} existing/accepted queries ---")

    for i, q_new in enumerate(candidate_qs):
        q_new_lower = q_new.lower()

        # 1. Silently drop exact (case-insensitive) duplicates.
        if q_new_lower in seen_lowercase:
            continue

        cand_embedding = candidate_embeddings[i] # pre-computed embedding of this candidate

        # 2. Similarity against everything in the pool. With an empty pool
        #    the scores stay at 0.0, so the first candidate is always accepted.
        max_similarity = 0.0
        avg_similarity = 0.0
        similarities_list = [] # (score, query) tuples

        if all_embeddings_for_comparison is not None and all_embeddings_for_comparison.shape[0] > 0:
            # One candidate ([1, dim]) against the whole pool -> one score per pool entry.
            cosine_scores = util.pytorch_cos_sim(cand_embedding.unsqueeze(0), all_embeddings_for_comparison)[0]
            # Move to CPU / numpy for the reductions and for pairing with the texts.
            cosine_scores_cpu = cosine_scores.cpu().numpy()
            max_similarity = np.max(cosine_scores_cpu)
            avg_similarity = np.mean(cosine_scores_cpu)
            similarities_list = list(zip(cosine_scores_cpu, all_qs_strings_for_comparison))

        # 3. Alpaca-style filter: skip when the best match is above the threshold.
        if max_similarity > similarity_threshold:
            print(f"Skipping (MaxSim {max_similarity:.3f} > {similarity_threshold}): \"{q_new}\"")
            continue

        # 4. Accepted: record the scores and extend the pool.
        print(f"Accepting (MaxSim {max_similarity:.3f} <= {similarity_threshold}): \"{q_new}\"")

        most_similar_dict = {}
        if similarities_list:
            # nlargest already caps the result at the list length.
            top_n = heapq.nlargest(top_n_similar, similarities_list, key=lambda item: item[0])
            # Rounded to 4 decimals; scores of 0.01 or lower are left out.
            most_similar_dict = {q: round(float(score), 4) for score, q in top_n if float(score) > 0.01}

        new_obj = {
            "q": q_new,
            "max_similarity_score_against_all": round(float(max_similarity), 4),
            "avg_similarity_score": round(float(avg_similarity), 4),
            "most_similar_instructions": most_similar_dict # key name kept for compatibility
        }
        newly_accepted_query_objects.append(new_obj)

        # Make the accepted query part of the pool for the remaining candidates.
        all_qs_strings_for_comparison.append(q_new)
        seen_lowercase.add(q_new_lower)
        cand_embedding_expanded = cand_embedding.unsqueeze(0) # shape [1, embedding_dim]
        if all_embeddings_for_comparison is None:
            all_embeddings_for_comparison = cand_embedding_expanded
        else:
            all_embeddings_for_comparison = torch.cat((all_embeddings_for_comparison, cand_embedding_expanded), dim=0)

    print(f"\n--- Accepted {len(newly_accepted_query_objects)} new diverse query objects this round ---")
    return newly_accepted_query_objects

# --- Example usage ---
# Demonstrates the filter on a handful of Korean sample queries and prints
# the accepted ones together with their similarity scores.
if __name__ == "__main__":
    if model: # run only when the model was loaded successfully
        # Queries that already exist (sample data)
        existing_queries = [
            {"q": "오늘 날씨 어때?", "other_data": 1},
            {"q": "서울 맛집 추천해줘", "other_data": 2},
            {"q": "파이썬으로 웹사이트 만드는 법 알려줘", "other_data": 3}
        ]

        # Newly arrived candidate queries (sample data)
        candidate_queries = [
            "오늘 서울 날씨 알려줄래?", # similar to an existing query
            "제주도 가볼만한 곳",
            "파이썬 웹 개발 방법", # similar to an existing query
            "점심 메뉴 추천",
            "오늘 날씨 어때?" # exact match of an existing query
        ]

        # Run filtering and scoring
        accepted_queries = filter_and_score_qs_sentence_transformer(
            candidate_qs=candidate_queries,
            existing_query_objects=existing_queries,
            model=model,
            similarity_threshold=0.8, # similarity threshold (tunable)
            top_n_similar=3
        )

        print("\n--- Accepted Queries ---")
        for i, query_obj in enumerate(accepted_queries):
            print(f"{i+1}. Query: \"{query_obj['q']}\"")
            print(f"   Max Similarity: {query_obj['max_similarity_score_against_all']:.4f}")
            print(f"   Avg Similarity: {query_obj['avg_similarity_score']:.4f}")
            print(f"   Most Similar ({len(query_obj['most_similar_instructions'])}): {query_obj['most_similar_instructions']}")
            print("-" * 20)
    else:
        print("Skipping example usage because the model could not be loaded.")



Loading Korean Sentence Transformer model...
Model 'snunlp/KR-SBERT-V40K-klueNLI-augSTS' loaded successfully.
Model moved to GPU.
Encoding 3 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 99.29it/s]


Existing queries encoded.
Encoding 5 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 131.47it/s]

Candidate queries encoded.

--- Filtering 5 candidates for diversity against 3 existing/accepted queries ---
Accepting (MaxSim 0.655 <= 0.8): "오늘 서울 날씨 알려줄래?"
Accepting (MaxSim 0.527 <= 0.8): "제주도 가볼만한 곳"
Accepting (MaxSim 0.757 <= 0.8): "파이썬 웹 개발 방법"
Accepting (MaxSim 0.476 <= 0.8): "점심 메뉴 추천"

--- Accepted 4 new diverse query objects this round ---

--- Accepted Queries ---
1. Query: "오늘 서울 날씨 알려줄래?"
   Max Similarity: 0.6547
   Avg Similarity: 0.4588
   Most Similar (3): {'오늘 날씨 어때?': 0.6547, '서울 맛집 추천해줘': 0.5066, '파이썬으로 웹사이트 만드는 법 알려줘': 0.2152}
--------------------
2. Query: "제주도 가볼만한 곳"
   Max Similarity: 0.5267
   Avg Similarity: 0.2935
   Most Similar (3): {'서울 맛집 추천해줘': 0.5267, '오늘 날씨 어때?': 0.2773, '오늘 서울 날씨 알려줄래?': 0.2509}
--------------------
3. Query: "파이썬 웹 개발 방법"
   Max Similarity: 0.7572
   Avg Similarity: 0.2394
   Most Similar (3): {'파이썬으로 웹사이트 만드는 법 알려줘': 0.7572, '서울 맛집 추천해줘': 0.1332, '제주도 가볼만한 곳': 0.125}
--------------------
4. Query: "점심 메뉴 추천"
   Max Similarity: 0.4




In [11]:


# --- Main Execution Logic ---
# Driver: for each turn, repeatedly asks the LLM for candidate queries,
# filters them for diversity against everything accepted so far, appends the
# survivors (up to the per-turn target) and saves the list after every turn.
if __name__ == "__main__":

    if not model:
        print("Sentence Transformer model is not loaded. Exiting.")
        # NOTE(review): exit() inside a notebook may stop the kernel rather than just this cell -- confirm this is intended.
        exit() # cannot run without a model

    # Load previously accepted query objects
    accepted_query_objects = load_queries_with_scores(QUERIES_FILENAME)
    initial_query_count = len(accepted_query_objects)
    # Overall target: initial count + (number of turns * target per turn)
    overall_target = initial_query_count + (NUM_GENERATION_TURNS * QUERIES_TO_GENERATE_PER_TURN)

    print(f"Starting Generation Process.")
    print(f"Initial query objects loaded: {initial_query_count}")
    print(f"Targeting {QUERIES_TO_GENERATE_PER_TURN} new queries per turn for {NUM_GENERATION_TURNS} turns.")
    print(f"Overall target: {overall_target} query objects.")
    print(f"Using Similarity Threshold (Sentence Transformer): {SIMILARITY_THRESHOLD}")
    print("-" * 30)

    total_added_this_session = 0

    # Repeat for the configured number of turns
    for turn in range(1, NUM_GENERATION_TURNS + 1):
        print(f"\n=== Turn {turn}/{NUM_GENERATION_TURNS} ===")
        target_for_this_turn = QUERIES_TO_GENERATE_PER_TURN # number of queries to add this turn
        added_in_this_turn = 0 # number actually added this turn
        attempts_this_turn = 0 # attempts made this turn

        # Query strings used for the LLM prompt (rebuilt at the start of each turn)
        # Note: building this list may take a while if accepted_query_objects grows very large
        current_qs_list_for_prompting = [obj['q'] for obj in accepted_query_objects]

        # Loop until this turn's target is met or the maximum number of attempts is reached
        while added_in_this_turn < target_for_this_turn and attempts_this_turn < MAX_ATTEMPTS_PER_TURN:
            attempts_this_turn += 1
            print(f"\n--- Turn {turn} | Attempt {attempts_this_turn}/{MAX_ATTEMPTS_PER_TURN} ---")
            print(f"Current total query objects: {len(accepted_query_objects)}")
            print(f"Goal for this turn: {added_in_this_turn}/{target_for_this_turn} new queries")

            # Number of queries still needed this turn
            num_needed_for_turn = target_for_this_turn - added_in_this_turn
            # Request a few more than needed (some will be dropped by filtering), capped at the batch size
            num_to_generate_this_attempt = min(REQUEST_BATCH_SIZE_PER_TURN, num_needed_for_turn + 5)

            # Generate candidate query strings with the LLM
            candidate_qs_strings = generate_candidate_qs_with_llm(
                tool_schemas_json,
                num_to_generate=num_to_generate_this_attempt,
                # Only the accepted query strings are passed to the LLM
                existing_qs_list=current_qs_list_for_prompting,
            )

            # The LLM returned no valid candidates (this still counts as an attempt)
            if not candidate_qs_strings:
                print("LLM did not return any valid candidate queries or an error occurred. Retrying after delay...")
                time.sleep(5) # wait briefly before retrying
                continue

            # *** Changed part: call the Sentence Transformer based filtering function ***
            # Filter the candidate queries and compute their scores
            # The filter receives the full list of query objects and the loaded model
            new_query_objects = filter_and_score_qs_sentence_transformer(
                candidate_qs=candidate_qs_strings,
                existing_query_objects=accepted_query_objects, # compared against every object accepted so far
                model=model, # the loaded Sentence Transformer model
                similarity_threshold=SIMILARITY_THRESHOLD,
                top_n_similar=TOP_N_SIMILAR
            )
            # **********************************************************

            # Append the newly accepted query objects
            added_now = 0
            for obj in new_query_objects:
                # Do not exceed this turn's target
                if added_in_this_turn < target_for_this_turn:
                    accepted_query_objects.append(obj)
                    # Keep the prompt example list in sync so the next LLM call can sample
                    # from the newly accepted queries (the filter itself reads accepted_query_objects)
                    current_qs_list_for_prompting.append(obj['q'])
                    added_in_this_turn += 1
                    added_now += 1
                else:
                    break # stop once this turn's target is met

            print(f"Accepted {added_now} new diverse query objects in this attempt.")

            # From the second attempt onward, warn when no progress was made
            if added_now == 0 and attempts_this_turn > 1:
                print("Warning: No new diverse queries accepted in this attempt.")

            # Check whether this turn's target has been reached
            if added_in_this_turn >= target_for_this_turn:
                print(f"--- Turn {turn} goal reached ({added_in_this_turn} new queries added). ---")
                break # target met: leave the attempt loop for this turn

            time.sleep(1) # short delay between attempts

        # At the end of each turn (target met or attempts exhausted), save the updated list
        total_added_this_session += added_in_this_turn
        save_queries_with_scores(accepted_query_objects, QUERIES_FILENAME)
        print(f"--- End of Turn {turn}. Total query objects now: {len(accepted_query_objects)}. Added this turn: {added_in_this_turn}. ---")


    # --- Final summary ---
    print("-" * 30)
    print(f"Generation process completed after {NUM_GENERATION_TURNS} turns.")
    print(f"Total query objects generated or loaded: {len(accepted_query_objects)}")
    print(f"Total new query objects added in this session: {total_added_this_session}")
    print(f"Final results saved to {QUERIES_FILENAME}")

    print("\nFinal list of diverse query objects (showing last added marked with '*'):")
    # Index at which this session's additions begin
    start_index = max(0, len(accepted_query_objects) - total_added_this_session)
    for i, obj in enumerate(accepted_query_objects):
         marker = "*" if i >= start_index else " " # mark this session's additions
         # Key names follow the objects returned by filter_and_score_qs_sentence_transformer
         similar_dict = obj.get('most_similar_instructions', {}) # dict of the most similar queries
         # Compact display of the similar items (first 30 characters of the query + score)
         similar_items = [f"'{q[:30]}...':{s:.2f}" for q, s in similar_dict.items()]
         similar_str = ", ".join(similar_items) if similar_items else "{}" # show {} when empty

         # Fetch the max and average similarity scores (0 when the key is missing)
         max_sim_score = obj.get('max_similarity_score_against_all', 0)
         avg_sim_score = obj.get('avg_similarity_score', 0)

         # Final output format
         print(f"{marker} {i+1}. q: \"{obj['q']}\" (MaxS: {max_sim_score:.3f}, AvgS: {avg_sim_score:.3f}, TopSim: {similar_str})")


    # --- Placeholder for Step 2 (Generating Full Blueprints) ---
    print("\n--- Placeholder for Step 2: Generating Full Blueprints ---")
    # The accepted_query_objects list can now be iterated, using each 'q' field
    # to implement the logic that generates a full blueprint.
    # Example:
    # final_blueprints = []
    # for query_obj in accepted_query_objects:
    #     q_final = query_obj['q']
    #     # blueprint_dict = generate_full_blueprint_for_q(q_final, tool_schemas_json, client)
    #     # if blueprint_dict: final_blueprints.append(blueprint_dict)
    # print(f"\n--- Generated {len(final_blueprints)} full Blueprints ---")



No existing query file found (diverse_queries_with_scores_v4.json). Starting fresh.
Starting Generation Process.
Initial query objects loaded: 0
Targeting 10 new queries per turn for 3 turns.
Overall target: 30 query objects.
Using Similarity Threshold (Sentence Transformer): 0.8
------------------------------

=== Turn 1/3 ===

--- Turn 1 | Attempt 1/5 ---
Current total query objects: 0
Goal for this turn: 0/10 new queries
--- Calling LLM (deepseek-r1) to generate ~15 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 17 ---
--- Parsed 15 potentially valid candidate queries ---
Encoding 15 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 96.48it/s]

Candidate queries encoded.

--- Filtering 15 candidates for diversity against 0 existing/accepted queries ---
Accepting (MaxSim 0.000 <= 0.8): "Add a meeting titled "Project Kickoff" for September 12th from 2:30 PM to 4:00 PM"
Accepting (MaxSim 0.667 <= 0.8): "Show my calendar events between October 1st and October 7th"
Accepting (MaxSim 0.788 <= 0.8): "Find recent news about quantum computing breakthroughs"
Accepting (MaxSim 0.729 <= 0.8): "Can you schedule a doctor's appointment on 2024-03-05 at 9:15 AM for 45 minutes?"
Skipping (MaxSim 0.807 > 0.8): "Check if I have any conflicts on November 18th between 1 PM and 3 PM"
Accepting (MaxSim 0.794 <= 0.8): "Search for top-rated Italian restaurants in Chicago and add a dinner reservation to my calendar"
Skipping (MaxSim 0.809 > 0.8): "How do I grant this app permission to manage my calendar?"
Accepting (MaxSim 0.764 <= 0.8): "Create an all-day event called "Anniversary" on 2024-05-21"
Accepting (MaxSim 0.792 <= 0.8): "What’s my availabili





--- Turn 1 | Attempt 2/5 ---
Current total query objects: 8
Goal for this turn: 8/10 new queries
--- Calling LLM (deepseek-r1) to generate ~7 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 9 ---
--- Parsed 7 potentially valid candidate queries ---
Encoding 8 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 115.93it/s]


Existing queries encoded.
Encoding 7 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 128.75it/s]

Candidate queries encoded.

--- Filtering 7 candidates for diversity against 8 existing/accepted queries ---
Accepting (MaxSim 0.796 <= 0.8): "Schedule a dentist appointment for next Thursday from 10:00 AM to 11:30 AM with the summary "Root Canal Checkup""
Skipping (MaxSim 0.851 > 0.8): "Can you show me all calendar entries between June 1st and June 15th?"
Skipping (MaxSim 0.820 > 0.8): "How do I authorize calendar access for the assistant to manage my events?"
Skipping (MaxSim 0.810 > 0.8): "Look up the current weather forecast in Tokyo this weekend"
Skipping (MaxSim 0.822 > 0.8): "Create a 2-hour block titled "Focus Time" starting today at 3:00 PM"
Skipping (MaxSim 0.831 > 0.8): "What are the latest trends in artificial intelligence research according to recent articles?"
Skipping (MaxSim 0.812 > 0.8): "Check if I have any meetings scheduled this Friday afternoon"

--- Accepted 1 new diverse query objects this round ---
Accepted 1 new diverse query objects in this attempt.






--- Turn 1 | Attempt 3/5 ---
Current total query objects: 9
Goal for this turn: 9/10 new queries
--- Calling LLM (deepseek-r1) to generate ~6 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 8 ---
--- Parsed 6 potentially valid candidate queries ---
Encoding 9 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 109.61it/s]


Existing queries encoded.
Encoding 6 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 128.33it/s]

Candidate queries encoded.

--- Filtering 6 candidates for diversity against 9 existing/accepted queries ---
Skipping (MaxSim 0.850 > 0.8): "Authorize calendar access and show my events from June 5th to June 10th"
Skipping (MaxSim 0.801 > 0.8): "Search for TED Talk schedules in September 2024 and create reminder events for three I might like"
Skipping (MaxSim 0.830 > 0.8): "Block off August 12th 2024 from 8:00 AM to 12:00 PM for "Strategic Planning Workshop""
Skipping (MaxSim 0.826 > 0.8): "Check if I have conflicts on July 4th and find local fireworks display times"
Skipping (MaxSim 0.810 > 0.8): "Enable calendar permissions then schedule a client call on 2024-11-14 from 14:30 to 15:15"
Accepting (MaxSim 0.784 <= 0.8): "What's the date range for CES 2025? Create placeholder events spanning those dates with "Tech Conference" summaries"

--- Accepted 1 new diverse query objects this round ---
Accepted 1 new diverse query objects in this attempt.
--- Turn 1 goal reached (10 new queries a




--- LLM Response Received ---
--- Lines after removing <think> blocks: 17 ---
--- Parsed 15 potentially valid candidate queries ---
Encoding 10 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 114.68it/s]


Existing queries encoded.
Encoding 15 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 100.59it/s]

Candidate queries encoded.

--- Filtering 15 candidates for diversity against 10 existing/accepted queries ---
Skipping (MaxSim 0.816 > 0.8): "Schedule a project review for 2024-04-10 from 2:00 PM to 3:30 PM"
Skipping (MaxSim 0.811 > 0.8): "List all my calendar entries between April 5th and April 12th, 2024"
Accepting (MaxSim 0.748 <= 0.8): "Search the web for the latest AI conference dates in 2024"
Skipping (MaxSim 0.876 > 0.8): "Book a dentist appointment on March 15th at 10:00 AM lasting one hour"
Accepting (MaxSim 0.770 <= 0.8): "Authorize access to manage my calendar events"
Skipping (MaxSim 0.839 > 0.8): "What's the weather forecast for New York City this weekend?"
Accepting (MaxSim 0.761 <= 0.8): "Create a marketing team meeting on June 5th, 2024 from 1:30 PM to 2:30 PM"
Skipping (MaxSim 0.806 > 0.8): "Do I have any events scheduled on July 4th, 2024?"
Skipping (MaxSim 0.810 > 0.8): "Look up the capital of Brazil and add a study session tomorrow at 4:00 PM"
Skipping (MaxSim 0.80





--- Turn 2 | Attempt 2/5 ---
Current total query objects: 14
Goal for this turn: 4/10 new queries
--- Calling LLM (deepseek-r1) to generate ~11 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 13 ---
--- Parsed 11 potentially valid candidate queries ---
Encoding 14 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 89.83it/s]


Existing queries encoded.
Encoding 11 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.77it/s]

Candidate queries encoded.

--- Filtering 11 candidates for diversity against 14 existing/accepted queries ---
Accepting (MaxSim 0.783 <= 0.8): "Check my schedule for any conflicts on November 22nd between 3 PM and 5 PM."
Skipping (MaxSim 0.825 > 0.8): "How do I grant permission for you to manage my calendar appointments?"
Skipping (MaxSim 0.864 > 0.8): "Find upcoming tech conferences in Berlin and add the DevOps Days event to my agenda."
Skipping (MaxSim 0.879 > 0.8): "Set up a recurring weekly team sync every Monday at 10 AM starting October 7th."
Skipping (MaxSim 0.898 > 0.8): "What’s on my calendar from tomorrow morning through Friday evening?"
Accepting (MaxSim 0.773 <= 0.8): "Search for volunteer opportunities in Austin and block time for Habitat for Humanity on September 21st."
Accepting (MaxSim 0.780 <= 0.8): "Can you see if I’m free on December 3rd at 2 PM for a client workshop?"
Skipping (MaxSim 0.855 > 0.8): "Look up citywide recycling dates and schedule reminders the day be





--- Turn 2 | Attempt 3/5 ---
Current total query objects: 18
Goal for this turn: 8/10 new queries
--- Calling LLM (deepseek-r1) to generate ~7 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 9 ---
--- Parsed 7 potentially valid candidate queries ---
Encoding 18 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 93.11it/s]


Existing queries encoded.
Encoding 7 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 78.34it/s]

Candidate queries encoded.

--- Filtering 7 candidates for diversity against 18 existing/accepted queries ---
Skipping (MaxSim 0.811 > 0.8): "Check my schedule from June 10th to June 15th and create a 'Vacation' event if available"
Accepting (MaxSim 0.796 <= 0.8): "Find the dates for Coachella 2024 and block my calendar with 'Music Festival'"
Skipping (MaxSim 0.853 > 0.8): "Set up a reminder for my mom's birthday on August 8th from 9 AM to 10 AM"
Accepting (MaxSim 0.734 <= 0.8): "What do I have scheduled on February 14th next year?"
Skipping (MaxSim 0.801 > 0.8): "Search for TED Talk events in Boston this fall and schedule attendance on the first available date"
Skipping (MaxSim 0.852 > 0.8): "Authorize calendar access and add a 'Financial Review' meeting next Friday at 3 PM"
Skipping (MaxSim 0.830 > 0.8): "Look up when the next solar eclipse occurs and mark it as 'Sky Event' in my calendar"

--- Accepted 2 new diverse query objects this round ---
Accepted 2 new diverse query objects i




--- LLM Response Received ---
--- Lines after removing <think> blocks: 17 ---
--- Parsed 15 potentially valid candidate queries ---
Encoding 20 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 85.90it/s]


Existing queries encoded.
Encoding 15 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 63.36it/s]

Candidate queries encoded.

--- Filtering 15 candidates for diversity against 20 existing/accepted queries ---
Accepting (MaxSim 0.799 <= 0.8): "Check if I have any meetings scheduled between March 10th and March 15th."
Skipping (MaxSim 0.965 > 0.8): "Find the dates for Coachella 2024 and add them to my calendar as "Music Festival"."
Skipping (MaxSim 0.853 > 0.8): "Block off next Thursday from 3 PM to 5 PM for a project deadline review."
Skipping (MaxSim 0.800 > 0.8): "What public holidays in Japan occur in Q3 2024? Create calendar entries for each."
Skipping (MaxSim 0.802 > 0.8): "Search for TED Talk livestream schedules this month and set reminders 10 minutes prior."
Skipping (MaxSim 0.870 > 0.8): "Schedule a webinar titled "AI Ethics Panel" on 2024-09-12 from 2:30 PM to 4:00 PM."
Skipping (MaxSim 0.809 > 0.8): "Show all events tagged as "Client Meeting" from April 1st to April 30th."
Skipping (MaxSim 0.843 > 0.8): "How long does standard shipping take from Germany? Add a reminder 5 





--- Turn 3 | Attempt 2/5 ---
Current total query objects: 21
Goal for this turn: 1/10 new queries
--- Calling LLM (deepseek-r1) to generate ~14 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 16 ---
--- Parsed 14 potentially valid candidate queries ---
Encoding 21 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 94.49it/s]


Existing queries encoded.
Encoding 14 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 58.98it/s]

Candidate queries encoded.

--- Filtering 14 candidates for diversity against 21 existing/accepted queries ---
Skipping (MaxSim 0.841 > 0.8): "Add a reminder for my doctor's appointment on 2024-03-15 at 2:30 PM."
Skipping (MaxSim 0.884 > 0.8): "Check if I have meetings scheduled between April 1st and April 5th, 2024."
Skipping (MaxSim 0.859 > 0.8): "Search for upcoming tech conferences in Berlin and save the dates in my calendar."
Skipping (MaxSim 0.859 > 0.8): "Schedule a monthly team sync every third Thursday starting May 2024 at 11 AM."
Skipping (MaxSim 0.867 > 0.8): "Find recipe ideas for gluten-free desserts and block kitchen time Saturday afternoon."
Skipping (MaxSim 0.924 > 0.8): "Authorize calendar access to check my existing commitments."
Skipping (MaxSim 0.812 > 0.8): "What's my schedule looking like from September 9th to September 13th?"
Skipping (MaxSim 0.835 > 0.8): "Set up a car maintenance reminder for June 10th at 8 AM with "Oil Change" as the title."
Skipping (MaxSim 0





--- Turn 3 | Attempt 3/5 ---
Current total query objects: 21
Goal for this turn: 1/10 new queries
--- Calling LLM (deepseek-r1) to generate ~14 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 16 ---
--- Parsed 14 potentially valid candidate queries ---
Encoding 21 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 77.88it/s]


Existing queries encoded.
Encoding 14 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 73.22it/s]

Candidate queries encoded.

--- Filtering 14 candidates for diversity against 21 existing/accepted queries ---
Skipping (MaxSim 0.821 > 0.8): "Add a team brainstorming session on April 22nd from 10:30 AM to 12 PM."
Skipping (MaxSim 0.845 > 0.8): "Search for upcoming developer conferences in Berlin during October 2024."
Accepting (MaxSim 0.771 <= 0.8): "When does E3 2024 start? Block those three days with "Gaming Expo" titles."
Skipping (MaxSim 0.818 > 0.8): "Can you check if I'm free between August 12th and 16th for a potential vacation?"
Skipping (MaxSim 0.830 > 0.8): "Schedule a car service appointment lasting 90 minutes on the next available weekday morning."
Skipping (MaxSim 0.850 > 0.8): "Find the dates for Paris Fashion Week 2024 and create tentative events labeled "Fashion Show Prep"."
Accepting (MaxSim 0.798 <= 0.8): "What's my schedule look like on June 7th after 1 PM?"
Skipping (MaxSim 0.838 > 0.8): "Create a recurring event for yoga every Wednesday at 7:30 AM beginning Septe





--- Turn 3 | Attempt 4/5 ---
Current total query objects: 24
Goal for this turn: 4/10 new queries
--- Calling LLM (deepseek-r1) to generate ~11 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 13 ---
--- Parsed 11 potentially valid candidate queries ---
Encoding 24 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 95.04it/s]


Existing queries encoded.
Encoding 11 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 50.80it/s]

Candidate queries encoded.

--- Filtering 11 candidates for diversity against 24 existing/accepted queries ---
Skipping (MaxSim 0.815 > 0.8): "Add a project deadline for 'Q4 Report' on 2024-11-25 from 3:00 PM to 4:00 PM"
Skipping (MaxSim 0.847 > 0.8): "Show my calendar entries between August 10th and August 17th"
Skipping (MaxSim 0.805 > 0.8): "Look up when Diwali starts this year and block those dates"
Skipping (MaxSim 0.821 > 0.8): "Set up a parent-teacher conference call on May 6th at 4:30 PM for 20 minutes"
Skipping (MaxSim 0.812 > 0.8): "Find current exhibitions at the Louvre Museum"
Skipping (MaxSim 0.868 > 0.8): "Check if my calendar is free on October 12th at 2:00 PM"
Skipping (MaxSim 0.811 > 0.8): "How do I enable calendar permissions for this assistant?"
Skipping (MaxSim 0.848 > 0.8): "Search for local coding bootcamp schedules and add the July intake dates"
Accepting (MaxSim 0.772 <= 0.8): "Create an all-day event titled 'Anniversary Vacation' on September 14th"
Skipping (Ma





--- Turn 3 | Attempt 5/5 ---
Current total query objects: 25
Goal for this turn: 5/10 new queries
--- Calling LLM (deepseek-r1) to generate ~10 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 12 ---
--- Parsed 10 potentially valid candidate queries ---
Encoding 25 existing queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 94.06it/s]


Existing queries encoded.
Encoding 10 candidate queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 51.03it/s]

Candidate queries encoded.

--- Filtering 10 candidates for diversity against 25 existing/accepted queries ---
Skipping (MaxSim 0.864 > 0.8): "Set up a team sync every Thursday at 4 PM starting April 4th for 30 minutes."
Skipping (MaxSim 0.802 > 0.8): "Search for upcoming developer conferences in Europe and block those dates."
Skipping (MaxSim 0.842 > 0.8): "Check if I'm available between May 7th morning and May 9th noon."
Skipping (MaxSim 0.867 > 0.8): "Authorize calendar permissions so you can view my upcoming appointments."
Skipping (MaxSim 0.801 > 0.8): "Add my dentist visit on 2024-06-12 from 10:00 to 10:45 as "Dental Checkup"."
Skipping (MaxSim 0.815 > 0.8): "Find when Ramadan ends this year and mark the celebration date."
Skipping (MaxSim 0.890 > 0.8): "What's booked on my calendar from tomorrow until Friday?"
Skipping (MaxSim 0.813 > 0.8): "Look up the Super Bowl 2025 date and create an event to watch it."
Skipping (MaxSim 0.824 > 0.8): "Schedule a car maintenance reminder for 




Saved 25 query objects to diverse_queries_with_scores_v4.json
--- End of Turn 3. Total query objects now: 25. Added this turn: 5. ---
------------------------------
Generation process completed after 3 turns.
Total query objects generated or loaded: 25
Total new query objects added in this session: 25
Final results saved to diverse_queries_with_scores_v4.json

Final list of diverse query objects (showing last added marked with '*'):
* 1. q: "Add a meeting titled "Project Kickoff" for September 12th from 2:30 PM to 4:00 PM" (MaxS: 0.000, AvgS: 0.000, TopSim: {})
* 2. q: "Show my calendar events between October 1st and October 7th" (MaxS: 0.667, AvgS: 0.667, TopSim: 'Add a meeting titled "Project ...':0.67)
* 3. q: "Find recent news about quantum computing breakthroughs" (MaxS: 0.788, AvgS: 0.730, TopSim: 'Show my calendar events betwee...':0.79, 'Add a meeting titled "Project ...':0.67)
* 4. q: "Can you schedule a doctor's appointment on 2024-03-05 at 9:15 AM for 45 minutes?" (MaxS: 0.7

To print only the query strings from the JSON file `diverse_queries_with_scores_v4.json`, use the following command:

```shell
jq -r '.[] | .q' diverse_queries_with_scores_v4.json
```