In [5]:
import json
import random
import re
import time
import os
from typing import List, Dict, Any, Optional, Annotated, Callable, Tuple
from difflib import SequenceMatcher # Option 1 for basic similarity
import heapq # For finding top N similar items efficiently

# Option 2: Install rouge-score for Alpaca-like filtering: pip install rouge-score
try:
    from rouge_score import rouge_scorer
    ROUGE_AVAILABLE = True
except ImportError:
    ROUGE_AVAILABLE = False
    print("Warning: rouge-score library not found. Falling back to basic similarity filtering.")
    print("Install it via 'pip install rouge-score' for better diversity filtering.")

# --- Ensure openai library is installed ---
try:
    from openai import OpenAI, APIError
except ImportError:
    print("Error: openai library not found. Please install it using 'pip install openai'")
    exit()

# --- Friendli AI Client Setup ---
# The API token is read from the FRIENDLI_TOKEN environment variable. When it
# is missing the script does NOT abort: it reports the problem, substitutes a
# placeholder token and carries on.
token = os.getenv("FRIENDLI_TOKEN")
if not token:
    print("Error: FRIENDLI_TOKEN environment variable not set.")
    print("Please set the environment variable or replace '<YOUR_FRIENDLI_TOKEN>' in the code.")
    token = "<YOUR_FRIENDLI_TOKEN>" # Placeholder

# Checked separately from the block above so the warning also fires when the
# environment variable itself contains the literal placeholder text.
if token == "<YOUR_FRIENDLI_TOKEN>":
     print("Warning: Using placeholder Friendli token. LLM calls will likely fail.")

# The OpenAI client class is pointed at Friendli's serverless base URL.
client = OpenAI(
    base_url = "https://api.friendli.ai/serverless/v1",
    api_key = token
)
LLM_MODEL_NAME = "deepseek-r1" # Model name sent with every chat completion request

# --- Tool Definitions and Schema Generation ---
# `get_function_schema` is expected to come from a local function_schema.py
# module (not shown here).
try:
    from function_schema import get_function_schema
except ImportError:
    print("Error: function_schema.py not found. Please ensure it's in the same directory.")
    # Fallback so the script can still run partially: builds a minimal schema
    # from the function's name and docstring, with an empty parameter spec.
    def get_function_schema(func):
        return {"name": func.__name__, "description": func.__doc__, "parameters": {}}


def get_weather(
    city: Annotated[str, "The city to get the weather for"],
    unit: Annotated[Optional[str], "The unit to return the temperature in"] = "celcius",
) -> str:
    """Returns the weather for the given city."""
    # Stub tool: the reading is hard-coded and `unit` is accepted but never
    # consulted. The docstring is kept verbatim because the fallback
    # `get_function_schema` copies `__doc__` into the tool description.
    report = "Weather for {} is 20°C".format(city)
    return report

def get_news(
    topic: Annotated[str, "The topic to get news for"],
    source: Annotated[Optional[str], "The source to get news from"] = None,
) -> str:
    """Returns the news for the given topic."""
    # Stub tool: a missing or empty source is reported as 'all sources'.
    # The docstring is kept verbatim because the fallback
    # `get_function_schema` copies `__doc__` into the tool description.
    origin = source or 'all sources'
    return "News for {} from {}".format(topic, origin)

def get_current_location() -> str:
    """Returns the current location of the user."""
    # Stub tool: always reports the same hard-coded place; no lookup is done.
    # The docstring is kept verbatim because the fallback
    # `get_function_schema` copies `__doc__` into the tool description.
    place = "Seoul, South Korea"
    return "Current location is " + place

# Registry of the callables exposed to the assistant as tools.
tools = [
    get_weather,
    get_news,
    get_current_location,
]

# One schema per tool, plus a pretty-printed JSON rendering that is embedded
# verbatim in the LLM user prompt.
tool_schemas = [get_function_schema(tool) for tool in tools]
tool_schemas_json = json.dumps(tool_schemas, indent=2)


# --- Configuration ---
# JSON file the accepted query objects are loaded from at start-up and saved
# to after every turn.
QUERIES_FILENAME = "diverse_queries_with_scores.json"
# Number of generation turns run by the main loop.
NUM_GENERATION_TURNS = 3
# Number of new queries each turn tries to add.
QUERIES_TO_GENERATE_PER_TURN = 10
# Upper bound on how many queries a single LLM request asks for; the main loop
# requests min(this value, remaining-for-turn + 5).
REQUEST_BATCH_SIZE_PER_TURN = 15
# Maximum LLM attempts per turn before the turn is abandoned.
MAX_ATTEMPTS_PER_TURN = 5
# A candidate is rejected when its highest similarity to any known query is
# strictly greater than this value.
SIMILARITY_THRESHOLD = 0.7
# How many of the most similar known queries are recorded with each accepted query.
TOP_N_SIMILAR = 5

# --- Helper Functions ---

def load_queries_with_scores(filename: str) -> List[Dict[str, Any]]:
    """Load previously generated query objects from a JSON file.

    The file must contain a JSON list of objects, each with a string-valued
    'q' entry. Any problem (missing file, unreadable file, undecodable bytes,
    malformed JSON, unexpected structure) is reported on stdout and results
    in an empty list, so the caller can always start fresh.

    Args:
        filename: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list of query objects, or [] when nothing usable was found.
    """
    # Open directly instead of testing for existence first: this avoids the
    # check-then-open race and lets every failure go through one code path.
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"No existing query file found ({filename}). Starting fresh.")
        return []
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
        # UnicodeDecodeError covers files that are not valid UTF-8; it is a
        # ValueError but not a JSONDecodeError, so it must be named explicitly.
        print(f"Error loading {filename}: {e}. Starting fresh.")
        return []

    # 'q' must be a string: downstream code lower-cases and compares it.
    well_formed = isinstance(data, list) and all(
        isinstance(item, dict) and isinstance(item.get('q'), str)
        for item in data
    )
    if not well_formed:
        print(f"Warning: Invalid format found in {filename}. Starting fresh.")
        return []

    print(f"Loaded {len(data)} query objects from {filename}")
    return data

def save_queries_with_scores(query_objects: List[Dict[str, Any]], filename: str):
    """Save the list of query objects to a JSON file without risking old data.

    The data is serialized in memory first, written to a sibling temporary
    file and then moved over the target in one step, so an existing file is
    never left truncated or half-written.

    Args:
        query_objects: JSON-serializable query objects to store.
        filename: Destination path (written as UTF-8, 4-space indented).

    Raises:
        TypeError, ValueError: If `query_objects` cannot be serialized; the
            existing file is left untouched in that case.
    """
    # Serialize before touching the filesystem: a serialization failure must
    # not cost us the previously saved file.
    payload = json.dumps(query_objects, indent=4, ensure_ascii=False)
    temp_path = f"{filename}.tmp"
    try:
        with open(temp_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        # os.replace swaps the file in atomically on the same filesystem.
        os.replace(temp_path, filename)
    except OSError as e:
        print(f"Error saving queries to {filename}: {e}")
        # Best-effort cleanup of the partial temporary file; a failure here
        # is harmless and must not mask the error already reported above.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        return
    print(f"Saved {len(query_objects)} query objects to {filename}")

def is_valid_query_line(q_text: str) -> bool:
    """
    Decide whether one line of LLM output looks like a genuine user query.

    Intended to run on text from which <think> blocks were already removed;
    it rejects leftover reasoning, list markup, headings and path-like tokens.
    Returns True only when none of the rejection rules match.
    """
    line = q_text.strip()
    if not line:
        return False

    # Openers typical of commentary, bullets, headings or code fences.
    non_query_openers = (
        "- ", "* ", "Okay,", "First,", "Next,", "Now,", "Let me", "Wait,",
        "Also,", "###", "//", "```", "Queries", "That's", "This should",
        "Avoid", "Check for",
    )

    rejection_rules = (
        line.startswith(non_query_openers),
        # Lines that introduce or describe steps.
        line.endswith(":"),
        "→" in line,
        # Numbered list items.
        re.match(r"^\d+\.", line) is not None,
        # A lone token containing no ASCII letter (punctuation, digits, ...).
        len(line.split()) <= 1 and re.search(r'[a-zA-Z]', line) is None,
        # Space-free text with both a slash and a dot: probably a path/snippet.
        "/" in line and "." in line and " " not in line,
    )
    return not any(rejection_rules)

def remove_think_blocks(text: str) -> str:
    """Remove reasoning wrapped in <think> tags from LLM output.

    Tags are matched case-insensitively and across newlines. Besides
    well-formed <think>...</think> pairs, two malformed shapes are handled:

    * an orphan closing tag (the opening tag was never emitted): everything
      up to and including the last such </think> is dropped;
    * an unterminated opening tag (e.g. the response was cut off mid
      reasoning): everything from that <think> to the end is dropped.

    Args:
        text: Raw model output.

    Returns:
        The text with all reasoning spans removed.
    """
    flags = re.DOTALL | re.IGNORECASE
    # 1. Well-formed blocks (non-greedy so separate blocks stay separate).
    text = re.sub(r"<think>.*?</think>", "", text, flags=flags)
    # 2. Any </think> still present has no opener; the text before it is reasoning.
    text = re.sub(r"\A.*</think>", "", text, flags=flags)
    # 3. Any <think> still present was never closed; the rest is reasoning.
    text = re.sub(r"<think>.*\Z", "", text, flags=flags)
    return text

# --- Step 1: Generate Diverse User Queries ('q') ---

def generate_candidate_qs_with_llm(
    tool_schemas_str: str,
    num_to_generate: int,
    existing_qs_list: List[str],
    llm_client: OpenAI
) -> List[str]:
    """Ask the LLM for candidate user queries ('q') and parse its reply.

    The reply has <think> blocks removed, is split into lines, and every line
    is passed through `is_valid_query_line`; rejected non-empty lines are
    logged. Failures of the LLM call, and replies without text content, are
    reported on stdout and produce an empty list rather than an exception.

    Args:
        tool_schemas_str: JSON text describing the available tools; embedded
            in the user prompt.
        num_to_generate: Number of queries requested from the model.
        existing_qs_list: Already accepted queries; up to five random ones are
            shown to the model as examples to avoid.
        llm_client: Client used for the chat completion request.

    Returns:
        Candidate query strings (possibly fewer or more than requested).
    """

    # Strengthened system prompt
    system_prompt = f"""Your ONLY task is to generate realistic, diverse user queries or requests ('q') suitable for an AI assistant with access to specific tools. These queries should be answerable using the provided tools. Vary the complexity, phrasing (questions, commands), and the tools potentially required.

**CRITICAL INSTRUCTIONS:**
1.  Output ONLY the raw user queries.
2.  Each query MUST be on a new line.
3.  **ABSOLUTELY DO NOT** include:
    * Explanations, comments, or justifications.
    * Thinking processes, reasoning steps (including anything like `<think>...</think>`).
    * Numbered lists, bullet points, or any formatting other than one query per line.
    * Any text before the first query or after the last query.
"""

    examples_prompt = ""
    if existing_qs_list:
        sample_existing = random.sample(existing_qs_list, min(len(existing_qs_list), 5))
        examples_prompt = "Critically, avoid generating queries too similar to these examples:\n- " + "\n- ".join(sample_existing) + "\n\n"

    user_prompt = f"""Based on the following available tools:
```json
{tool_schemas_str}
```

Generate exactly {num_to_generate} diverse user queries ('q'). Remember to vary the required tools, complexity, and phrasing. {examples_prompt}Output ONLY the queries, one per line:"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    try:
        print(f"--- Calling LLM ({LLM_MODEL_NAME}) to generate ~{num_to_generate} queries ---")
        completion = llm_client.chat.completions.create(
            model=LLM_MODEL_NAME,
            messages=messages,
            temperature=0.8, # Slightly higher temp for more creative queries
            max_tokens=2048 # Increased max_tokens
        )
        raw_llm_output = completion.choices[0].message.content
        print("--- LLM Response Received ---")
    except APIError as e:
        print(f"LLM API Error: {e}")
        return []
    except Exception as e:
        # Boundary with an external service: report and let the caller retry.
        print(f"An unexpected error occurred during LLM call: {e}")
        return []

    # The message content is nullable; without this guard the regex below
    # would raise TypeError outside the try block and abort the whole run.
    if not raw_llm_output:
        print("LLM returned an empty response.")
        return []

    # 1. Remove <think> blocks first
    cleaned_output = remove_think_blocks(raw_llm_output)

    # 2. Split into lines and apply line-level validity filter
    candidate_qs = []
    raw_lines = cleaned_output.split('\n')
    print(f"--- Lines after removing <think> blocks: {len(raw_lines)} ---")
    for line in raw_lines:
        clean_line = line.strip()
        if is_valid_query_line(clean_line): # Apply line filter here
            candidate_qs.append(clean_line)
        elif clean_line:
            print(f"Filtered out invalid line: '{clean_line}'") # Log filtered lines

    print(f"--- Parsed {len(candidate_qs)} potentially valid candidate queries ---")
    return candidate_qs


def calculate_similarity(q1: str, q2: str, scorer=None) -> float:
    """Return a similarity score for two queries.

    Identical strings score 1.0; if either string is shorter than five
    characters the pair scores 0.0 (treated as noise). Otherwise the ROUGE-L
    F-measure is used when a scorer is supplied, and the case-insensitive
    `difflib.SequenceMatcher` ratio when it is not, or when ROUGE scoring fails.

    Args:
        q1: First query.
        q2: Second query.
        scorer: Optional object exposing `score(target, prediction)` that
            returns a mapping with a "rougeL" entry carrying `.fmeasure`
            (e.g. a `rouge_scorer.RougeScorer(["rougeL"])`).

    Returns:
        The similarity score as a float.
    """
    # Simple case: identical strings
    if q1 == q2:
        return 1.0
    # Avoid calculating similarity for very short strings (likely noise)
    if len(q1) < 5 or len(q2) < 5:
        return 0.0

    if scorer is not None:
        try:
            # Use the public scoring API. The LCS routine in rouge_score is a
            # private module-level function, not a scorer method, so calling
            # it on the instance always failed and silently yielded 0.0.
            return scorer.score(q1, q2)["rougeL"].fmeasure
        except Exception as e:
            # A scoring failure must not read as "completely dissimilar";
            # report it and use the basic measure instead.
            print(f"Warning: ROUGE scoring failed ({e}). Falling back to SequenceMatcher.")

    return SequenceMatcher(None, q1.lower(), q2.lower()).ratio()


def filter_and_score_qs(
    candidate_qs: List[str],
    existing_query_objects: List[Dict[str, Any]], # Pass the full objects
    similarity_threshold: float = 0.7,
    top_n_similar: int = 5
) -> List[Dict[str, Any]]:
    """Keep only candidates that are sufficiently different from known queries.

    Each candidate is compared with every stored query and with every
    candidate already accepted during this call. Case-insensitive duplicates
    are dropped, and so are candidates whose highest similarity exceeds
    `similarity_threshold`. Accepted candidates are returned as dicts holding
    the query text, its max and average similarity (computed over scores
    above 0.1 only) and its `top_n_similar` closest known queries.
    """
    # The pool starts with the stored queries and grows with every acceptance.
    comparison_pool = [entry['q'] for entry in existing_query_objects]

    if not candidate_qs:
        return []

    # ROUGE scorer when the library is importable, otherwise None.
    scorer = None
    if ROUGE_AVAILABLE:
        try:
            scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
        except Exception as e:
            print(f"Error initializing RougeScorer: {e}. Falling back.")
            scorer = None

    accepted = []
    print(f"--- Filtering {len(candidate_qs)} candidates for diversity against {len(comparison_pool)} existing queries ---")
    for candidate in candidate_qs:
        folded = candidate.lower()

        # Case-insensitive exact duplicates are discarded outright.
        if any(known.lower() == folded for known in comparison_pool):
            continue

        # Score against the whole pool, keeping only non-trivial similarities.
        all_pairs = (
            (calculate_similarity(candidate, known, scorer), known)
            for known in comparison_pool
        )
        scored_pairs = [pair for pair in all_pairs if pair[0] > 0.1]

        peak = 0.0
        mean = 0.0
        closest = {}
        if scored_pairs:
            values = [value for value, _ in scored_pairs]
            peak = max(values)
            mean = sum(values) / len(values)

            # Record the closest neighbours with rounded scores.
            ranked = heapq.nlargest(top_n_similar, scored_pairs, key=lambda pair: pair[0])
            for value, known in ranked:
                closest[known] = round(value, 4)

            # The threshold only applies when something was actually scored.
            if peak > similarity_threshold:
                continue

        accepted.append({
            "q": candidate,
            "max_similarity_score": round(peak, 4),
            "avg_similarity_score": round(mean, 4),
            "most_similar_queries": closest
        })
        # Later candidates in this batch are compared with this one as well.
        comparison_pool.append(candidate)

    print(f"--- Accepted {len(accepted)} new diverse query objects this round ---")
    return accepted


# --- Main Execution Logic ---
# Script entry point: runs NUM_GENERATION_TURNS turns; each turn calls the LLM
# up to MAX_ATTEMPTS_PER_TURN times to add QUERIES_TO_GENERATE_PER_TURN new
# diverse queries, saves the accumulated list after every turn, and finally
# prints a summary of all query objects.
if __name__ == "__main__":

    accepted_query_objects = load_queries_with_scores(QUERIES_FILENAME)
    initial_query_count = len(accepted_query_objects)
    # Only used for the informational message below.
    overall_target = initial_query_count + (NUM_GENERATION_TURNS * QUERIES_TO_GENERATE_PER_TURN)

    print(f"Starting Generation Process.")
    print(f"Initial query objects loaded: {initial_query_count}")
    print(f"Targeting {QUERIES_TO_GENERATE_PER_TURN} new queries per turn for {NUM_GENERATION_TURNS} turns.")
    print(f"Overall target: {overall_target} query objects.")
    print(f"Using Similarity Threshold: {SIMILARITY_THRESHOLD}")
    print("-" * 30)

    total_added_this_session = 0

    for turn in range(1, NUM_GENERATION_TURNS + 1):
        print(f"\n=== Turn {turn}/{NUM_GENERATION_TURNS} ===")
        target_for_this_turn = QUERIES_TO_GENERATE_PER_TURN
        added_in_this_turn = 0
        attempts_this_turn = 0

        # Extract current query strings for prompting
        current_qs_list = [obj['q'] for obj in accepted_query_objects]

        # Keep asking until the turn's quota is met or the attempt budget runs out.
        while added_in_this_turn < target_for_this_turn and attempts_this_turn < MAX_ATTEMPTS_PER_TURN:
            attempts_this_turn += 1
            print(f"\n--- Turn {turn} | Attempt {attempts_this_turn}/{MAX_ATTEMPTS_PER_TURN} ---")
            print(f"Current total query objects: {len(accepted_query_objects)}")
            print(f"Goal for this turn: {added_in_this_turn}/{target_for_this_turn} new queries")

            # Request a few more than still needed (some will be filtered out),
            # capped at the per-request batch size.
            num_needed_for_turn = target_for_this_turn - added_in_this_turn
            num_to_generate_this_attempt = min(REQUEST_BATCH_SIZE_PER_TURN, num_needed_for_turn + 5)

            # Generate candidate query strings
            candidate_qs_strings = generate_candidate_qs_with_llm(
                tool_schemas_json,
                num_to_generate=num_to_generate_this_attempt,
                existing_qs_list=current_qs_list, # Pass only strings for prompt
                llm_client=client
            )

            # An empty result still consumes one attempt of the turn's budget.
            if not candidate_qs_strings:
                print("LLM did not return any valid candidate queries or an error occurred. Retrying after delay...")
                time.sleep(5)
                continue

            # Filter candidates and create new query objects with scores
            # Pass the full list of objects for accurate similarity calculation context
            new_query_objects = filter_and_score_qs(
                candidate_qs_strings,
                accepted_query_objects, # Compare against all accepted objects so far
                similarity_threshold=SIMILARITY_THRESHOLD,
                top_n_similar=TOP_N_SIMILAR
            )

            # Add the newly accepted query objects; anything beyond the turn's
            # quota is discarded.
            added_now = 0
            for obj in new_query_objects:
                if added_in_this_turn < target_for_this_turn:
                    accepted_query_objects.append(obj)
                    # Update the list used ONLY for prompting in the next attempt/turn
                    current_qs_list.append(obj['q'])
                    added_in_this_turn += 1
                    added_now += 1
                else:
                    break

            print(f"Accepted {added_now} new diverse query objects in this attempt.")

            # The warning is only shown from the second attempt of a turn onwards.
            if added_now == 0 and attempts_this_turn > 1:
                print("Warning: No new diverse queries accepted in this attempt.")

            if added_in_this_turn >= target_for_this_turn:
                print(f"--- Turn {turn} goal reached ({added_in_this_turn} new queries added). ---")
                break

            time.sleep(1) # Small delay between attempts

        # Save after each turn
        total_added_this_session += added_in_this_turn
        save_queries_with_scores(accepted_query_objects, QUERIES_FILENAME)
        print(f"--- End of Turn {turn}. Total query objects now: {len(accepted_query_objects)}. Added this turn: {added_in_this_turn}. ---")


    print("-" * 30)
    print(f"Generation process completed after {NUM_GENERATION_TURNS} turns.")
    print(f"Total query objects generated or loaded in this session: {len(accepted_query_objects)}")
    print(f"Total new query objects added in this session: {total_added_this_session}")
    print(f"Final results saved to {QUERIES_FILENAME}")

    print("\nFinal list of diverse query objects (showing last added):")
    # Objects added in this session sit at the end of the list; they are
    # flagged with "*" in the listing below.
    start_index = max(0, len(accepted_query_objects) - total_added_this_session)
    for i, obj in enumerate(accepted_query_objects):
         marker = "*" if i >= start_index else " "
         similar_dict = obj.get('most_similar_queries', {})
         # Format similar queries for concise printing: the first 30 characters
         # of each query, always followed by "...", and its score.
         similar_items = [f"'{q[:30]}...':{s:.2f}" for q, s in similar_dict.items()]
         similar_str = ", ".join(similar_items) if similar_items else "{}"

         print(f"{marker} {i+1}. q: \"{obj['q']}\" (MaxS: {obj.get('max_similarity_score', 0):.3f}, AvgS: {obj.get('avg_similarity_score', 0):.3f}, TopSim: {similar_str})")


    # --- Placeholder for Step 2 (remains the same concept) ---
    print("\n--- Placeholder for Step 2: Generating Full Blueprints ---")
    # Iterate through accepted_query_objects, take 'q', generate full Blueprint
    # ...


No existing query file found (diverse_queries_with_scores.json). Starting fresh.
Starting Generation Process.
Initial query objects loaded: 0
Targeting 10 new queries per turn for 3 turns.
Overall target: 30 query objects.
Using Similarity Threshold: 0.7
------------------------------

=== Turn 1/3 ===

--- Turn 1 | Attempt 1/5 ---
Current total query objects: 0
Goal for this turn: 0/10 new queries
--- Calling LLM (deepseek-r1) to generate ~15 queries ---
--- LLM Response Received ---
--- Lines after removing <think> blocks: 16 ---
--- Parsed 15 potentially valid candidate queries ---
--- Filtering 15 candidates for diversity against 0 existing queries ---
--- Accepted 15 new diverse query objects this round ---
Accepted 10 new diverse query objects in this attempt.
--- Turn 1 goal reached (10 new queries added). ---
Saved 10 query objects to diverse_queries_with_scores.json
--- End of Turn 1. Total query objects now: 10. Added this turn: 10. ---

=== Turn 2/3 ===

--- Turn 2 | Attempt


**설명:**

1.  **도구 및 스키마:** 기존 코드를 사용하여 도구(`tools`)와 해당 JSON 스키마(`tool_schemas`, `tool_schemas_json`)를 정의합니다.
2.  **LLM 호출:** `generate_candidate_qs_with_llm` 함수는 OpenAI 호환 클라이언트(`client`)를 통해 Friendli AI 서버리스 엔드포인트의 `deepseek-r1` 모델을 실제로 호출합니다. 다른 LLM 라이브러리(예: OpenAI, Anthropic, Hugging Face 등)를 사용하려면 이 부분의 API 호출 코드를 바꿔야 합니다. 응답에 포함된 `<think>...</think>` 블록은 `remove_think_blocks`로 제거한 뒤, `is_valid_query_line`으로 쿼리가 아닌 줄을 걸러냅니다.
3.  **`generate_candidate_qs_with_llm` 함수:**
    * LLM에게 다양한 `q`를 생성하도록 요청하는 시스템 및 사용자 프롬프트를 구성합니다.
    * `tool_schemas_json`을 컨텍스트로 제공합니다.
    * (선택 사항) 이미 생성된 `q` 샘플을 프롬프트에 포함하여 LLM이 유사한 것을 피하도록 유도합니다.
    * LLM 응답(한 줄에 하나의 쿼리로 가정)을 파싱하여 후보 `q` 목록을 반환합니다.
4.  **`filter_and_score_qs` 함수:**
    * `rouge-score` 라이브러리가 있으면 ROUGE-L 점수를 사용하여 Alpaca와 유사하게 유사성을 계산합니다.
    * 라이브러리가 없으면 Python 내장 `difflib.SequenceMatcher`를 사용하여 기본적인 문자열 유사성 비율을 계산합니다 (덜 정교함).
    * 각 후보 `q`를 이전에 수락된 모든 `q`와 비교합니다.
    * 최대 유사성 점수가 설정된 임계값(`similarity_threshold`) 이하이면 해당 `q`를 수락하고, 최대/평균 유사성 점수와 가장 유사한 상위 쿼리들을 함께 기록합니다.
    * 정확히 동일한 쿼리(대소문자 무시)는 먼저 필터링합니다.
5.  **메인 실행 로직:**
    * 턴 수(`NUM_GENERATION_TURNS`), 턴당 목표 쿼리 수(`QUERIES_TO_GENERATE_PER_TURN`), LLM 호출당 요청할 후보 수의 상한(`REQUEST_BATCH_SIZE_PER_TURN`), 유사성 임계값(`SIMILARITY_THRESHOLD`) 등을 설정합니다.
    * 루프를 돌면서 `generate_candidate_qs_with_llm`으로 후보를 생성하고 `filter_and_score_qs`로 필터링하여 `accepted_query_objects` 목록을 채우며, 각 턴이 끝날 때마다 `QUERIES_FILENAME` 파일에 저장합니다.
    * 무한 루프를 방지하기 위해 턴당 최대 시도 횟수(`MAX_ATTEMPTS_PER_TURN`)를 설정합니다.
    * 최종적으로 생성된 다양한 `q` 목록을 출력합니다.
6.  **다음 단계:** 주석 처리된 부분처럼, 이 코드에서 생성된 `accepted_query_objects` 목록의 각 객체에서 `q`를 꺼내 원래의 `Blueprint` 생성 로직(예: `structuredOutput` 함수)에 입력으로 사용하여 `a_gt_steps`와 `o_gt`를 포함한 완전한 JSON 객체를 만들 수 있습니다.

**실행 방법:**

1.  `function_schema.py` 파일이 있는지 확인합니다.
2.  (선택 사항) `pip install rouge-score` 를 실행하여 더 나은 필터링을 사용합니다.
3.  `FRIENDLI_TOKEN` 환경 변수에 Friendli AI API 토큰을 설정합니다. (다른 LLM 제공자를 사용하려면 `client` 설정과 `LLM_MODEL_NAME`을 해당 API에 맞게 교체합니다.)
4.  스크립트를 실행합니다.

이제 이 코드를 사용하여 주어진 도구 세트에 대해 다양한 사용자 질문(`q`)을 생성하고, 이를 바탕으로 최종 `Blueprint` 데이터를 구축할 수 있습니다.