# In [None]:
import json
import os
from transformers import AutoTokenizer

# --- CONFIGURATION ---

# Folder scanned for input files; every entry whose name ends in ".json" is processed.
folder_path = "./"

# Tokenizer used for all token-length measurements, loaded from its pretrained
# model identifier. Change the identifier to measure with a different model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

# --- FUNCTION DEFINITIONS ---

def extract_response(text):
    """
    Strip the echoed prompt from a model output and return the response part.

    The marker phrase 'Solve step by step' decides where the response begins:

    * marker appears two or more times -> everything from its LAST occurrence on
      (the prompt echoes the phrase, the final one opens the actual answer);
    * marker appears exactly once -> everything from the marker on, but only
      when a 'Question:' label precedes it;
    * otherwise -> the text is returned unchanged.
    """
    marker = "Solve step by step"
    hits = text.count(marker)

    # No marker at all: nothing to cut.
    if hits == 0:
        return text

    # Repeated marker: the last one starts the response.
    if hits >= 2:
        return text[text.rfind(marker):]

    # Single marker: accept it only if it comes after the first "Question:" label.
    _, label, tail = text.partition("Question:")
    if label:
        _, found, answer = tail.partition(marker)
        if found:
            return marker + answer

    # Ambiguous layout: keep the full text.
    return text

def measure_token_length(text):
    """
    Return how many tokens `text` encodes to with the module-level tokenizer.

    Special tokens are not added, so only the tokens of the text itself are counted.
    """
    # A plain list of token ids is all that is needed to count; requesting "pt"
    # tensors would make PyTorch a hard requirement just to read a length.
    encoding = tokenizer(text, add_special_tokens=False)
    return len(encoding["input_ids"])

# --- MAIN LOGIC ---

# Token length of every measured response across all files (feeds the overall average).
response_lengths = []
# Per-file summary statistics, keyed by filename.
file_report = {}

# os.listdir returns entries in arbitrary order; sort so the report is reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The folder may contain unrelated JSON files; only a list of records can be measured.
    if not isinstance(data, list):
        print(f"Skipping {filename}: expected a JSON list of records")
        continue

    file_lengths = []
    skipped = 0

    for item in data:
        model_output = item.get("model_output") if isinstance(item, dict) else None
        if not isinstance(model_output, str):
            # Malformed record: leave it out of the statistics instead of aborting the run.
            skipped += 1
            continue

        # Extract only the response part, then measure its token length.
        response = extract_response(model_output)
        file_lengths.append(measure_token_length(response))

    if skipped:
        print(f"Warning: {filename}: skipped {skipped} record(s) without a string 'model_output'")

    total_tokens = sum(file_lengths)
    count = len(file_lengths)
    response_lengths.extend(file_lengths)

    file_report[filename] = {
        "total_items": count,
        "total_response_tokens": total_tokens,
        # Guard against files with no measurable records.
        "avg_tokens_per_response": total_tokens / count if count > 0 else 0,
    }

# --- PRINT FINAL REPORT ---

# Collect the whole report first, then emit it with a single print call.
report_lines = ["\n--- Token Length Report ---\n"]

for filename, stats in file_report.items():
    report_lines.extend(
        [
            f"File: {filename}",
            f"  Total Items: {stats['total_items']}",
            f"  Total Response Tokens: {stats['total_response_tokens']}",
            f"  Average Tokens per Response: {stats['avg_tokens_per_response']:.2f}\n",
        ]
    )

# Mean over every individual response, regardless of which file it came from.
if response_lengths:
    overall_avg = sum(response_lengths) / len(response_lengths)
else:
    overall_avg = 0
report_lines.append(f"Overall Average Tokens per Response: {overall_avg:.2f}")

print("\n".join(report_lines))
