# In [None]:  (Jupyter cell prompt left over from the notebook export)
import json
import time
import random
import numpy as np
import os
from datetime import datetime
from openai import OpenAI

# Initialize the API client.
# The key is read from the OPENAI_API_KEY environment variable when it is set,
# so the secret does not have to be written into the source; the original
# placeholder is kept as the fallback value.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR KEY HERE"))

# Model settings
model = "o3-2025-04-16"

# Set the specific transformation type we want to test.
# Supported values: ['succ', 'pred', 'add_letter', 'remove_redundant', 'counting']
transform_type = 'succ'

# Load the dataset (explicit encoding so the result does not depend on the
# platform's default locale encoding)
with open('alphabet_dataset.json', 'r', encoding='utf-8') as f:
    full_dataset = json.load(f)

print(f"Loaded dataset with {len(full_dataset)} items")

if not full_dataset:
    raise ValueError("alphabet_dataset.json contains no items to sample from")

# Randomly select items from the dataset (sampling without replacement).
# random.sample raises ValueError when asked for more items than exist, so the
# trial count is capped at the dataset size.
num_trials = 400
if num_trials > len(full_dataset):
    print(f"Warning: requested {num_trials} trials but the dataset only has "
          f"{len(full_dataset)} items; using all of them")
    num_trials = len(full_dataset)
selected_indices = random.sample(range(len(full_dataset)), num_trials)
selected_items = [full_dataset[i] for i in selected_indices]

# Function to format a letter sequence for display
def format_sequence(seq):
    """Render a dataset value as the text shown in prompts and logs.

    Args:
        seq: A list of items (rendered as "[a b c]"), a string (returned
            unchanged), or any other value (rendered with ``str``).

    Returns:
        str: The display form of ``seq``.
    """
    if isinstance(seq, list):
        # Stringify each element so lists holding non-string items
        # (e.g. integers) do not make str.join raise TypeError.
        return "[" + " ".join(str(element) for element in seq) + "]"
    if isinstance(seq, str):
        return seq
    return str(seq)

# Correctness checking
# Characters that only decorate an answer (list brackets, quotes, markdown
# emphasis / code ticks) and are removed before comparing.
_ANSWER_NOISE = str.maketrans('', '', '[]"\'*`')


def _normalize_answer(text):
    """Lower-case text and strip decoration, outer whitespace and trailing periods."""
    return text.translate(_ANSWER_NOISE).strip().rstrip('.').strip().lower()


def _extract_final_answer(response):
    """Return the normalized first non-empty line after the last usable "answer:" marker, or None."""
    segments = response.lower().split("answer:")[1:]
    # The prompt asks for the answer at the END of the response, so later
    # markers take precedence over any earlier mention of "answer:".
    for segment in reversed(segments):
        for line in segment.splitlines():
            candidate = _normalize_answer(line)
            if candidate:
                return candidate
    return None


def check_correctness(response, expected_str, transform_type, input_str=None):
    """
    Check if the model's response contains the correct answer by looking for the
    "Answer: " pattern and comparing the provided answer with the expected output.

    The answer is the first non-empty line after the last "answer:" marker
    (case-insensitive) that is followed by any content. Brackets, quotes,
    markdown emphasis/backticks, a trailing period, letter case and all
    whitespace are ignored in the comparison.

    Args:
        response (str): The full response from the model
        expected_str (str): The expected output string
        transform_type (str): The type of transformation applied. For
            'counting' with a numeric expected value, the answer is correct
            when the expected number appears as a whole number in the answer
            line; every other case requires an exact match after normalization.
        input_str (str, optional): The original input string (unused; kept
            for interface compatibility)

    Returns:
        bool: True if the expected output matches the answer, False otherwise
    """
    import re  # local import: this file has no module-level import of re

    # Handle empty responses
    if not response or not expected_str:
        return False

    clean_answer = _extract_final_answer(response)

    # If we couldn't extract an answer using the marker, return False
    if not clean_answer:
        return False

    clean_expected = _normalize_answer(str(expected_str))

    # For counting, match whole numbers so that "15" does not count as "5"
    if transform_type == 'counting' and clean_expected.isdigit():
        return clean_expected in re.findall(r'\d+', clean_answer)

    # All other transformations: exact match once whitespace is removed, so
    # "a b d", "[a b d]" and "abd" are treated as the same answer.
    return ''.join(clean_answer.split()) == ''.join(clean_expected.split())



# Prompt type 1 (active): full explanation.
# Maps each transformation key to the text that is interpolated into the user
# prompt: the transformation's name, a one-sentence rule, and three worked
# examples. The key used for a run is selected by `transform_type`.
transformation_explanations = {
    'succ': "Successorship (succ): The last letter changes to the next letter in the alphabet (e.g., abb -> abc; moose -> moosf; vwxyz -> vwxya)",
    'pred': "Predecessorship (pred): The first letter changes to the previous letter in the alphabet (e.g., abb -> zbb; moose -> loose; vwxyz -> uwxyz)",
    'add_letter': "Adding a letter (add_letter): The next letter in the alphabet after the last letter is added to the end of the sequence (e.g., abb -> abbc; moose -> moosef; vwxyz -> vwxyza)",
    'remove_redundant': "Removing redundant character (remove_redundant): If there are duplicate letters, the first duplicate found is removed (e.g., abb -> ab; moose -> mose; vwxyz -> vwxyz)",
    'counting': "Counting (counting): Count the number of letters in the sequence (e.g., abb -> 3; moose -> 5; vwxyz -> 5)"
}
# Prompt type 2 (inactive): examples only, without the rule description.
# The alternative dict below sits inside a bare triple-quoted string, so it is
# never assigned. To run with it, remove the surrounding quotes so that it
# rebinds `transformation_explanations`.
# Note: its third remove_redundant example uses the input 'vwxwyz', whereas
# prompt type 1 uses 'vwxyz'.
'''
transformation_explanations = {
    'succ': "Successorship:  e.g., abb -> abc; moose -> moosf; vwxyz -> vwxya",
    'pred': "Predecessorship: e.g., abb -> zbb; moose -> loose; vwxyz -> uwxyz",
    'add_letter': "Adding letter: e.g., abb -> abbc; moose -> moosef; vwxyz -> vwxyza",
    'remove_redundant': "Removing redundant: e.g., abb -> ab; moose -> mose; vwxwyz -> vwxyz",
    'counting': "Counting: e.g., abb -> 3; moose -> 5; vwxyz -> 5"
}'''

# One dict per trial is appended here; the list is written out at the end.
all_results = []

# Running tallies used for the final summary
correct_answers = 0
incorrect_answers = 0

# Make sure the output directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

# Evaluate every sampled item with its own, history-free API request
print(f"\nTesting '{transform_type}' transformation on {num_trials} items (each independently)...")

for trial_num, item in enumerate(selected_items, start=1):
    # Raw dataset values and their display forms
    input_data = item['input']
    expected_output = item['transformations'][transform_type]
    input_str = format_sequence(input_data)
    expected_str = format_sequence(expected_output)

    print(f"\n----- Trial {trial_num}/{num_trials} -----")
    print(f"Input: {input_str}")
    print(f"Expected output: {expected_str}")

    # Instruction block (transformation description + answer format) followed
    # by the concrete task; no conversation history is carried between trials.
    user_prompt = f"Apply the following transformation:\n\n{transformation_explanations[transform_type]}\n\nAt the end of your response, please provide your final answer in this exact format: Answer: [transformed result]"
    current_task = f"Transform the following: {input_str}"
    messages = [
        {"role": "user", "content": "\n\n".join((user_prompt, current_task))}
    ]

    # Record for this trial; the outcome fields are filled in below
    result = {
        "trial_num": trial_num,
        "input": input_data,
        "expected_output": expected_output,
        "input_str": input_str,
        "expected_str": expected_str,
        "transform_type": transform_type,
        "is_duplicated": item.get('has_duplicates', False),
        "data_type": "sequence" if not isinstance(input_data, str) else "word",
        "timestamp": datetime.now().isoformat()
    }

    # Everything from the request to the tally update stays inside the try, so
    # any failure along the way is recorded as an error and counted as incorrect.
    try:
        print("Calling API...")
        start_time = time.time()
        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            # max_completion_tokens is used here in place of max_tokens
            max_completion_tokens=1000,
            temperature=1,
        )
        response_time = time.time() - start_time

        response = completion.choices[0].message.content
        print(f"AI Response: {response}")

        # Grade the reply against the expected output
        is_correct = check_correctness(response, expected_str, transform_type, input_str)

        result.update(
            response=response,
            response_time=response_time,
            is_correct=is_correct,
        )

        if not is_correct:
            print(f"✗ Incorrect. The correct answer is: {expected_str}")
            incorrect_answers += 1
        else:
            print("✓ Correct!")
            correct_answers += 1

    except Exception as e:
        error_msg = str(e)
        print(f"Error: {error_msg}")
        result.update(error=error_msg, is_correct=False)
        incorrect_answers += 1

    all_results.append(result)

    # Brief pause between API calls to avoid rate limiting
    time.sleep(1.5)

# ---- Summary on stdout ----
print("\n===== Results =====")
print(f"Total trials: {num_trials}")
correct_pct = correct_answers / num_trials * 100
print(f"Correct answers: {correct_answers} ({correct_pct:.1f}%)")
incorrect_pct = incorrect_answers / num_trials * 100
print(f"Incorrect answers: {incorrect_answers} ({incorrect_pct:.1f}%)")

# ---- Output paths: both files share one timestamped stem ----
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_stem = f"{results_dir}/openaio3_full_explanation_{transform_type}_{timestamp}"
json_filename = output_stem + ".json"
npz_filename = output_stem + ".npz"

# ---- Full per-trial records as JSON ----
with open(json_filename, 'w') as json_file:
    json.dump(all_results, json_file, indent=2)
print(f"Results saved to {json_filename}")

# ---- Key metrics as parallel NumPy arrays (one entry per trial) ----
# Boolean-like fields are stored as 0/1; trials that failed before a response
# time was recorded get a response time of 0.
npz_payload = {
    "trial_nums": np.array([record["trial_num"] for record in all_results]),
    "correctness": np.array([int(bool(record["is_correct"])) for record in all_results]),
    "response_times": np.array([record.get("response_time", 0) for record in all_results]),
    # 1 for word, 0 for sequence
    "data_types": np.array([int(record["data_type"] == "word") for record in all_results]),
    "has_duplicates": np.array([int(bool(record["is_duplicated"])) for record in all_results]),
    "transform_type": transform_type,
    "accuracy": correct_answers / num_trials,
    "timestamp": timestamp,
}
np.savez(npz_filename, **npz_payload)
print(f"Results also saved to {npz_filename}")

print("\nDone testing!")