# In [None]:  (Jupyter cell marker left over from the notebook export)
import json
import time
import random
import numpy as np
import os
import csv
from datetime import datetime
from openai import OpenAI

# Initialize the API client.
# The key is taken from the OPENAI_API_KEY environment variable so a real
# secret never has to be written into (and committed with) this file. The
# original placeholder is kept as the fallback for backward compatibility.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR KEY HERE"))

# Model settings
model = "gpt-4o"  # Model name passed to client.chat.completions.create

# Meta-rule configuration
# Words that trigger a rule switch; their position (1-5) selects the rule.
ORDINAL_WORDS = ["first", "second", "third", "fourth", "fifth"]
# Passed to manage_context() as max_size: the cap on the number of stored
# chat messages, counting the system message.
MAX_CONTEXT_SIZE = 3

# Question types, one per game rule. Each entry of the table below is
# (key, display name, question wording, CSV answer column); the rule number
# is the entry's 1-based position in the table.
QUESTION_TYPES = {
    key: {
        'name': label,
        'question': wording,
        'column': csv_column,
        'rule_number': rule_no,
    }
    for rule_no, (key, label, wording, csv_column) in enumerate(
        (
            ('cat', 'Category',
             "What category does this belong to?",
             'cat_answer'),
            ('func', 'Function',
             "What function does this serve or what does it do?",
             'Func_answer'),
            ('ant', 'Antonym',
             "What is the opposite of this?",
             'ant_answer'),
            ('syn', 'Synonym',
             "What is similar to this?",
             'syn_answer'),
            ('comp', 'Compositional',
             "What larger structure contains this item, what is it composed of, or what is it a key ingredient of?",
             'comp_answer'),
        ),
        start=1,
    )
}

# Optional JSON file with the answers for the ordinal words under each rule.
# load_meta_answers() falls back to built-in defaults when it does not exist.
META_ANSWERS_FILE = 'meta_rule_answers.json'

def create_system_prompt():
    """Build the system message: the five rule explanations, the base rule,
    the ordinal-number meta-rule and the answer-format instruction.

    Returns:
        The prompt as a single newline-separated string.
    """
    rule_lines = (
        '**Rule Explanations:**',
        '**Rule 1 (Category):** What broader category does the term belong to? (e.g., Moose -> Animal; Persimmon -> Fruit; Stethoscope -> Medical Device)',
        '**Rule 2 (Function):** What action or purpose is associated with the term? (e.g., Moose -> Graze; Persimmon -> Eat; Stethoscope -> Listen)',
        '**Rule 3 (Antonym):** What is the opposite or contrasting concept? (e.g., Moose -> Mouse; Persimmon -> Vinegar; Stethoscope -> Earplug)',
        '**Rule 4 (Synonym):** What is similar to or can substitute for the term? (e.g., Moose -> Elk; Persimmon -> Kaki; Stethoscope -> Phonendoscope)',
        '**Rule 5 (Compositional):** What larger structure contains this item, what is it composed of, or what is it a key ingredient of? (e.g., Moose -> Herd; Persimmon -> Tart; Stethoscope -> Hospital)',
    )
    meta_lines = (
        'The base rule of the game is rule 1. Unless otherwise specified, this is the rule you apply.',
        '**Base Rule:** Use **Rule 1** as the default unless instructed otherwise by the meta-rules.',
        '**Meta rule 1**: When you encounter an **ordinal number**, **switch the base rule** to the corresponding numbered rule ("first" apply rule 1, "second" apply rule 2, "third" apply rule 3, "fourth" apply rule 4, "fifth" apply rule 5). Apply this new rule from that point onward until another ordinal number appears, at which point you update the base rule again.',
    )
    answer_format = 'Answer with the letter of the correct option (A, B, C, D, or E).'

    # The three sections are separated by one blank line each.
    sections = ("\n".join(rule_lines), "\n".join(meta_lines), answer_format)
    return "\n\n".join(sections)

def load_meta_answers():
    """Return the answer table for the ordinal words.

    The table maps each ordinal word to a dict keyed by rule type
    ('cat', 'func', 'ant', 'syn', 'comp'). It is read from the JSON file
    named by META_ANSWERS_FILE; when that file does not exist a built-in
    default table is returned instead. Any other error (unreadable file,
    invalid JSON) propagates to the caller.
    """
    builtin_answers = {
        "first": {"cat": "Ordinal", "func": "Initialize", "ant": "Last", "syn": "Initial", "comp": "Sequence"},
        "second": {"cat": "Ordinal", "func": "Follow", "ant": "First", "syn": "Next", "comp": "Sequence"},
        "third": {"cat": "Ordinal", "func": "Continue", "ant": "Second", "syn": "Another", "comp": "Series"},
        "fourth": {"cat": "Ordinal", "func": "Proceed", "ant": "Third", "syn": "Following", "comp": "Order"},
        "fifth": {"cat": "Ordinal", "func": "Complete", "ant": "Fourth", "syn": "Final", "comp": "Progression"},
    }

    try:
        handle = open(META_ANSWERS_FILE, 'r')
    except FileNotFoundError:
        # No user-supplied file: use the defaults above
        return builtin_answers

    with handle:
        return json.load(handle)

def load_data(file_path):
    """Load the analogy items from a semicolon-delimited CSV file.

    The file is decoded as Latin-1. A byte-order mark is removed from the
    column names both in its Unicode form and in the three-character form a
    UTF-8 BOM takes when decoded as Latin-1; without the latter, a UTF-8 file
    with BOM would yield a first column name other than 'term' and every row
    would be silently discarded.

    Args:
        file_path: Path of the CSV file. It needs a 'term' column.

    Returns:
        A list of dicts (column name -> cell text), one per row whose 'term'
        cell is non-blank. Cells beyond the header width are dropped; cells
        missing from a short row are None, as produced by csv.DictReader.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            printed and then re-raised unchanged.
    """
    # '\ufeff' is a decoded BOM; the second entry is a UTF-8 BOM read as Latin-1
    bom_markers = ('\ufeff', '\u00ef\u00bb\u00bf')

    data = []
    try:
        # newline='' lets the csv module do its own line-ending handling
        with open(file_path, 'r', encoding='latin1', newline='') as f:
            reader = csv.DictReader(f, delimiter=';')
            for row in reader:
                clean_row = {}
                for key, value in row.items():
                    if key is None:
                        # DictReader files surplus cells under the key None
                        continue
                    clean_key = key
                    for marker in bom_markers:
                        clean_key = clean_key.replace(marker, '')
                    clean_row[clean_key] = value

                # 'term' may be absent or None (short row); treat both as blank
                if (clean_row.get('term') or '').strip():
                    data.append(clean_row)
        print(f"Successfully loaded {len(data)} items from CSV")
    except Exception as e:
        print(f"Error reading CSV: {e}")
        raise
    return data

def insert_ordinal_word(term, probability=0.1):
    """Occasionally swap the whole term for one random ordinal word.

    Args:
        term: The original term.
        probability: Chance of the swap; the default is 0.1 (10%).

    Returns:
        A (term, inserted_words) tuple: the untouched term with an empty
        list, or the chosen ordinal word with a one-element list holding it.
    """
    keep_original = random.random() > probability
    if keep_original:
        return term, []

    # Exactly one ordinal word replaces the entire term
    replacement = random.choice(ORDINAL_WORDS)
    return replacement, [replacement]

def determine_active_rule(inserted_words, current_rule):
    """Pick the rule that is active after seeing the given ordinal words.

    Args:
        inserted_words: Ordinal words found in the current term (may be
            empty or None).
        current_rule: Rule key in force before this term.

    Returns:
        The rule key mapped to the last word in inserted_words, or
        current_rule when there are no words or the last word is unknown.
    """
    # "first".."fifth" select rules 1..5 in this order
    ordinal_to_rule = dict(zip(
        ("first", "second", "third", "fourth", "fifth"),
        ("cat", "func", "ant", "syn", "comp"),
    ))

    if inserted_words:
        latest_word = inserted_words[-1]
        return ordinal_to_rule.get(latest_word, current_rule)
    return current_rule

def get_correct_answer_for_ordinal(word, rule_type, meta_answers):
    """Look up the expected answer for an ordinal word under one rule.

    Args:
        word: The ordinal word, e.g. "third".
        rule_type: Rule key, e.g. 'cat' or 'ant'.
        meta_answers: Mapping of ordinal word -> {rule key -> answer}.

    Returns:
        The answer from meta_answers when it has an entry for this word and
        rule; otherwise a generic per-word fallback, or the capitalized word
        itself when even that is unknown.
    """
    if word not in meta_answers or rule_type not in meta_answers[word]:
        # No specific answer available: fall back to a rule-independent one
        generic_answers = {
            "first": "Beginning",
            "second": "Following",
            "third": "Middle",
            "fourth": "Continuing",
            "fifth": "Ending",
        }
        capitalized = word.capitalize()
        if word in generic_answers:
            return generic_answers[word]
        return capitalized

    return meta_answers[word][rule_type]

def prepare_question(item, q_type, meta_answers, inserted_words=None):
    """Build one multiple-choice question with shuffled options.

    For a regular term the five options are the item's five answer columns
    and the correct one is the column belonging to q_type. When
    inserted_words is non-empty the term is an ordinal word: the options are
    the meta-rule answers of those words (topped up with the item's regular
    answers and, if still short, "OptionN" placeholders) and the correct one
    is the answer of the last ordinal word under q_type.

    Args:
        item: Row dict with 'term', 'cat_answer', 'Func_answer',
            'ant_answer', 'syn_answer' and 'comp_answer'.
        q_type: Key into QUESTION_TYPES naming the active rule.
        meta_answers: Ordinal-word answer table, see load_meta_answers().
        inserted_words: Ordinal words present in the term, or None/empty.

    Returns:
        A dict with the term, the question metadata taken from
        QUESTION_TYPES, the shuffled 'answers', the 'correct_answer', its
        position 'correct_index', and the ordinal bookkeeping fields
        'has_ordinal' and 'inserted_words'.
    """
    # The prompts only offer the letters A-E
    option_count = 5

    if inserted_words:
        # The last ordinal word decides which answer is the correct one
        correct_answer = get_correct_answer_for_ordinal(inserted_words[-1], q_type, meta_answers)

        # Collect the distinct answers of every ordinal word under every rule
        all_answers = []
        for word in inserted_words:
            for rule in ['cat', 'func', 'ant', 'syn', 'comp']:
                answer = get_correct_answer_for_ordinal(word, rule, meta_answers)
                if answer not in all_answers:
                    all_answers.append(answer)

        # Top up with the item's regular answers if there is room left
        regular_answers = [
            item['cat_answer'].strip(),
            item['Func_answer'].strip(),
            item['ant_answer'].strip(),
            item['syn_answer'].strip(),
            item['comp_answer'].strip()
        ]

        for ans in regular_answers:
            if ans not in all_answers and len(all_answers) < option_count:
                all_answers.append(ans)

        # Pad with placeholders when there are still too few options
        while len(all_answers) < option_count:
            all_answers.append(f"Option{len(all_answers)+1}")

        # Several ordinal words can produce more than five candidates; trim
        # so the options never run past the letter E
        del all_answers[option_count:]

        # Trimming may have removed the correct answer; put it back
        if correct_answer not in all_answers:
            all_answers[0] = correct_answer

        random.shuffle(all_answers)
        correct_index = all_answers.index(correct_answer)

    else:
        # Regular term: the correct answer is the column of the active rule
        column_name = QUESTION_TYPES[q_type]['column']
        correct_answer = item[column_name].strip()

        all_answers = [
            item['cat_answer'].strip(),
            item['Func_answer'].strip(),
            item['ant_answer'].strip(),
            item['syn_answer'].strip(),
            item['comp_answer'].strip()
        ]

        random.shuffle(all_answers)
        # NOTE: if two columns hold the same text, index() reports the first
        # copy after shuffling.
        correct_index = all_answers.index(correct_answer)

    return {
        'term': item['term'].strip(),
        'question_type': q_type,
        'question_text': QUESTION_TYPES[q_type]['question'],
        'question_name': QUESTION_TYPES[q_type]['name'],
        'rule_number': QUESTION_TYPES[q_type]['rule_number'],
        'answers': all_answers,
        'correct_answer': correct_answer,
        'correct_index': correct_index,
        'has_ordinal': bool(inserted_words),
        'inserted_words': inserted_words or []
    }

def check_correctness(response, correct_letter, valid_letters="ABCDE"):
    """Decide whether the model's reply selects the correct option letter.

    The reply's answer is taken to be the FIRST option letter that stands
    alone, i.e. has no alphabetic character directly before or after it.
    Accepting any standalone occurrence instead would score a reply such as
    "B or maybe A" as correct for both B and A.

    Args:
        response: The model's reply text (a str).
        correct_letter: The letter of the correct option.
        valid_letters: Characters that count as option letters. The correct
            letter is always accepted as one, even if not listed here.

    Returns:
        True when the first standalone option letter equals correct_letter;
        False otherwise, including when the reply has no such letter.
    """
    option_letters = set(valid_letters) | {correct_letter}
    last = len(response) - 1

    for i, char in enumerate(response):
        if char not in option_letters:
            continue
        # Skip letters that are part of a word, e.g. the "A" in "Answer"
        if i > 0 and response[i - 1].isalpha():
            continue
        if i < last and response[i + 1].isalpha():
            continue
        return char == correct_letter
    return False

def manage_context(context, new_user_msg, new_assistant_msg, max_size=8, has_ordinal=False):
    """Append one exchange to the chat history and trim it to max_size.

    When trimming is needed the result keeps, in chronological order:
      * the system message, if the history starts with one;
      * the most recent exchange whose user message contains an ordinal
        word, so the rule switch stays visible to the model (this exchange
        is kept even if that alone exceeds max_size);
      * as many of the most recent other exchanges as still fit.
    Only complete user/assistant pairs are kept, so a question is never
    separated from its answer.

    Args:
        context: List of chat messages ({"role", "content"} dicts). It is
            assumed to be an optional system message followed by alternating
            user/assistant messages. The new exchange is appended to this
            list in place.
        new_user_msg: Text of the user message to add.
        new_assistant_msg: Text of the assistant reply to add.
        max_size: Maximum number of messages to keep, system message
            included.
        has_ordinal: Unused; accepted for backward compatibility.

    Returns:
        The same list when no trimming was needed, otherwise a new list.
    """
    context.append({"role": "user", "content": new_user_msg})
    context.append({"role": "assistant", "content": new_assistant_msg})

    if len(context) <= max_size:
        return context

    # Index of the first non-system message (0 when there is no system message)
    first_exchange = 1 if context[0]["role"] == "system" else 0

    # Walk the user messages from newest to oldest looking for an ordinal word
    ordinal_index = None
    for i in range(len(context) - 2, first_exchange - 1, -2):
        message = context[i]
        if message["role"] != "user":
            continue
        text = message["content"].lower()
        if any(word in text for word in ORDINAL_WORDS):
            ordinal_index = i
            break

    # Message indices to keep, and how many slots are left besides the system message
    keep = set()
    budget = max_size - first_exchange
    if ordinal_index is not None:
        keep.update((ordinal_index, ordinal_index + 1))
        budget -= 2

    # Spend the remaining slots on whole exchanges, newest first
    for i in range(len(context) - 2, first_exchange - 1, -2):
        if budget < 2:
            break
        if i not in keep:
            keep.update((i, i + 1))
            budget -= 2

    # Rebuild in the original order so the history stays chronological
    trimmed = context[:first_exchange]
    trimmed.extend(context[j] for j in range(first_exchange, len(context)) if j in keep)
    return trimmed

def main():
    """Run the meta-rule word game against the chat model and save the results.

    Loads the items from 'analogy_items.csv', then runs up to 400 trials.
    Each trial picks a random item, possibly replaces its term by an ordinal
    word (which switches the active rule), asks the model the resulting
    multiple-choice question and grades the reply. A failed API call is
    recorded as an incorrect trial. Afterwards the totals are printed and the
    per-trial records are written to the 'results' directory as JSON and NPZ.

    Returns early, without calling the API or writing files, when the CSV
    holds no usable items; the accuracy figures would otherwise divide by
    zero.
    """
    # Load data and meta answers
    analogy_items = load_data('analogy_items.csv')
    meta_answers = load_meta_answers()

    # Test configuration
    base_rule = 'cat'  # Default rule (Rule 1)
    current_rule = base_rule
    num_trials = min(400, len(analogy_items))

    if num_trials == 0:
        print("No analogy items were loaded; nothing to test.")
        return

    # Initialize context with system message only
    system_prompt = create_system_prompt()
    context = [{"role": "system", "content": system_prompt}]

    # Results tracking
    all_results = []
    correct_answers = 0
    incorrect_answers = 0

    print(f"\nTesting word game with meta-rules on {num_trials} items...")
    print(f"Starting with base rule: Rule {QUESTION_TYPES[base_rule]['rule_number']} ({QUESTION_TYPES[base_rule]['name']})")

    for i in range(num_trials):
        # Select random item (sampling with replacement)
        item = random.choice(analogy_items)

        # Maybe insert ordinal words
        original_term = item['term'].strip()
        modified_term, inserted_words = insert_ordinal_word(original_term)

        # Update current rule based on ordinal words; the new rule stays
        # active for the following trials until the next ordinal word
        if inserted_words:
            current_rule = determine_active_rule(inserted_words, current_rule)
            rule_num = QUESTION_TYPES[current_rule]['rule_number']
            print(f"\n🔄 Meta-rule activation! '{inserted_words[-1]}' detected -> Switch to Rule {rule_num} ({QUESTION_TYPES[current_rule]['name']})")

        # Create modified item for question preparation (copy so the loaded
        # row keeps its original term)
        modified_item = item.copy()
        modified_item['term'] = modified_term

        # Prepare question
        question_data = prepare_question(modified_item, current_rule, meta_answers, inserted_words)

        term = question_data['term']
        question_text = question_data['question_text']
        answers = question_data['answers']
        correct_answer = question_data['correct_answer']
        correct_index = question_data['correct_index']
        correct_letter = chr(65 + correct_index)  # 0 -> 'A', 1 -> 'B', ...
        rule_number = question_data['rule_number']

        # Format answers
        formatted_answers = "\n".join([f"{chr(65+j)}. {answer}" for j, answer in enumerate(answers)])

        print(f"\n----- Trial {i+1}/{num_trials} -----")
        print(f"Term: {term}")
        if inserted_words:
            print(f"Ordinal words detected: {inserted_words}")
        print(f"Active rule: Rule {rule_number} ({QUESTION_TYPES[current_rule]['name']})")
        print(f"Question: {question_text}")
        print(f"Options:\n{formatted_answers}")
        print(f"Correct answer: {correct_letter}. {correct_answer}")

        # Create user prompt - NO meta-rule explanation in user prompt, only
        # the term and its options; the model must track the rule itself
        user_prompt = f"Term: {term}\n"
        user_prompt += f"Options:\n{formatted_answers}\n\n"
        user_prompt += "Answer with just the letter (A, B, C, D, or E)."

        # Create result object
        result = {
            "trial_num": i+1,
            "original_term": original_term,
            "modified_term": term,
            "inserted_words": inserted_words,
            "active_rule": current_rule,
            "rule_number": rule_number,
            "question_type": current_rule,
            "question_name": question_data['question_name'],
            "options": answers,
            "correct_answer": correct_answer,
            "correct_letter": correct_letter,
            "context_length": len(context),
            "timestamp": datetime.now().isoformat()
        }

        # Call OpenAI GPT-4o API
        try:
            print("Calling GPT-4o API...")
            start_time = time.time()

            # Prepare messages for GPT-4o
            current_messages = context + [{"role": "user", "content": user_prompt}]

            # Call OpenAI API
            response = client.chat.completions.create(
                model=model,
                messages=current_messages,
                max_tokens=100,
                temperature=0
            )

            end_time = time.time()
            response_time = end_time - start_time
            response_text = response.choices[0].message.content

            print(f"GPT-4o Response: {response_text}")

            # Check correctness
            is_correct = check_correctness(response_text, correct_letter)

            # Update result
            result.update({
                "response": response_text,
                "response_time": response_time,
                "is_correct": is_correct
            })

            if is_correct:
                print(f"✓ Correct! Selected {correct_letter}.")
                correct_answers += 1
            else:
                print(f"✗ Incorrect. The correct answer is: {correct_letter}. {correct_answer}")
                incorrect_answers += 1

            # Update context with sliding window, preserving ordinal words
            context = manage_context(context, user_prompt, response_text, MAX_CONTEXT_SIZE, bool(inserted_words))

        except Exception as e:
            # A failed trial is logged and scored as incorrect; the context
            # is left as it was before the call
            error_msg = str(e)
            print(f"Error: {error_msg}")
            result["error"] = error_msg
            result["is_correct"] = False
            incorrect_answers += 1

        all_results.append(result)
        time.sleep(0.5)  # Rate limiting for OpenAI API (more lenient than others)

    # Print final results
    print("\n===== Final Results =====")
    print(f"Total trials: {num_trials}")
    print(f"Correct answers: {correct_answers} ({correct_answers/num_trials*100:.1f}%)")
    print(f"Incorrect answers: {incorrect_answers} ({incorrect_answers/num_trials*100:.1f}%)")

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = "results"
    os.makedirs(results_dir, exist_ok=True)

    # Save JSON
    json_filename = f"{results_dir}/gpt4o_word_game_meta_{timestamp}.json"
    with open(json_filename, 'w') as f:
        json.dump(all_results, f, indent=2)
    print(f"Results saved to {json_filename}")

    # Save NPZ (trials without a response time, i.e. failed calls, store 0)
    npz_filename = f"{results_dir}/gpt4o_word_game_meta_{timestamp}.npz"
    np.savez(
        npz_filename,
        trial_nums=np.array([r["trial_num"] for r in all_results]),
        correctness=np.array([1 if r["is_correct"] else 0 for r in all_results]),
        context_lengths=np.array([r["context_length"] for r in all_results]),
        response_times=np.array([r.get("response_time", 0) for r in all_results]),
        has_ordinals=np.array([1 if r["inserted_words"] else 0 for r in all_results]),
        accuracy=correct_answers/num_trials,
        timestamp=timestamp
    )
    print(f"Results also saved to {npz_filename}")
    print("\nDone testing!")

# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()