# In [None]:
#New Openai (Few shot/Nona)
import json
import time
import random
import numpy as np
import os
import csv
from datetime import datetime
from openai import OpenAI  # Using OpenAI's API

# Initialize the OpenAI API client.
# The key is taken from the OPENAI_API_KEY environment variable when it is set,
# so a real key never has to be pasted into this file; the placeholder below is
# only used as a fallback.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR KEY HERE"))

# Model settings
model = "gpt-4o"

# Set the specific question type we want to test
question_type = 'cat'
# Options: 'cat', 'func', 'ant', 'syn', 'comp'
# NOTE: 'caus' has no entry in QUESTION_TYPES, so selecting it raises KeyError.

# Per-question-type metadata, keyed by the short code stored in `question_type`.
# Each entry carries:
#   name        - human-readable label
#   question    - question text shown to the model
#   column      - CSV column that holds the correct answer for this type
#   explanation - description of the type that is appended to the system prompt
QUESTION_TYPES = {
    'cat': dict(
        name='Category',
        question="What category does this belong to?",
        column='cat_answer',
        explanation="Category questions ask about the broader category the term belongs to. (e.g., Moose -> Animal; Persimmon -> Fruit; Stethoscope -> Medical Device)",
    ),
    'func': dict(
        name='Function',
        question="What function does this serve or what does it do?",
        column='Func_answer',
        explanation="Function questions ask about the action or purpose associated with the term. (e.g., Moose -> Graze; Persimmon -> Eat; Stethoscope -> Listen).",
    ),
    'ant': dict(
        name='Antonym',
        question="What is the opposite of this?",
        column='ant_answer',
        explanation="Antonym questions ask about the opposite or contrasting concept. (e.g., Moose -> Mouse; Persimmon -> Vinegar; Stethoscope -> Earplug)",
    ),
    'syn': dict(
        name='Synonym',
        question="What is similar to this?",
        column='syn_answer',
        explanation="Synonym questions ask about what is similar to or can substitute for the term. (e.g., Moose -> Elk; Persimmon -> Kaki; Stethoscope -> Phonendoscope)",
    ),
    'comp': dict(
        name='Compositional',
        question="What larger structure contains this item, what is it composed of, or what is it a key ingredient of?",
        column='comp_answer',
        explanation="Compositional questions ask about what larger structure contains this item, what is it composed of, or what is it a key ingredient of. (e.g., Moose -> Herd; Persimmon -> Tart; Stethoscope -> Hospital)",
    ),
}

# Load the analogy items from CSV
def load_data(file_path):
    """Read the semicolon-delimited analogy CSV into a list of row dicts.

    The file is decoded as latin1. Header names are cleaned of a byte-order
    mark in both of the forms it can take here: the single character U+FEFF,
    and the three characters a UTF-8 BOM becomes when decoded as latin1.
    Cells beyond the header width (for example from a trailing ';') are
    discarded, and rows whose ``term`` cell is missing or blank are skipped.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one ``{column_name: cell_value}`` dict per kept row.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            printed and then re-raised unchanged.
    """
    # A UTF-8 BOM (bytes EF BB BF) decoded as latin1 arrives as three separate
    # characters rather than as U+FEFF, so stripping only U+FEFF would leave
    # the first header polluted and the 'term' column unrecognised.
    bom_markers = ('\ufeff', '\u00ef\u00bb\u00bf')
    data = []
    try:
        # newline='' lets the csv module do its own line-ending handling.
        with open(file_path, 'r', encoding='latin1', newline='') as f:
            reader = csv.DictReader(f, delimiter=';')
            for row in reader:
                clean_row = {}
                for key, value in row.items():
                    if key is None:
                        # DictReader files overflow cells under the key None.
                        continue
                    clean_key = key
                    for marker in bom_markers:
                        clean_key = clean_key.replace(marker, '')
                    clean_row[clean_key] = value
                # Short rows get None for missing cells; treat that as blank.
                term = clean_row.get('term')
                if term and term.strip():
                    data.append(clean_row)
        print(f"Successfully loaded {len(data)} items from CSV")
    except Exception as e:
        print(f"Error reading CSV: {e}")
        raise
    return data

# Read every analogy item from the CSV file
analogy_items = load_data('analogy_items.csv')
print(f"Loaded dataset with {len(analogy_items)} items")

# Draw distinct random positions (at most 400, fewer if the dataset is smaller)
# and pick the items at those positions, keeping the drawn order.
num_trials = min(len(analogy_items), 400)
selected_indices = random.sample(range(len(analogy_items)), num_trials)
selected_items = list(map(analogy_items.__getitem__, selected_indices))

# Build one multiple-choice question for an item
def prepare_question(item, q_type):
    """Assemble a shuffled five-option question of type *q_type* for *item*.

    The options are the stripped values of the item's five answer columns,
    shuffled in place with ``random.shuffle``. The correct option is the
    value of the column that ``QUESTION_TYPES[q_type]`` points at.

    Args:
        item: Row dict holding ``term`` and the five answer columns.
        q_type: Key into ``QUESTION_TYPES``.

    Returns:
        A dict with the term, the question type/text/name, the shuffled
        ``answers`` list, the ``correct_answer`` text and its
        ``correct_index`` (first matching position) within ``answers``.
    """
    spec = QUESTION_TYPES[q_type]
    target = item[spec['column']].strip()

    # Fixed column order before shuffling
    answer_columns = ('cat_answer', 'Func_answer', 'ant_answer', 'syn_answer', 'comp_answer')
    options = [item[column].strip() for column in answer_columns]
    random.shuffle(options)
    target_position = options.index(target)

    question = {}
    question['term'] = item['term'].strip()
    question['question_type'] = q_type
    question['question_text'] = QUESTION_TYPES[q_type]['question']
    question['question_name'] = QUESTION_TYPES[q_type]['name']
    question['answers'] = options
    question['correct_answer'] = target
    question['correct_index'] = target_position
    return question

# Static few-shot examples; each term keeps one option order across its questions
def get_few_shot_examples():
    """Return the fixed few-shot text: ten worked examples joined by blank lines.

    Two demo terms (Moose, Persimmon) are each paired with the five question
    texts. Every example shows the term, the question, the term's lettered
    options (A-E) and an ``Answer:`` line with the letter of the keyed option.
    """
    question_texts = (
        "What category does this belong to?",
        "What function does this serve or what does it do?",
        "What is the opposite of this?",
        "What is similar to this?",
        "What larger structure contains this item, what is it composed of, or what is it a key ingredient of?",
    )
    # (term, options in display order, keyed answer per question text above)
    demos = (
        (
            "Moose",
            ("Mouse", "Animal", "Herd", "Elk", "Graze"),
            ("Animal", "Graze", "Mouse", "Elk", "Herd"),
        ),
        (
            "Persimmon",
            ("Fruit", "Tart", "Vinegar", "Kaki", "Eat"),
            ("Fruit", "Eat", "Vinegar", "Kaki", "Tart"),
        ),
    )

    rendered = []
    for term, options, keyed_answers in demos:
        option_lines = "\n".join(
            f"{chr(ord('A') + pos)}. {text}" for pos, text in enumerate(options)
        )
        for question, keyed in zip(question_texts, keyed_answers):
            answer_letter = chr(ord('A') + options.index(keyed))
            rendered.append(
                f"Term: {term}\nQuestion: {question}\n{option_lines}\nAnswer: {answer_letter}"
            )
    return "\n\n".join(rendered)

# Check correctness
def check_correctness(response, correct_letter, valid_letters="ABCDE"):
    """Return True when the model's chosen option letter is *correct_letter*.

    The chosen letter is the FIRST option letter in *response* that stands
    alone, i.e. has no alphabetic character directly before or after it.
    Scoring only that first letter avoids counting a later stray letter
    (such as the article in "B. A moose is large") as the selected answer.

    Args:
        response: Raw text returned by the model; empty or None counts as
            no answer.
        correct_letter: Letter of the correct option.
        valid_letters: Letters that count as option labels. Matching is
            case-sensitive.

    Returns:
        True if the first standalone option letter equals *correct_letter*,
        False otherwise (including when no standalone letter is found).
    """
    if not response:
        return False

    # Always recognise the correct letter, even if a caller passes a
    # valid_letters set that omits it.
    option_letters = set(valid_letters) | {correct_letter}
    last_index = len(response) - 1

    for i, char in enumerate(response):
        if char not in option_letters:
            continue
        if i > 0 and response[i - 1].isalpha():
            continue  # letter is the tail or middle of a word
        if i < last_index and response[i + 1].isalpha():
            continue  # letter starts a longer word, e.g. the "A" of "Answer"
        # First standalone option letter found: this is the model's choice.
        return char == correct_letter
    return False

# Results storage: one dict per trial plus running tallies
all_results = []
correct_answers = 0
incorrect_answers = 0

# Create the output directory if needed. exist_ok=True replaces the separate
# existence check, which could race with another process creating the directory.
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

print(f"\nTesting '{QUESTION_TYPES[question_type]['name']}' questions on {num_trials} randomly selected items...")

# The system prompt and the few-shot text depend only on `question_type`,
# so they are built once here instead of on every trial.
# The newline after the first sentence keeps it from running straight into
# "1. Category" in the prompt.
# NOTE(review): the Category example below says "Stethoscope -> Object" while
# QUESTION_TYPES['cat']['explanation'] says "Medical Device"; for 'cat' both
# end up in the same prompt - confirm which one is intended.
system_prompt = "You are an AI trained to answer analogy questions.\n"
system_prompt += "1. Category: What broader category does the term belong to? (e.g., Moose -> Animal; Persimmon -> Fruit; Stethoscope -> Object)\n"
system_prompt += "2. Function: What action or purpose is associated with the term? (e.g., Moose -> Graze; Persimmon -> Eat; Stethoscope -> Listen)\n"
system_prompt += "3. Antonym: What is the opposite or contrasting concept? (e.g., Moose -> Mouse; Persimmon -> Vinegar; Stethoscope -> Earplug)\n"
system_prompt += "4. Synonym: What is similar to or can substitute for the term? (e.g., Moose -> Elk; Persimmon -> Kaki; Stethoscope -> Phonendoscope)\n"
system_prompt += "5. Compositional: What larger structure contains this item, what is it composed of, or what is it a key ingredient of? (e.g., Moose -> Herd; Persimmon -> Tart; Stethoscope -> Hospital)\n\n"
system_prompt += QUESTION_TYPES[question_type]['explanation'] + " "
system_prompt += "Answer with the letter of the correct option (A, B, C, D, or E)."

few_shot_text = get_few_shot_examples()

# Run one API call per selected item and record the outcome of each trial.
for i, item in enumerate(selected_items):
    question_data = prepare_question(item, question_type)
    term = question_data['term']
    question_text = question_data['question_text']
    answers = question_data['answers']
    correct_answer = question_data['correct_answer']
    correct_index = question_data['correct_index']
    correct_letter = chr(65 + correct_index)  # 0 -> 'A', 1 -> 'B', ...
    formatted_answers = "\n".join([f"{chr(65+j)}. {answer}" for j, answer in enumerate(answers)])

    print(f"\n----- Trial {i+1}/{num_trials} -----")
    print(f"Term: {term}")
    print(f"Question: {question_text}")
    print(f"Options:\n{formatted_answers}")
    print(f"Correct answer: {correct_letter}. {correct_answer}")

    # Few-shot examples first, then the actual question.
    # NOTE(review): the few-shot blocks list the options directly under the
    # question and end with "Answer: X", whereas this query adds a blank line
    # plus "Options:" and has no "Answer:" cue. Left unchanged so results stay
    # comparable with earlier runs - consider aligning the two formats.
    user_prompt = few_shot_text + "\n\n" + f"Term: {term}\nQuestion: {question_text}\n\nOptions:\n{formatted_answers}\n"

    current_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    result = {
        "trial_num": i+1,
        "term": term,
        "question_type": question_type,
        "question_name": QUESTION_TYPES[question_type]['name'],
        "options": answers,
        "correct_answer": correct_answer,
        "correct_letter": correct_letter,
        "timestamp": datetime.now().isoformat()
    }

    try:
        print("Calling API...")
        # perf_counter is a monotonic clock, so the measured interval cannot
        # be distorted by system clock adjustments.
        start_time = time.perf_counter()

        response = client.chat.completions.create(
            model=model,
            messages=current_messages,
            temperature=0,
            max_tokens=100
        )

        response_time = time.perf_counter() - start_time

        # The message content can be None; score that as an empty answer
        # rather than letting the scorer fail on a non-string.
        response_text = response.choices[0].message.content or ""
        print(f"AI Response: {response_text}")

        is_correct = check_correctness(response_text, correct_letter)

        result["response"] = response_text
        result["response_time"] = response_time
        result["is_correct"] = is_correct

        if is_correct:
            print(f"✓ Correct! Selected {correct_letter}.")
            correct_answers += 1
        else:
            print(f"✗ Incorrect. The correct answer is: {correct_letter}. {correct_answer}")
            incorrect_answers += 1

    except Exception as e:
        # Deliberately broad: one failed call must not abort the whole run.
        # The trial is stored with its error text and counted as incorrect.
        error_msg = str(e)
        print(f"Error: {error_msg}")
        result["error"] = error_msg
        result["is_correct"] = False
        incorrect_answers += 1

    all_results.append(result)
    time.sleep(1.5)  # fixed pause between consecutive API calls

# Summarise the run. The ratios are guarded because an empty dataset gives
# num_trials == 0, which would otherwise raise ZeroDivisionError here and
# prevent the result files from being written.
accuracy = correct_answers / num_trials if num_trials else 0.0
error_rate = incorrect_answers / num_trials if num_trials else 0.0

print("\n===== Results =====")
print(f"Total trials: {num_trials}")
print(f"Correct answers: {correct_answers} ({accuracy*100:.1f}%)")
print(f"Incorrect answers: {incorrect_answers} ({error_rate*100:.1f}%)")

# Full per-trial records as JSON
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
json_filename = f"{results_dir}/independent_analogy_responses_{question_type}_{timestamp}.json"
with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2)
print(f"Results saved to {json_filename}")

# Compact numeric summary as NPZ; trials that failed before a response time
# was recorded are stored with a response time of 0.
npz_filename = f"{results_dir}/independent_analogy_responses_{question_type}_{timestamp}.npz"
trial_nums = np.array([r["trial_num"] for r in all_results])
correctness = np.array([1 if r["is_correct"] else 0 for r in all_results])
response_times = np.array([r.get("response_time", 0) for r in all_results])
np.savez(
    npz_filename,
    trial_nums=trial_nums,
    correctness=correctness,
    response_times=response_times,
    question_type=question_type,
    accuracy=accuracy,
    timestamp=timestamp
)
print(f"Results also saved to {npz_filename}")

print("\nDone testing!")
