# Script for checking the answers generated by models
***

## Answer checking functions

In [31]:
import os
import json
import pandas as pd
import re
import json
import ast
import requests
from rdkit import Chem
import numpy as np

def process_answer(answer):
    """Normalise a raw model reply into a value that can be compared.

    Non-string inputs are returned untouched. For strings, the text inside
    the first ``<answer>...</answer>`` pair is used when such tags are
    present, surrounding whitespace and trailing periods are removed, and the
    remainder is parsed with ``ast.literal_eval``. When parsing fails, the
    cleaned string itself is returned.
    """
    if not isinstance(answer, str):
        return answer

    # Prefer the tagged payload when the reply wraps its answer in tags.
    tagged = re.search(r"<answer>(.*?)</answer>", answer, re.DOTALL)
    text = tagged.group(1) if tagged else answer

    # Drop surrounding whitespace first, then any trailing full stops.
    text = text.strip().rstrip('.')

    # Numbers, lists, tuples, ... become Python objects; free text stays text.
    try:
        parsed = ast.literal_eval(text)
    except Exception:
        return text
    return parsed

def check_exact_match(chat_answer, question_data):
    """Return True when the answers are equal after conversion to ``str``.

    The expected value is read from ``question_data["answer"]``; a missing
    key is treated as ``None`` (i.e. compared as the string ``"None"``).
    """
    expected = question_data.get("answer")
    return str(expected) == str(chat_answer)

def check_list_of_tuples(chat_answer, question_data):
    """
    Compare chat_answer with the expected collection, ignoring order.

    The expected answer (``question_data["answer"]``) may be a string holding
    a Python literal or an actual list/tuple. Elements that are lists are
    turned into tuples, both sides are collected into sets, and the sets are
    compared, so ordering and duplicates do not matter.

    Returns False (after printing a message) when chat_answer is not a
    list/tuple or cannot be turned into a set. Raises Exception when the
    expected answer cannot be evaluated or is not a list/tuple.
    """
    def as_hashable(entry):
        # Lists cannot be set members; tuples can.
        if isinstance(entry, list):
            return tuple(entry)
        return entry

    # Resolve the reference answer into a list/tuple.
    reference = question_data.get("answer")
    if isinstance(reference, str):
        try:
            reference = ast.literal_eval(reference)
        except Exception as e:
            raise Exception(f"Error evaluating expected answer for uuid {question_data.get('uuid', 'unknown')}: {e}")

    if not isinstance(reference, (list, tuple)):
        raise Exception(f"The expected answer for uuid {question_data.get('uuid', 'unknown')} is not a list or tuple.")

    reference_set = {as_hashable(entry) for entry in reference}

    # The model answer must already be a sequence (see process_answer).
    if not isinstance(chat_answer, (list, tuple)):
        print("Error in list_of_tuples verification: chat_answer is not a list or tuple.")
        return False

    try:
        candidate_set = {as_hashable(entry) for entry in chat_answer}
    except Exception as e:
        print("Error normalizing chat_answer:", e)
        return False

    return candidate_set == reference_set

def get_opsin(chat_answer, timeout=30):
    """Look up the SMILES for a chemical name via the OPSIN web service.

    Args:
        chat_answer: Name to convert; it is passed through ``str`` before
            being placed in the request URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``"smiles"`` field of the JSON reply (``""`` when the field is
        absent), or ``False`` when the request or decoding fails for any
        reason.
    """
    from urllib.parse import quote

    base_url = "https://opsin.ch.cam.ac.uk/opsin/"
    # Percent-encode the name so characters such as '#', '?', '/', brackets
    # or spaces stay part of the path segment instead of altering the URL.
    api_url = f"{base_url}{quote(str(chat_answer), safe='')}.json"
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return data.get("smiles", "")
    except Exception:
        # Best-effort lookup: any failure simply means "no SMILES available".
        return False

def check_range(chat_answer, question_data):
    """Return True when chat_answer, read as a float, lies in the answer range.

    ``question_data["answer_range"]`` may be a string containing a Python
    literal such as ``"[1.0, 2.5]"`` or an already-parsed two-element
    list/tuple ``(low, high)``. Both bounds are inclusive.

    Any failure (non-numeric answer, missing or malformed range) is reported
    with a printed message and counts as an incorrect answer (False).
    """
    try:
        value = float(chat_answer)
        bounds = question_data["answer_range"]
        # Only strings need evaluating; ast.literal_eval rejects real lists,
        # which would otherwise mark every answer as wrong.
        if isinstance(bounds, str):
            bounds = ast.literal_eval(bounds)
        low, high = bounds
        return low <= value <= high
    except Exception as e:
        print("Error in range verification:", e)
        return False

def check_canonical_smi(chat_answer, question_data):
    """Compare two SMILES strings after canonicalisation.

    Both chat_answer and ``question_data["answer"]`` are passed through
    ``canonicalize_smiles``. The result is True only when both produce a
    value and the two values are equal.
    """
    candidate = canonicalize_smiles(chat_answer)
    reference = canonicalize_smiles(question_data.get("answer"))
    both_valid = candidate is not None and reference is not None
    return both_valid and candidate == reference

def canonicalize_smiles(smiles_str):
    """Return the canonical SMILES for smiles_str with stereochemistry removed.

    Returns None when RDKit cannot build a molecule from the input or when
    any step raises an exception.
    """
    try:
        parsed = Chem.MolFromSmiles(smiles_str)
        if parsed is not None:
            # Stereo information is removed before the SMILES is written out.
            Chem.RemoveStereochemistry(parsed)
            return Chem.MolToSmiles(parsed, canonical=True)
    except Exception:
        pass
    return None

def read_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so results do not depend on the platform's
    # default locale encoding.
    with open(file_path, 'r', encoding="utf-8") as f:
        # Blank or whitespace-only lines are skipped.
        return [json.loads(line) for line in f if line.strip()]

## Results paths

In [37]:
# Map each evaluated model name to its batch output file; every file is named
# after its model and lives under batch_output/.
results_paths = {
    model_name: f"batch_output/{model_name}.jsonl"
    for model_name in (
        "gpt-4o",
        "gpt-4o-tags",
        "o3-mini-low",
        "o3-mini-medium",
        "o3-mini-high",
    )
}

## Load questions

In [38]:
# Load every question file and concatenate the records in file order
all_questions = [
    record
    for source in (
        "questions/chemiq.jsonl",
        "questions/additional_smiles_to_iupac.jsonl",
    )
    for record in read_jsonl(source)
]

# Lookup table keyed by uuid; a uuid that appears twice keeps its last record
question_dict = {entry["uuid"]: entry for entry in all_questions}

## Check answers

In [35]:
# Some questions have been removed from the question set but still exist in the batch results
missing_uuids = []

# Column order used for every per-model results CSV
cols = ["model", "uuid", "question_category", "sub_category", "ChemIQ",
        "success", "expected_answer", "model_answer", "opsin_smiles",
        "completion_tokens", "reasoning_tokens"]

# Make sure the output directory exists before the first CSV is written
os.makedirs("results", exist_ok=True)

for model, path in results_paths.items():

    data_records = []
    results_list = read_jsonl(path)

    # --- Process every answered question ---
    for api_response in results_list:
        uuid = api_response["custom_id"]

        # UUID is not contained in question set
        # Some questions were removed
        if uuid not in question_dict:
            missing_uuids.append(uuid)
            continue

        q = question_dict[uuid]

        # model output + token stats
        body = api_response["response"]["body"]
        model_answer = process_answer(body["choices"][0]["message"]["content"])
        usage = body["usage"]
        completion_tokens = usage["completion_tokens"]
        reasoning_tokens = usage["completion_tokens_details"]["reasoning_tokens"]

        # Reset for every question: only the "opsin" branch sets a value, and
        # without the reset the previous OPSIN result would be recorded
        # against later, unrelated questions (and later models).
        opsin_smiles = np.nan

        # verification
        method = q["verification_method"]
        if method == "exact_match":
            is_correct = check_exact_match(model_answer, q)
        elif method == "list_of_tuples":
            is_correct = check_list_of_tuples(model_answer, q)
        elif method == "range":
            is_correct = check_range(model_answer, q)
        elif method == "canonical_smi_match":
            is_correct = check_canonical_smi(model_answer, q)
        elif method == "opsin":
            # The answer is a chemical name: convert it to SMILES first, then
            # compare that SMILES with the expected one.
            opsin_smiles = get_opsin(model_answer)
            is_correct = check_canonical_smi(opsin_smiles, q)
        else:
            raise ValueError(f"Unsupported verification method: {method}")

        data_records.append({
            "model":             model,
            "uuid":              uuid,
            "question_category": q["question_category"],
            "sub_category":      q["sub_category"],
            "ChemIQ":            q["ChemIQ"],
            "success":           is_correct,
            "expected_answer":   q["answer"],
            "model_answer":      model_answer,
            "opsin_smiles":      opsin_smiles,
            "completion_tokens": completion_tokens,
            "reasoning_tokens":  reasoning_tokens,
        })

    # --- Add rows for any unanswered questions ---
    # Questions without a response count as failures with empty answer fields.
    answered = {r["uuid"] for r in data_records}
    for uuid, q in question_dict.items():
        if uuid not in answered:
            data_records.append({
                "model":             model,
                "uuid":              uuid,
                "question_category": q["question_category"],
                "sub_category":      q["sub_category"],
                "ChemIQ":            q["ChemIQ"],
                "success":           False,
                "expected_answer":   q["answer"],
                "model_answer":      np.nan,
                "opsin_smiles":      np.nan,
                "completion_tokens": np.nan,
                "reasoning_tokens":  np.nan,
            })

    # Build DataFrame with the enforced column order; passing the columns to
    # the constructor also yields a valid (empty) frame when there are no rows.
    df = pd.DataFrame(data_records, columns=cols)

    # write dataframe to csv
    df.to_csv(f"results/{model}_results.csv", index=False)

[10:23:20] SMILES Parse Error: unclosed ring for input: 'n1ccc(C(=O)N1CC1CSCC1)cc1'
[10:23:20] SMILES Parse Error: unclosed ring for input: 'C1CC(CC#C)N=[N+]=[N-]C2CCCC2'
[10:23:20] Explicit valence for atom # 3 F, 3, is greater than permitted
[10:23:20] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20
[10:23:20] Can't kekulize mol.  Unkekulized atoms: 2 3 10
[10:23:20] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 7 8 9
[10:23:20] Can't kekulize mol.  Unkekulized atoms: 3 4 5
[10:23:20] SMILES Parse Error: unclosed ring for input: 'CC(C)C1=CN(C(=O)C2=CC=CC=C2)S(=O)(=O)N(C3=CC=C(C=C3)C4=CC=CC=C4)C(=O)COC'
[10:23:28] SMILES Parse Error: syntax error while parsing: C#Cc1cccnc1C(CCl)[N2+]N=[N-]
[10:23:28] SMILES Parse Error: Failed parsing SMILES 'C#Cc1cccnc1C(CCl)[N2+]N=[N-]' for input: 'C#Cc1cccnc1C(CCl)[N2+]N=[N-]'
[10:23:28] SMILES Parse Error: syntax error while parsing: C#CC1CSC1Cu.N#N=[N+]=NC(F)(F)F
[10:23:28] SMILES Parse Error: Failed parsing SMILES 'C#CC1CSC1Cu.N#N=[

Error in range verification: could not convert string to float: "To solve this, we assess trends in the given SMILES scores. Specifically, the unknown molecule `c12cc(F)c(F)c(Cl)c1cco2` can be evaluated from patterns in substitution and halogen types.\n\n**Analysis**:\n- Each fluorine substitution likely decreases the score compared to chlorine or bromine. \n- Substituting one fluorine for chlorine lowers the score when compared with molecules with multiple chlorines (e.g., indexes 0 vs. 1 and 4).\n- Considering fluorine's smaller impact on score versus other halogens (based on observed precedent), we generate an interpolated score for the unknown molecule.\n\nGiven cross-trends:\nThe score for `c12cc(F)c(F)c(Cl)c1cco2` should sit between molecules with similar substitution patterns to 0 and 4 (assigned breakdown).\n\nExact score mechanism tracks halogen score decrement\n\ncalculatedduction Numeric interpolations"
Error in range verification: could not convert string to float: 'To dete

[10:23:43] Explicit valence for atom # 1 C, 5, is greater than permitted
[10:23:43] SMILES Parse Error: syntax error while parsing: The
[10:23:43] SMILES Parse Error: Failed parsing SMILES 'The' for input: 'The'
[10:23:43] Explicit valence for atom # 7 F, 2, is greater than permitted
[10:23:43] SMILES Parse Error: syntax error while parsing: To
[10:23:43] SMILES Parse Error: Failed parsing SMILES 'To' for input: 'To'
[10:23:43] SMILES Parse Error: syntax error while parsing: The
[10:23:43] SMILES Parse Error: Failed parsing SMILES 'The' for input: 'The'
[10:23:43] Can't kekulize mol.  Unkekulized atoms: 14 15 16 27 28
[10:23:43] SMILES Parse Error: syntax error while parsing: The
[10:23:43] SMILES Parse Error: Failed parsing SMILES 'The' for input: 'The'
[10:23:43] SMILES Parse Error: syntax error while parsing: The
[10:23:43] SMILES Parse Error: Failed parsing SMILES 'The' for input: 'The'
[10:23:43] SMILES Parse Error: syntax error while parsing: To
[10:23:43] SMILES Parse Error: Fai

Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.


[10:24:05] SMILES Parse Error: syntax error while parsing: FC(F)(F)C(O)(C1OCC1)C=CH2
[10:24:05] SMILES Parse Error: Failed parsing SMILES 'FC(F)(F)C(O)(C1OCC1)C=CH2' for input: 'FC(F)(F)C(O)(C1OCC1)C=CH2'
[10:24:05] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[10:24:05] SMILES Parse Error: ring closure 2 duplicates bond between atom 6 and atom 10 for input: 'C1CCCC(C1)[C@]2(C3SC3)C(C)(c4ccccc4)2'
[10:24:05] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9
[10:24:05] Can't kekulize mol.  Unkekulized atoms: 2 3 20 21 22
[10:24:05] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 8
[10:24:05] Can't kekulize mol.  Unkekulized atoms: 1 2 5 6 7
[10:24:05] SMILES Parse Error: syntax error while parsing: N#
[10:24:05] SMILES Parse Error: Failed parsing SMILES 'N#' for input: 'N#'
[10:24:05] Can't kekulize mol.  Unkekulized atoms: 1 2 4 5 6
[10:24:05] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6
[10:24:05] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7 8 12 13 14
[10:24:05] Ca

Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.


[10:24:13] Can't kekulize mol.  Unkekulized atoms: 6 7 9
[10:24:13] Explicit valence for atom # 4 F, 2, is greater than permitted
[10:24:13] Can't kekulize mol.  Unkekulized atoms: 3 4 5 7 8 15 16
[10:24:13] Explicit valence for atom # 4 F, 2, is greater than permitted
[10:24:13] Explicit valence for atom # 3 C, 5, is greater than permitted
[10:24:13] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7
[10:24:13] Explicit valence for atom # 24 F, 3, is greater than permitted


Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.


[10:24:29] Explicit valence for atom # 3 F, 2, is greater than permitted
[10:24:29] Explicit valence for atom # 19 C, 6, is greater than permitted
[10:24:29] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 11 12 20 21 22 23 27 28
[10:24:29] SMILES Parse Error: syntax error while parsing: S1CC(C1)c2cc(cc(C)c2(7)C3CC3)c4cc(C5CCC5)cc([7])c4CCCl
[10:24:29] SMILES Parse Error: Failed parsing SMILES 'S1CC(C1)c2cc(cc(C)c2(7)C3CC3)c4cc(C5CCC5)cc([7])c4CCCl' for input: 'S1CC(C1)c2cc(cc(C)c2(7)C3CC3)c4cc(C5CCC5)cc([7])c4CCCl'
[10:24:29] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[10:24:29] Explicit valence for atom # 3 F, 2, is greater than permitted
[10:24:29] Explicit valence for atom # 10 O, 3, is greater than permitted
[10:24:29] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 7
[10:24:29] SMILES Parse Error: syntax error while parsing: c1cc(C(C)CCC)cc(c1)CNHC(=O)C
[10:24:29] SMILES Parse Error: Failed parsing SMILES 'c1cc(C(C)CCC)cc(c1)CNHC(=O)C' for input: 'c1cc(C(C)CCC)cc(c1)CNHC

Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.


[10:24:52] Explicit valence for atom # 1 C, 5, is greater than permitted
[10:24:52] Can't kekulize mol.  Unkekulized atoms: 17 21 22 24 25
[10:24:52] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[10:24:52] SMILES Parse Error: syntax error while parsing: C1(=O)C=COC(OCH3)=C1
[10:24:52] SMILES Parse Error: Failed parsing SMILES 'C1(=O)C=COC(OCH3)=C1' for input: 'C1(=O)C=COC(OCH3)=C1'
[10:24:52] non-ring atom 2 marked aromatic
[10:24:52] Can't kekulize mol.  Unkekulized atoms: 12 16 18 19
[10:24:52] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 20 21 22
[10:24:52] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 11
[10:24:52] SMILES Parse Error: syntax error while parsing: CN(S(=O)(=O)c1ccc(C)cc1)C(=O)CH2NC(C)C(=O)NCc2ccc(OC)cc2
[10:24:52] SMILES Parse Error: Failed parsing SMILES 'CN(S(=O)(=O)c1ccc(C)cc1)C(=O)CH2NC(C)C(=O)NCc2ccc(OC)cc2' for input: 'CN(S(=O)(=O)c1ccc(C)cc1)C(=O)CH2NC(C)C(=O)NCc2ccc(OC)cc2'
[10:24:52] Can't kekulize mol.  Unkekulized atoms: 9 15 23


Error in list_of_tuples verification: chat_answer is not a list or tuple.


[10:25:00] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[10:25:00] SMILES Parse Error: unclosed ring for input: 'n(C1CCCC1)nnc(C2CCC2)c1'
[10:25:00] Explicit valence for atom # 3 F, 2, is greater than permitted
[10:25:00] Can't kekulize mol.  Unkekulized atoms: 13 14 21 28 29
[10:25:00] SMILES Parse Error: syntax error while parsing: CCCc1cc(C)c(C/C=C\(C2SC2))cc1CCF
[10:25:00] SMILES Parse Error: Failed parsing SMILES 'CCCc1cc(C)c(C/C=C\(C2SC2))cc1CCF' for input: 'CCCc1cc(C)c(C/C=C\(C2SC2))cc1CCF'
[10:25:00] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14
[10:25:00] Can't kekulize mol.  Unkekulized atoms: 0 1 5 6 12 23 24 25 26 30
[10:25:00] Can't kekulize mol.  Unkekulized atoms: 6 7 11 12 19 20 26
[10:25:00] SMILES Parse Error: unclosed ring for input: 'CC(C1CCCCC1)=C(CF3)'


Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
Error in list_of_tuples verification: chat_answer is not a list or tuple.
