In [2]:
from rdkit import Chem

# Load SMILES and convert to canonical SMILES
def load_canonical_smiles(file):
    """Read a SMILES file and collect its unique canonical SMILES.

    Each line of ``file`` is stripped and parsed with RDKit. Lines that are
    blank or that RDKit cannot parse are ignored.

    Args:
        file: Path to a text file with one SMILES string per line.

    Returns:
        A tuple ``(canonical_smiles, is_qac_count)`` where
        ``canonical_smiles`` is the set of canonical SMILES of all parsable
        lines and ``is_qac_count`` is the number of parsable lines (duplicates
        included) whose raw text contains ``"[N+]"`` or ``"[n+]"``.
    """
    canonical_smiles = set()
    is_qac_count = 0
    with open(file, 'r') as f:
        for line in f:
            smi = line.strip()
            # Skip blank lines explicitly: RDKit is understood to parse an
            # empty string into an atom-less molecule rather than None, which
            # would add "" to the set of unique molecules.
            if not smi:
                continue
            # Parse once and reuse the molecule for canonicalization.
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                continue
            canonical_smiles.add(Chem.MolToSmiles(mol, canonical=True))
            # Substring heuristic on the raw (non-canonical) SMILES text.
            if "[N+]" in smi or "[n+]" in smi:
                is_qac_count += 1
    return canonical_smiles, is_qac_count

# File paths
generated_smiles_file = "qac-0315-clean.txt"
reference_smiles_file = "outputs/set_0-generated-20241230.txt"

# Load canonical SMILES and QAC count
generated_canonical, generated_is_qac = load_canonical_smiles(generated_smiles_file)
reference_canonical, _ = load_canonical_smiles(reference_smiles_file)

# Count every line of the generated file (valid or not) exactly once, and
# close the handle afterwards instead of leaking it on each metric.
with open(generated_smiles_file) as generated_fh:
    generated_total = sum(1 for _line in generated_fh)

# Calculate metrics
# Each ratio falls back to 0.0 when its denominator is zero (empty file or no
# valid molecules) instead of raising ZeroDivisionError.
# Uniqueness: unique canonical molecules over all generated lines.
uniqueness = len(generated_canonical) / generated_total if generated_total else 0.0
# Novelty: fraction of unique generated molecules absent from the reference set.
novelty = (
    len(generated_canonical - reference_canonical) / len(generated_canonical)
    if generated_canonical
    else 0.0
)
# QAC ratio: lines flagged by the "[N+]"/"[n+]" check over all generated lines.
is_qac_ratio = generated_is_qac / generated_total if generated_total else 0.0

# Output results
print(f"Uniqueness: {uniqueness:.4f}")
print(f"Novelty: {novelty:.4f}")
print(f"QAC Ratio: {is_qac_ratio:.4f}")


Uniqueness: 0.9762
Novelty: 0.9965
QAC Ratio: 0.9592
