## Workflow plan

 - Create a list of SMARTS strings for the bond types reported in reference 10.1021/ed085p532 (Table 2).
 - Develop an RDKit-based query tool that checks whether the molecules in the current SDF file contain a specified bond type.
 - If a given bond type is found in a molecule, the constitutive correction for that bond type will be included.
 - Constitutive corrections are added alongside the atomic Pascal constants (procedure for an unknown molecule).

### Materials:
 - [Substructure Filtering in RDKit](https://www.youtube.com/watch?v=Z1PrErlmTGI)
 - [SMARTS notation](https://en.wikipedia.org/wiki/SMILES_arbitrary_target_specification)

## Compounds loader

In [225]:
from pathlib import Path
from typing import Any

import pytest
from rdkit import Chem
from rdkit.Chem import Mol, MolToSmarts, RemoveHs

from src import DIAMAG_COMPOUND_CONSTITUTIVE_CORR_SUBDIR
from src.constants.common_molecules import COMMON_MOLECULES
from src.core.compound import MBCompound
from src.core.molecule import MBMolecule
from src.loader import SDFLoader


### Constitutive corrections for relevant bond types

In [226]:
# Lookup table keyed by a bond-type / ring-system label. Each entry holds:
#   "SMARTS"            - substructure query string for that bond type or ring,
#   "constitutive_corr" - constitutive correction value associated with that entry,
#   "sdf_file"          - name of an SDF file (presumably a reference/test compound for the entry - TODO confirm).
# Several exclusions inside the SMARTS strings exist so that a more specific entry
# (e.g. "C=C-C=C", "R2CCl2") is not additionally counted by a more general one (e.g. "C=C", "C-Cl").
RELEVENT_BOND_TYPES = {
    # There are additional aromatic rings that were not listed in the article by Bain et al. For these rings, constitutive corrections cannot be calculated reliably.
    # SMARTS notations for the bonds C=N, C=C, and N=N must be written explicitly to prevent inclusion of constitutive corrections for these bonds when they are part of an aromatic ring.
    "C=C": {
        # NOTE(review): in SMARTS the primitive `R6` means "member of 6 rings", whereas `r6` means "in a ring of size 6".
        # If the ring exclusions below are meant to skip the cyclohexene C=C, `R6` will practically never match - confirm.
        # NOTE(review): the two ring exclusions place the ring-closure digit `1` on different atoms (1st vs 3rd) - confirm this is intended.
        "SMARTS": "[C;!$([c]);!$([C;R6]1=[C;R6]-[C;R6]-[C;R6]-[C;R6]-[C;R6]1);!$([C]=[C]-[C]=[C]);!$([C;H2]=[C;H1]-[CH2])]=[C;!$([c]);!$([C;R6]=[C;R6]-[C;R6]1-[C;R6]-[C;R6]-[C;R6]1);!$([C]=[C]-[C]=[C]);!$([C;H2]=[C;H1]-[CH2])]",  # C are NOT aromatic - this excludes aromatic rings that are not listed; excludes C=C-C=C, Ar-C=C and H2C=CH-CH2- group; all exclusions must be applied for both atoms
        "constitutive_corr": 5.5,
        "sdf_file": "C2H4.sdf",
    },
    "C#C": {
        "SMARTS": "[C;!$([C]-c);!$([C]([C])#[C]-[C](=O)-[C])]#[C;!$([C]-c);!$([C]([C])#[C]-[C](=O)-[C])]",  # C#C but without aromatic [c] neighbours; excluded RC#C-C(=O)R; all exclusions must be applied for both atoms
        "constitutive_corr": 0.8,
        "sdf_file": "C2H2.sdf",
    },
    "C=C-C=C": {
        "SMARTS": "[C;!$([c])]=[C;!$([c])]-[C;!$([c])]=[C;!$([c])]",  # TODO: what to do with conjugated double bond systems with 3 or more C=C bonds?
        "constitutive_corr": 10.6,
        "sdf_file": "C=C-C=C.sdf",
    },
    "Ar-C#C-Ar": {
        "SMARTS": "[c]-[#6;X2]#[#6;X2]-[c]",
        "constitutive_corr": 3.85,
        "sdf_file": "Ar-C#C-Ar.sdf",
    },
    "CH2=CH-CH2-": {
        "SMARTS": "[C;X3;H2]=[C;X3;H1]-[C;X4;H2]",
        "constitutive_corr": 4.5,
        "sdf_file": "allyl chloride.sdf",
    },
    "C=O": {
        "SMARTS": "[C;X3;!$([C]-[c]);!$([C](=O)[O*]);!$([C](=O)[N*])]=[O;X1]",  # Exclude C=O groups attached to aromatic carbons or bonded to O/N in any form.
        "constitutive_corr": 6.3,
        # NOTE(review): same file name as the "CH2=CH-CH2-" entry - confirm this is the intended file for C=O.
        "sdf_file": "allyl chloride.sdf",
    },
    "COOH": {
        "SMARTS": "[C;X3;!$(C-c)](=O)[$([O;H1;X2]),$([O-;X1])]",  # IMPORTANT! Both COO- and COOH groups will be matched. Loosening of some SMARTS conditions will enhance accuracy of the final molecule diamag. This must be noted in Software's MANUAL.
        "constitutive_corr": -5.0,
        "sdf_file": "COOH.sdf",
    },
    "COOR": {
        "SMARTS": "[C;X3;!$([C]-[c])](=[O])[O]-[#6]",  # Intentional extension for considering RCOOR and RCOOAr bond types. Otherwise, RCOOAr would always be excluded which would decrease final diamag result. This must be noted in Software's MANUAL.
        "constitutive_corr": -5.0,
        "sdf_file": "COOR.sdf",
    },
    "C(=O)NH2": {
        "SMARTS": "[C;X3;!$([C]-[c])](=[O])[$([N;H2]),$([N;H1][#6])]",  # Intentional extension for considering not only RCONH2, but also RCONHR and RCONHAr. This must be noted in Software's MANUAL.
        "constitutive_corr": -3.5,
        # NOTE(review): file name reads "COONH2" while the key is an amide C(=O)NH2 - confirm the file name.
        "sdf_file": "COONH2.sdf",
    },
    "N=N": {
        # NOTE(review): `[#7]` matches aromatic and aliphatic N alike; only the explicit `=` bond primitive restricts the match - confirm the comment below.
        "SMARTS": "[#7]=[#7]",  # N are NOT aromatic - this excludes aromatic rings that are not listed. Intentionally included N=N where N atoms possess negative or positive charge, i.e. azides R-N=N(+)=N(-). This must be noted in Software's MANUAL.
        "constitutive_corr": 1.85,
        "sdf_file": "N=N.sdf",
    },
    "C=N-": {
        "SMARTS": "[C;X2,X3;!$([c])]=[N;!$([c]);!$([N](=[C])-[N]=[C])]",  # N and C are NOT aromatic - this excludes aromatic rings that are not listed. Carbon with two or three neighbours (cases C=C=N- and Any-C=N-); excludes bond type R2C=N-N=CR2
        "constitutive_corr": 8.15,
        "sdf_file": "C=N.sdf",
    },
    "-N#C": {
        "SMARTS": "[N+;X2]#[C-;X1]",  # Covers isocyanide R-N(+)#C(-) resonance structure.
        "constitutive_corr": 0.0,
        "sdf_file": "HCN-Any.sdf",
    },
    "-C#N": {
        "SMARTS": "[C;X2]#[N;X1]",  # sp carbon with two neighbours (including hydrogen) and N with only one neighbour
        "constitutive_corr": 0.8,
        "sdf_file": "HCN.sdf",
    },
    "N=O": {
        "SMARTS": "[N;X2;!$([N+])]=[O;X1]",  # N bonded with two neighbours (including hydrogen; coordination bond [N+][O-] excluded) and O with only one neighbour
        "constitutive_corr": 1.7,
        "sdf_file": "N=O.sdf",
    },
    "-NO2": {
        "SMARTS": "[N+;X3]([O-;X1])=[O;X1]",  # N bonded with 3 neighbours (including hydrogen; coordination bond [N+][O-] included) and two O atoms with only one neighbour
        "constitutive_corr": -2.0,
        "sdf_file": "-NO2.sdf",
    },
    "C-Cl": {
        "SMARTS": "[C;!$([C]([C])([C])([Cl])[Cl]);!$([C;H1]([C])([Cl])[Cl]);!$([C]([C])([C])([Cl])-[C]([C])([C])[Cl])]-[Cl]",  # aliphatic C bonded to Cl, excluding cases R2CCl2, RCHCl2 and Cl-CR2-CR2-Cl
        "constitutive_corr": 3.1,
        # NOTE(review): same file name as the "R2CCl2" entry, whose pattern this SMARTS explicitly excludes - confirm.
        "sdf_file": "R2CCl2.sdf",
    },
    "Cl-CR2CR2-Cl": {
        "SMARTS": "[C;X4]([C])([C])([Cl])-[C;X4]([C])([C])[Cl]",
        "constitutive_corr": 4.3,
        "sdf_file": "R4C2Cl2.sdf",
    },
    "R2CCl2": {
        "SMARTS": "[C;X4]([C])([C])([Cl])[Cl]",
        "constitutive_corr": 1.44,
        "sdf_file": "R2CCl2.sdf",
    },
    "RCHCl2": {
        "SMARTS": "[C;X4;H1]([C])([Cl])[Cl]",
        "constitutive_corr": 6.43,
        "sdf_file": "RCHCl2.sdf",
    },
    "C-Br": {
        "SMARTS": "[C;!$([C]([C])([C])([Br])-[C]([C])([C])[Br])]-[Br]",  # excluding Br-CR2-CR2-Br bond type
        "constitutive_corr": 4.1,
        "sdf_file": "C-Br.sdf",
    },
    "Br-CR2-CR2-Br": {
        "SMARTS": "[C;X4]([C])([C])([Br])-[C;X4]([C])([C])[Br]",
        "constitutive_corr": 6.24,
        "sdf_file": "Br-CR2-CR2-Br.sdf",
    },
    "C-I": {
        "SMARTS": "[C;!c]-[I]",
        "constitutive_corr": 4.1,
        "sdf_file": "C-I.sdf",
    },
    # Corrections for substituents attached to an aromatic carbon ([c]).
    "Ar-OH": {
        "SMARTS": "[c]-[O;X2;H1]",
        "constitutive_corr": -1,
        "sdf_file": "Ph-OH.sdf",
    },
    "Ar-NR2": {
        "SMARTS": "[c]-[N;X3]([C])[C]",
        "constitutive_corr": 1,
        "sdf_file": "Ar-NR2.sdf",
    },
    "Ar-C(=O)R": {
        "SMARTS": "[c]-[C;X3](=[O])[C]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-COR.sdf",
    },
    "Ar-COOR": {
        "SMARTS": "[c]-[C;X3](=[O])[O]-[C]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-COOR.sdf",
    },
    "Ar-C=C": {
        "SMARTS": "[c]-[C;X3]=[C]",
        "constitutive_corr": -1.00,
        "sdf_file": "Ar-C=C.sdf",
    },
    "Ar-C#C": {
        "SMARTS": "[c]-[C;X2]#[C;X2;!$([C]-[c])]",  # excludes Ar-C#C-Ar bond type
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-C#C.sdf",
    },
    "Ar-OR": {
        "SMARTS": "[c]-[O]-[C]",
        "constitutive_corr": -1,
        "sdf_file": "Ar-OR.sdf",
    },
    "Ar-CHO": {
        "SMARTS": "[c]-[C;X3;H1]=[O]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-CHO.sdf",
    },
    "Ar-Ar": {
        "SMARTS": "[c]-[c]",  # RDKit recognizes that within an aromatic ring there is no single bond c-c.
        "constitutive_corr": -0.5,
        # NOTE(review): same file name as the "Ar-CHO" entry - confirm this is the intended file for Ar-Ar.
        "sdf_file": "Ar-CHO.sdf",
    },
    "Ar-NO2": {
        "SMARTS": "[c]-[N+;X3](=[O;X1])[O-;X1]",
        "constitutive_corr": -0.5,
        "sdf_file": "Ar-NO2.sdf",
    },
    "Ar-Br": {
        "SMARTS": "[c]-[Br]",
        "constitutive_corr": -3.5,
        "sdf_file": "Ar-Br.sdf",
    },
    "Ar-Cl": {
        "SMARTS": "[c]-[Cl]",
        "constitutive_corr": -2.5,
        "sdf_file": "Ar-Cl.sdf",
    },
    "Ar-I": {
        "SMARTS": "[c]-[I]",
        "constitutive_corr": -3.5,
        "sdf_file": "Ar-I.sdf",
    },
    "Ar-COOH": {
        "SMARTS": "[c]-[C;X3](=[O])[O;X2;H1]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-COOH.sdf",
    },
    "Ar-C(=O)NH2": {
        "SMARTS": "[c]-[C;X3](=[O])[N;X3;H2]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-CONH2.sdf",
    },
    "R2C=N-N=CR2": {
        "SMARTS": "[C;X3]([C])([C])=[N;X2]-[N;X2]=[C;X3]([C])[C]",
        "constitutive_corr": 10.2,
        "sdf_file": "R2C=N-N=CR2.sdf",
    },
    "RC#C-C(=O)R": {
        "SMARTS": "[C;X2]([C])#[C;X2]-[C;X3](=[O])[C]",
        # NOTE(review): identical to the value of the preceding "R2C=N-N=CR2" entry - possible copy-paste; verify against Table 2 of the reference.
        "constitutive_corr": 10.2,
        "sdf_file": "RC#C-COR.sdf",
    },
    # Ring systems.
    # TODO: What about fused-ring systems?
    # NOTE: Provided SMARTS take into account any substituent on any carbon atom and have a different meaning than corresponding SMILES.
    # NOTE: The cyclohexadiene ring was excluded for simplicity of the SMARTS notation. The constitutive corr for this ring is very close to the sum for two C=C bonds.
    "benzene": {
        "SMARTS": "c1ccccc1",
        "constitutive_corr": -1.4,
        "sdf_file": "benzene.sdf",
    },
    "cyclobutane": {
        "SMARTS": "C1CCC1",
        "constitutive_corr": 7.2,
        "sdf_file": "cyclobutane.sdf",
    },
    "cyclohexane": {
        "SMARTS": "C1CCCCC1",
        "constitutive_corr": 3.0,
        "sdf_file": "cyclohexane.sdf",
    },
    "cyclohexene": {
        "SMARTS": "C1CC=CCC1",
        "constitutive_corr": 6.9,
        "sdf_file": "cyclohexene.sdf",
    },
    "cyclopentane": {
        "SMARTS": "C1CCCC1",
        "constitutive_corr": 0.0,
        "sdf_file": "cyclopentane.sdf",
    },
    "cyclopropane": {
        "SMARTS": "C1CC1",
        "constitutive_corr": 7.2,
        "sdf_file": "cyclopropane.sdf",
    },
    "dioxane": {
        "SMARTS": "[$(O1CCCCO1),$(O1CCOCC1),$(O1CCCOC1)]",  # consider all dioxane isomers (1,2-dioxane, 1,3-dioxane and 1,4-dioxane). This must be noted in Software's MANUAL.
        "constitutive_corr": 5.5,
        "sdf_file": "1,4-dioxane.sdf",
    },
    "furan": {
        "SMARTS": "o1cccc1",
        "constitutive_corr": -2.5,
        "sdf_file": "furan.sdf",
    },
    "imidazole": {
        "SMARTS": "n1cncc1",  # all imidazole protonation states are included. This must be noted in Software's MANUAL.
        "constitutive_corr": 8.0,
        "sdf_file": "imidazole.sdf",
    },
    "isoxazole": {
        "SMARTS": "[$(o1nccc1),$(o1cncc1)]",
        "constitutive_corr": 1.0,  # Assumes the same constant for isoxazole and oxazole rings. This must be noted in Software's MANUAL.
        "sdf_file": "isoxazole.sdf",
    },
    "morpholine": {
        "SMARTS": "[$(O1CCNCC1),$(O1NCCCC1),$(O1CNCCC1)]",
        "constitutive_corr": 5.5,  # Assumes the same constant for all 6-membered morpholine isomers. This must be noted in Software's MANUAL.
        "sdf_file": "1,4-morpholine.sdf",
    },
    "piperazine": {
        "SMARTS": "[$(N1CCNCC1),$(N1CNCCC1),$(N1NCCCC1),$([N+]1CCNCC1),$([N+]1CNCCC1),$([N+]1NCCCC1),$(N1CC[N+]CC1),$(N1C[N+]CCC1),$(N1[N+]CCCC1),$([N-]1CCNCC1),$([N-]1CNCCC1),$([N-]1NCCCC1),$(N1CC[N-]CC1),$(N1C[N-]CCC1),$(N1[N-]CCCC1)]",  # all isomers of piperazine and their protonated forms are assumed to have the same constant. This must be noted in Software's MANUAL.
        "constitutive_corr": 7.0,
        "sdf_file": "piperazine.sdf",
    },
    "piperidine": {
        "SMARTS": "[$(N1CCCCC1),$([N-]1CCCCC1),$([N+]1CCCCC1)]",  # Assumes the same constant for piperidine at different protonation states. This must be noted in Software's MANUAL.
        "constitutive_corr": 3.0,
        "sdf_file": "piperidine.sdf",
    },
    "pyrazine": {
        "SMARTS": "[$(n1ccncc1),$([n+]1ccncc1),$(n1cc[n+]cc1)]",  # Assumes the same constant for pyrazine at different protonation states. This must be noted in Software's MANUAL.
        "constitutive_corr": 9.0,
        # NOTE(review): same file name as the "piperidine" entry - confirm this is the intended file for pyrazine.
        "sdf_file": "piperidine.sdf",
    },
    "pyrimidine": {
        "SMARTS": "[$(n1cnccc1),$([n+]1cnccc1),$(n1c[n+]ccc1)]",  # Assumes the same constant for pyrimidine at different protonation states. This must be noted in Software's MANUAL.
        "constitutive_corr": 6.5,
        "sdf_file": "pyrimidine.sdf",
    },
    "pyridine": {
        "SMARTS": "[$(n1ccccc1),$([n+]1ccccc1)]",  # Assumes the same constant for pyridine at different protonation states. This must be noted in Software's MANUAL.
        "constitutive_corr": 0.5,
        "sdf_file": "pyridine.sdf",
    },
    "pyrones": {
        "SMARTS": "[$([#8]=[#6]1:[#6]:[#6]:[#6]:[#6]:[#8]:1),$([#8]=[#6]1:[#6]:[#6]:[#8]:[#6]:[#6]:1)]",  # According to the Bain et al. article, the same value applies to alpha- and gamma-pyrone isomers. This must be noted in Software's MANUAL.
        "constitutive_corr": -1.4,
        "sdf_file": "gamma-pyrone.sdf",
    },
    "pyrrole": {
        "SMARTS": "[$(n1cccc1),$([n-]1cccc1)]",  # Assumes the same constant for pyrrole at different protonation states. This must be noted in Software's MANUAL.
        "constitutive_corr": -3.5,
        "sdf_file": "pyrrole.sdf",
    },
    "pyrrolidine": {
        "SMARTS": "[$(N1CCCC1),$([N+]1CCCC1)]",  # Assumes the same constant for pyrrolidine at different protonation states. This must be noted in Software's MANUAL.
        "constitutive_corr": 0.0,
        "sdf_file": "pyrrolidine.sdf",
    },
    "tetrahydrofuran": {
        "SMARTS": "O1CCCC1",
        "constitutive_corr": 0.0,
        "sdf_file": "tetrahydrofuran.sdf",
    },
    "thiazoles": {
        "SMARTS": "[$(n1cscc1),$(n1sccc1),$([n+]1cscc1),$([n+]1sccc1)]",  # Assumes the same constant for thiazole and isothiazole at their different protonation states. This must be noted in Software's MANUAL.
        "constitutive_corr": -3.0,
        "sdf_file": "thiazole.sdf",
    },
    "thiophene": {
        "SMARTS": "s1cccc1",
        "constitutive_corr": -7.0,
        "sdf_file": "thiophene.sdf",
    },
    "triazine": {
        "SMARTS": "[$(n1nnccc1),$(n1nccnc1),$(n1cncnc1),$([nH+]1nnccc1),$(n1[nH+]nccc1),$(n1n[nH+]ccc1),$([nH+]1nccnc1),$(n1[nH+]ccnc1),$(n1ncc[nH+]c1),$([nH+]1cncnc1),$(n1c[nH+]cnc1),$(n1cnc[nH+]c1)]",  # Assumes the same constant for three triazine isomers and their monoprotonated states. This must be noted in Software's MANUAL.
        "constitutive_corr": -1.4,
        "sdf_file": "1,2,3-triazine.sdf",
    },
}

In [227]:
import json

# Load one compound from the constitutive-correction SDF directory, take its
# first molecule as an RDKit Mol and pass it through RemoveHs.
compound: MBCompound = SDFLoader.Load(
    "1,2,3-triazine.sdf",
    subdir=DIAMAG_COMPOUND_CONSTITUTIVE_CORR_SUBDIR,
)
mol = RemoveHs(compound.GetMols(to_rdkit=True)[0])
print(MolToSmarts(mol, isomericSmiles=True))

# For every table entry, report whether its SMARTS pattern occurs in the molecule.
for idx, (key, rbt) in enumerate(RELEVENT_BOND_TYPES.items()):
    pattern = Chem.MolFromSmarts(rbt["SMARTS"])
    if not mol.HasSubstructMatch(pattern):
        print(f"{idx}: Didn't match: {key}")
        continue
    # Matching entries are echoed in full so the applied correction is visible.
    print(f'{idx}: Match: "{key}": {json.dumps(rbt, indent=4)}')

[#7]1:[#7]:[#6]:[#6]:[#6]:[#7]:1
0: Didn't match: C=C
1: Didn't match: C#C
2: Didn't match: C=C-C=C
3: Didn't match: Ar-C#C-Ar
4: Didn't match: CH2=CH-CH2-
5: Didn't match: C=O
6: Didn't match: COOH
7: Didn't match: COOR
8: Didn't match: C(=O)NH2
9: Didn't match: N=N
10: Didn't match: C=N-
11: Didn't match: -N#C
12: Didn't match: -C#N
13: Didn't match: N=O
14: Didn't match: -NO2
15: Didn't match: C-Cl
16: Didn't match: Cl-CR2CR2-Cl
17: Didn't match: R2CCl2
18: Didn't match: RCHCl2
19: Didn't match: C-Br
20: Didn't match: Br-CR2-CR2-Br
21: Didn't match: C-I
22: Didn't match: Ar-OH
23: Didn't match: Ar-NR2
24: Didn't match: Ar-C(=O)R
25: Didn't match: Ar-COOR
26: Didn't match: Ar-C=C
27: Didn't match: Ar-C#C
28: Didn't match: Ar-OR
29: Didn't match: Ar-CHO
30: Didn't match: Ar-Ar
31: Didn't match: Ar-NO2
32: Didn't match: Ar-Br
33: Didn't match: Ar-Cl
34: Didn't match: Ar-I
35: Didn't match: Ar-COOH
36: Didn't match: Ar-C(=O)NH2
37: Didn't match: R2C=N-N=CR2
38: Didn't match: RC#C-C(=O)R