## Workflow plan

 - Create a list of SMARTS strings for the bond types reported in reference 10.1021/ed085p532 (Table 2).
 - Develop an RDKit-based query tool that checks whether the current SDF file contains molecules with a specified bond type.
 - If a given bond type is found in the molecule, the constitutive correction for that species will be included.
 - Constitutive corrections are added alongside the atomic Pascal constants (procedure for an unknown molecule).

### Materials:
 - [Substructure Filtering in RDKit](https://www.youtube.com/watch?v=Z1PrErlmTGI)
 - [SMARTS notation](https://en.wikipedia.org/wiki/SMILES_arbitrary_target_specification)

## Compounds loader

In [241]:
from pathlib import Path
from typing import Any

import pytest
from rdkit import Chem
from rdkit.Chem import Mol, MolToSmarts, RemoveHs

from src import DIAMAG_COMPOUND_CONSTITUTIVE_CORR_SUBDIR
from src.constants.common_molecules import COMMON_MOLECULES
from src.core.compound import MBCompound
from src.core.molecule import MBMolecule
from src.loader import SDFLoader


In [None]:
# Load a single compound from the constitutive-corrections SDF subdirectory.
compound: MBCompound = SDFLoader.Load(
    "allyl_chloride2.sdf",
    subdir=DIAMAG_COMPOUND_CONSTITUTIVE_CORR_SUBDIR,
)
# TODO: Read all SDF files in a loop instead of one hard-coded file.

# RDKit molecules after RemoveHs; kept in a list for later substructure queries.
mols_list = [RemoveHs(mol) for mol in compound.GetMols(to_rdkit=True)]

# Print the SMARTS string RDKit generates for each molecule.
for mol in mols_list:
    print(MolToSmarts(mol, isomericSmiles=True))

[#6]=[#6](-[#6]-[#17])-[#6]


### Constitutive corrections for relevant bond types

In [None]:
# Lookup table of bond types that carry a constitutive correction.
#
# Each key is a human-readable bond-type label; each value is a dict with:
#   "SMARTS"            - SMARTS query used to detect the bond type in a molecule
#   "constitutive_corr" - constitutive correction value for that bond type
#                         (taken from Table 2 of 10.1021/ed085p532;
#                         presumably in units of 1e-6 emu/mol - TODO confirm)
#   "sdf_file"          - name of an SDF file associated with the bond type
#                         (presumably a test/example structure - TODO confirm)
RELEVENT_BOND_TYPES = {
    "C=C": {
        # Excludes C=C-C=C and the H2C=CH-CH2- group; the exclusion must be
        # applied to both atoms of the double bond.
        # NOTE(review): the original comment also claimed Ar-C=C is excluded,
        # but the SMARTS contains no aromatic-neighbour exclusion - confirm intent.
        "SMARTS": "[C;!$([C]=[C]-[C]=[C]);!$([C;H2]=[C;H1]-[CH2])]=[C;!$([C]=[C]-[C]=[C]);!$([C;H2]=[C;H1]-[CH2])]",
        "constitutive_corr": 5.5,
        "sdf_file": "C2H4.sdf",
    },
    "C#C": {
        # C#C without aromatic [c] neighbours; RC#C-C(=O)R is excluded;
        # the exclusion must be applied to both atoms of the triple bond.
        "SMARTS": "[C;!$([C]-c);!$([C]([C])#[C]-[C](=O)-[C])]#[C;!$([C]-c);!$([C]([C])#[C]-[C](=O)-[C])]",
        "constitutive_corr": 0.8,
        "sdf_file": "C2H2.sdf",
    },
    "C=C-C=C": {
        # TODO: what to do with conjugated double bond systems with 3 or more C=C bonds?
        "SMARTS": "[C]=[C]-[C]=[C]",
        "constitutive_corr": 10.6,
        "sdf_file": "C=C-C=C.sdf",
    },
    # TODO: Check again SMARTS notations below
    "Ar-C#C-Ar": {
        "SMARTS": "[c]-[C]#[C]-[c]",
        "constitutive_corr": 3.85,
        # NOTE(review): same file as the "C=C" entry - looks like a placeholder; confirm.
        "sdf_file": "C2H4.sdf",
    },
    "CH2=CH-CH2-": {
        "SMARTS": "[C;X3;H2]=[C;X3;H1]-[C;X4;H2]",
        "constitutive_corr": 4.5,
        "sdf_file": "allyl chloride.sdf",
    },
    "C=O": {
        # TODO: Finish
        "SMARTS": "[C;X3;!$([C]-c);!$([C]-[N,O])]=O",
        "constitutive_corr": 6.3,
        # NOTE(review): same file as the "CH2=CH-CH2-" entry - looks like a placeholder; confirm.
        "sdf_file": "allyl chloride.sdf",
    },
    "COOH": {
        "SMARTS": "[C;X3;!$(C-c)](=O)[O;H]",
        "constitutive_corr": -5.0,
        "sdf_file": "COOH.sdf",
    },
    "COOR": {
        "SMARTS": "[C;X3;!$(C-c)](=O)O[C]",
        "constitutive_corr": -5.0,
        "sdf_file": "COOR.sdf",
    },
    "C(=O)NH2": {
        "SMARTS": "[C;X3;!$(C-c)](=O)[N;H2]",
        "constitutive_corr": -3.5,
        "sdf_file": "COONH2.sdf",
    },
    "N=N": {
        "SMARTS": "N=N",
        "constitutive_corr": 1.85,
        "sdf_file": "N=N.sdf",
    },
    "C=N-": {
        # Carbon with two or three neighbours (cases C=C=N- and Any-C=N-);
        # excludes bond type R2C=N-N=CR2.
        "SMARTS": "[C;X2,X3]=[N;!$([N](=[C])-[N]=[C])]",
        "constitutive_corr": 8.15,
        "sdf_file": "C=N.sdf",
    },
    "-N#C": {
        # Covers the isocyanide R-N(+)#C(-) resonance structure.
        "SMARTS": "[N+;X2]#[C-;X1]",
        "constitutive_corr": 0.0,
        "sdf_file": "HCN-Any.sdf",
    },
    "-C#N": {
        # sp carbon with two neighbours (including hydrogen) and N with only one neighbour.
        "SMARTS": "[C;X2]#[N;X1]",
        "constitutive_corr": 0.8,
        "sdf_file": "HCN.sdf",
    },
    "N=O": {
        # N bonded to two neighbours (including hydrogen; coordination bond
        # [N+][O-] excluded) and O with only one neighbour.
        "SMARTS": "[N;X2;!$([N+])]=[O;X1]",
        "constitutive_corr": 1.7,
        "sdf_file": "N=O.sdf",
    },
    "-NO2": {
        # N bonded to three neighbours (including hydrogen; coordination bond
        # [N+][O-] included) and two O atoms with only one neighbour each.
        "SMARTS": "[N+;X3]([O-;X1])=[O;X1]",
        "constitutive_corr": -2.0,
        "sdf_file": "-NO2.sdf",
    },
    "C-Cl": {
        # Aliphatic C bonded to Cl, excluding cases R2CCl2, RCHCl2 and Cl-CR2-CR2-Cl.
        "SMARTS": "[C;!$([C]([C])([C])([Cl])[Cl]);!$([C;H1]([C])([Cl])[Cl]);!$([C]([C])([C])([Cl])-[C]([C])([C])[Cl])]-[Cl]",
        "constitutive_corr": 3.1,
        # NOTE(review): same file as the "R2CCl2" entry - confirm this is intended.
        "sdf_file": "R2CCl2.sdf",
    },
    "Cl-CR2CR2-Cl": {
        "SMARTS": "[C;X4]([C])([C])([Cl])-[C;X4]([C])([C])[Cl]",
        "constitutive_corr": 4.3,
        "sdf_file": "R4C2Cl2.sdf",
    },
    "R2CCl2": {
        "SMARTS": "[C;X4]([C])([C])([Cl])[Cl]",
        "constitutive_corr": 1.44,
        "sdf_file": "R2CCl2.sdf",
    },
    "RCHCl2": {
        "SMARTS": "[C;X4;H1]([C])([Cl])[Cl]",
        "constitutive_corr": 6.43,
        "sdf_file": "RCHCl2.sdf",
    },
    "C-Br": {
        # Excludes the Br-CR2-CR2-Br bond type.
        "SMARTS": "[C;!$([C]([C])([C])([Br])-[C]([C])([C])[Br])]-[Br]",
        "constitutive_corr": 4.1,
        "sdf_file": "C-Br.sdf",
    },
    "Br-CR2-CR2-Br": {
        "SMARTS": "[C;X4]([C])([C])([Br])-[C;X4]([C])([C])[Br]",
        "constitutive_corr": 6.24,
        "sdf_file": "Br-CR2-CR2-Br.sdf",
    },
    "C-I": {
        "SMARTS": "[C;!c]-[I]",
        "constitutive_corr": 4.1,
        "sdf_file": "C-I.sdf",
    },
    "Ar-OH": {
        "SMARTS": "[c]-[O;X2;H1]",
        "constitutive_corr": -1,
        "sdf_file": "Ph-OH.sdf",
    },
    "Ar-NR2": {
        "SMARTS": "[c]-[N;X3]([C])[C]",
        "constitutive_corr": 1,
        "sdf_file": "Ar-NR2.sdf",
    },
    "Ar-C(=O)R": {
        "SMARTS": "[c]-[C;X3](=[O])[C]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-COR.sdf",
    },
    "Ar-COOR": {
        "SMARTS": "[c]-[C;X3](=[O])[O]-[C]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-COOR.sdf",
    },
    "Ar-C=C": {
        "SMARTS": "[c]-[C;X3]=[C]",
        "constitutive_corr": -1.00,
        "sdf_file": "Ar-C=C.sdf",
    },
    "Ar-C#C": {
        # Excludes the Ar-C#C-Ar bond type.
        "SMARTS": "[c]-[C;X2]#[C;X2;!$([C]-[c])]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-C#C.sdf",
    },
    "Ar-OR": {
        "SMARTS": "[c]-[O]-[C]",
        "constitutive_corr": -1,
        "sdf_file": "Ar-OR.sdf",
    },
    "Ar-CHO": {
        "SMARTS": "[c]-[C;X3;H1]=[O]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-CHO.sdf",
    },
    "Ar-Ar": {
        # RDKit recognizes that there is no single c-c bond within an aromatic ring.
        "SMARTS": "[c]-[c]",
        "constitutive_corr": -0.5,
        # NOTE(review): same file as the "Ar-CHO" entry - looks like a placeholder; confirm.
        "sdf_file": "Ar-CHO.sdf",
    },
    "Ar-NO2": {
        "SMARTS": "[c]-[N+;X3](=[O;X1])[O-;X1]",
        "constitutive_corr": -0.5,
        "sdf_file": "Ar-NO2.sdf",
    },
    "Ar-Br": {
        "SMARTS": "[c]-[Br]",
        "constitutive_corr": -3.5,
        "sdf_file": "Ar-Br.sdf",
    },
    "Ar-Cl": {
        "SMARTS": "[c]-[Cl]",
        "constitutive_corr": -2.5,
        "sdf_file": "Ar-Cl.sdf",
    },
    "Ar-I": {
        "SMARTS": "[c]-[I]",
        "constitutive_corr": -3.5,
        "sdf_file": "Ar-I.sdf",
    },
    "Ar-COOH": {
        "SMARTS": "[c]-[C;X3](=[O])[O;X2;H1]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-COOH.sdf",
    },
    "Ar-C(=O)NH2": {
        "SMARTS": "[c]-[C;X3](=[O])[N;X3;H2]",
        "constitutive_corr": -1.5,
        "sdf_file": "Ar-CONH2.sdf",
    },
    "R2C=N-N=CR2": {
        "SMARTS": "[C;X3]([C])([C])=[N;X2]-[N;X2]=[C;X3]([C])[C]",
        "constitutive_corr": 10.2,
        "sdf_file": "R2C=N-N=CR2.sdf",
    },
    "RC#C-C(=O)R": {
        "SMARTS": "[C;X2]([C])#[C;X2]-[C;X3](=[O])[C]",
        # Table 2 of the reference lists 0.8 for RC#C-C(=O)R; the previous
        # value (10.2) duplicated the R2C=N-N=CR2 entry above.
        "constitutive_corr": 0.8,
        "sdf_file": "RC#C-COR.sdf",
    },
    # TODO: Add rings
}

In [None]:
# TODO: Extend this into a loop that checks every bond type from the dictionary.
# Compile the SMARTS query for the CH2=CH-CH2- fragment.
pattern = Chem.MolFromSmarts("[C;X3;H2]=[C;X3;H1]-[C;X4;H2]")

# Print "True" or "False" for each loaded molecule, depending on whether the query matches.
for mol in mols_list:
    print("True" if mol.HasSubstructMatch(pattern) else "False")

False
