[RDKit](https://www.rdkit.org/)
---

This is a popular package for cheminformatics calculations and manipulations. The [documentation](https://rdkit.org/docs/index.html) contains many examples, recipes, and links to other resources. A full history is given [here](https://rdkit.org/docs/Overview.html#open-source-toolkit-for-cheminformatics). Due to its popularity, many other open source tools use RDKit and related/compatible data structures.

The [RDKit book](https://rdkit.org/docs/RDKit_Book.html) explains conventions, approaches, and features.

These notes focus on using RDKit in Python and cover only portions of the toolkit. The [cookbook](https://rdkit.org/docs/Cookbook.html) has a lot of recipes, many of which are used below. RDKit provides attribution for these original examples, which may be modified below.

In [None]:
!pip install rdkit

In [None]:
import rdkit 
rdkit.__version__

# The Basics

Two basic languages are used throughout: [SMILES](https://www.daylight.com/dayhtml/doc/theory/theory.smiles.html), which describes molecules, and [SMARTS](https://www.daylight.com/dayhtml/doc/theory/theory.smarts.html), which describes molecular patterns (substructure queries).


# Visualization

## Drawing a molecule

### Option 1: Map index to atom location directly

In [None]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
IPythonConsole.ipython_useSVG=True  #< set this to False if you want PNGs instead of SVGs

In [None]:
def mol_with_atom_index(mol):
    """Label every atom of ``mol`` with its own index and return ``mol``.

    Each atom's atom-map number is set to the value of ``atom.GetIdx()``.
    The molecule is modified in place; the same object is returned so the
    call can be used directly as a cell's display expression.

    NOTE(review): RDKit is understood to treat map number 0 as "no map",
    so atom 0 may be drawn without a label - confirm.
    """
    atoms = mol.GetAtoms()
    for current in atoms:
        index = current.GetIdx()
        current.SetAtomMapNum(index)
    return mol

In [None]:
# Test on a kinase inhibitor
mol = Chem.MolFromSmiles("C1CC2=C3C(=CC=C2)C(=CN3C1)[C@H]4[C@@H](C(=O)NC4=O)C5=CNC6=CC=CC=C65")
# Default
mol

In [None]:
# With atom index
mol_with_atom_index(mol)

### Option 2: Using Jupyter display settings directly (better)

In [None]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
IPythonConsole.drawOptions.addAtomIndices = True
IPythonConsole.molSize = 300,300

In [None]:
mol = Chem.MolFromSmiles("C1CC2=C3C(=CC=C2)C(=CN3C1)[C@H]4[C@@H](C(=O)NC4=O)C5=CNC6=CC=CC=C65")
mol

In [None]:
IPythonConsole.drawOptions.addAtomIndices = False

## Display calculated features and annotations

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.molSize = 250,250

In [None]:
m = Chem.MolFromSmiles('c1ncncc1C(=O)[O-]')
AllChem.ComputeGasteigerCharges(m)
m

In [None]:
# The 'atomNote' property lets you add arbitrary annotations
# Work on m2, a Mol constructed from m, so the notes are set on m2's atoms.
m2 = Chem.Mol(m)
for at in m2.GetAtoms():
    # Read the per-atom "_GasteigerCharge" value and format it to two decimals.
    # NOTE(review): presumably populated by the earlier
    # AllChem.ComputeGasteigerCharges(m) call - confirm.
    lbl = '%.2f'%(at.GetDoubleProp("_GasteigerCharge"))
    # Store the formatted charge as the atom's 'atomNote' string property.
    at.SetProp('atomNote',lbl)
# Trailing expression: the notebook displays m2 as the cell output.
m2

In [None]:
# You can combine this with other annotations - by default they are separated by a comma
IPythonConsole.drawOptions.addAtomIndices = True
m2

In [None]:
IPythonConsole.drawOptions.addAtomIndices = False

## Stereochemistry

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.drawOptions.addAtomIndices = False
IPythonConsole.drawOptions.addStereoAnnotation = True

In [None]:
# Default Representation uses legacy FindMolChiralCenters() code
m1 = Chem.MolFromSmiles('C1CC1[C@H](F)C1CCC1')
m2 = Chem.MolFromSmiles('F[C@H]1CC[C@H](O)CC1')
Draw.MolsToGridImage((m1,m2), subImgSize=(250,250))

In [None]:
# new stereochemistry code with more accurate CIP labels, 2020.09 release
from rdkit.Chem import rdCIPLabeler
rdCIPLabeler.AssignCIPLabels(m1)
rdCIPLabeler.AssignCIPLabels(m2)
Draw.MolsToGridImage((m1,m2), subImgSize=(250,250))

## Highlighting substructures

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.drawOptions.addAtomIndices = True

In [None]:
m = Chem.MolFromSmiles('c1cc(C(=O)O)c(OC(=O)C)cc1')
substructure = Chem.MolFromSmarts('C(=O)O')

In [None]:
Draw.MolsToGridImage((m, substructure))

In [None]:
print(m.GetSubstructMatches(substructure))
?m.GetSubstructMatches
# Returns tuples of the indices of the molecule's atoms that match a substructure query.

In [None]:
m

In [None]:
# you can also manually set the atoms that should be highlighted:
m.__sssAtoms = [0,1,2,6,11,12]
m

In [None]:
IPythonConsole.drawOptions.addAtomIndices = False

## Highlighting differences between molecules

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdFMCS
from rdkit.Chem.Draw import rdDepictor
rdDepictor.SetPreferCoordGen(True)
IPythonConsole.drawOptions.minFontSize=20

In [None]:
mol1 = Chem.MolFromSmiles('FC1=CC=C2C(=C1)C=NN2')
mol2 = Chem.MolFromSmiles('CCC1=C2NN=CC2=CC(Cl)=C1')

In [None]:
Draw.MolsToGridImage([mol1, mol2])

In [None]:
# This function will find the maximum common substructure
# See documentation for more details: https://rdkit.org/docs/GettingStartedInPython.html#maximum-common-substructure

def _unmatched_atom_indices(mol, query):
    """Return the indices of atoms in ``mol`` that are not part of the
    match returned by ``mol.GetSubstructMatch(query)``, in atom order."""
    # A set gives O(1) membership tests instead of scanning the match tuple
    # once per atom.
    matched = set(mol.GetSubstructMatch(query))
    return [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetIdx() not in matched]


def view_difference(mol1, mol2):
    """Draw two molecules side by side, highlighting where they differ.

    The maximum common substructure (MCS) of ``mol1`` and ``mol2`` is found
    with ``rdFMCS.FindMCS`` and turned into a query molecule from its SMARTS
    string. In each molecule, every atom that is *not* part of the
    substructure match against that query is highlighted.

    Args:
        mol1: First RDKit molecule.
        mol2: Second RDKit molecule.

    Returns:
        The grid image produced by ``Draw.MolsToGridImage`` for
        ``[mol1, mol2]`` with the non-matching atoms highlighted.
    """
    mcs = rdFMCS.FindMCS([mol1, mol2])
    mcs_mol = Chem.MolFromSmarts(mcs.smartsString)

    # One highlight list per molecule, in the same order as the drawn molecules.
    highlight_lists = [_unmatched_atom_indices(m, mcs_mol) for m in (mol1, mol2)]

    return Draw.MolsToGridImage([mol1, mol2], highlightAtomLists=highlight_lists)

In [None]:
view_difference(mol1,mol2)

In [None]:
rdDepictor.SetPreferCoordGen(False)

## Functional group abbreviations

In [None]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import rdAbbreviations

In [None]:
m = Chem.MolFromSmiles('COc1ccc(C(=O)[O-])cc1')
m

In [None]:
abbrevs = rdAbbreviations.GetDefaultAbbreviations()
nm = rdAbbreviations.CondenseMolAbbreviations(m,abbrevs)
nm

In [None]:
# abbreviations that cover more than 40% of the molecule won't be applied by default
m = Chem.MolFromSmiles('c1c[nH]cc1C(F)(F)F')
nm1 = rdAbbreviations.CondenseMolAbbreviations(m,abbrevs)
nm2 = rdAbbreviations.CondenseMolAbbreviations(m,abbrevs,maxCoverage=0.8)
Draw.MolsToGridImage((m,nm1,nm2),legends=('','default','maxCoverage=0.8'))

In [None]:
# See available abbreviations
abbrevs = rdAbbreviations.GetDefaultAbbreviations()
for a in abbrevs:
    print(a.label)

## Representing large molecules

> "Some molecules like macrocycles are not represented well using the default RDKit drawing code. As a result, it may be preferable to use the CoordGen integration."

You can set this preference globally as follows:
~~~python
from rdkit.Chem.Draw import rdDepictor
rdDepictor.SetPreferCoordGen(True)
~~~

In [None]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.molSize = 350,300
from rdkit.Chem import Draw

In [None]:
# default drawing
mol = Chem.MolFromSmiles("C/C=C/CC(C)C(O)C1C(=O)NC(CC)C(=O)N(C)CC(=O)N(C)C(CC(C)C)C(=O)NC(C(C)C)C(=O)N(C)C(CC(C)C)C(=O)NC(C)C(=O)NC(C)C(=O)N(C)C(CC(C)C)C(=O)N(C)C(CC(C)C)C(=O)N(C)C(C(C)C)C(=O)N1C")
mol

In [None]:
from rdkit.Chem import rdCoordGen
rdCoordGen.AddCoords(mol)
mol

## Integrating with matplotlib

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

In [None]:
# Sample x at the integers 0..179 and evaluate the sine at each point; the
# curve is only used as a backdrop for the molecule image below.
# NOTE(review): np.sin interprets its argument as radians. If x is meant to
# be degrees (the 0-180 range suggests so), this should go through
# np.deg2rad(x) - confirm intent.
x = np.arange(0, 180, 1)
y = np.sin(x)

In [None]:
mol = Chem.MolFromSmiles('C1CNCCC1C(=O)C')
im = Chem.Draw.MolToImage(mol) # Convert to image

In [None]:
type(im)

In [None]:
# Draw the curve on a 10x5 figure; the y-limit is extended to 5, well above
# the curve, which leaves room in the upper part of the plot.
fig = plt.figure(figsize=(10,5))
plt.plot(x, y)
plt.ylim(-1, 5)
# Add a second axes on the same figure to hold the molecule image. The list
# is the axes rectangle [left, bottom, width, height] in figure fractions.
ax = plt.axes([0.6, 0.47, 0.38, 0.38], frameon=True)
ax.imshow(im) # Use imshow to display
# Hide the ticks and frame of the image axes so only the molecule is shown.
ax.axis('off')

# Identifying features

## Counting ring systems

# Manipulating molecules

DeepChem
---

https://deepchem.readthedocs.io/en/latest/index.html#