# SMILearn vs TOX21
Evaluation of toxicity prediction models based on SMILearn vectors

## Prerequisites

### Installations (Google Colab)
This is an example of installation steps that allow this notebook to work properly in the Google Colab environment.  
In other cases, it is recommended to follow the instructions in the `README.md` file and skip the cell below.

In [None]:
# Save the output as `installation_log` variable
%%capture installation_log

# Install Miniconda
!wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!bash Miniconda3-latest-Linux-x86_64.sh -bfp /usr/local
!rm Miniconda3-latest-Linux-x86_64.sh

# Install RDKit
!conda install -qyc conda-forge rdkit

# Install DeepSMILES unofficial GitHub fork
!pip install git+git://github.com/mateuszrezler/deepsmiles@master

# Update system path
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages')

### Imports

In [None]:
import deepsmiles
import numpy as np
import pandas as pd
import re
from rdkit import Chem
from rdkit.Chem import rdchem

## Classes and functions

In [None]:
class SMIList(list):

    """SMILES as list of rdchem.Atom objects and other chars.

    The list holds one entry per token of the explicit SMILES (or DeepSMILES)
    string generated from the molecule: `rdchem.Atom` objects for bracketed
    atoms and plain strings for bond symbols, ring-closure numbers and
    parentheses. The contents are rebuilt whenever `mol`, `is_deep` or
    `properties` is assigned.

    Args:
        value: SMILES string or `rdchem.Mol`; passed through
            `mol_str_filter`.
        is_deep: when true, tokens come from the DeepSMILES encoding
            produced by `deep_filter`.
        **properties: keyword arguments forwarded to `Chem.MolToSmiles`
            when the object is converted to a string (they are not used
            when the token list is built).
    """

    @property
    def is_deep(self):
        """Whether the list is built from DeepSMILES instead of SMILES."""
        return self.__is_deep

    @is_deep.setter
    def is_deep(self, value):
        self.__is_deep = value
        self.__build()

    @property
    def mol(self):
        """The molecule object the list is built from."""
        return self.__mol

    @mol.setter
    def mol(self, value):
        self.__mol = mol_str_filter(value)
        self.__build()

    @property
    def properties(self):
        """Keyword arguments passed to `Chem.MolToSmiles` in `__str__`."""
        return self.__properties

    @properties.setter
    def properties(self, value):
        self.__properties = value
        self.__build()

    @property
    def str(self):
        """The (Deep)SMILES string, same as `str(self)`."""
        return self.__str__()

    def __init__(self, value, is_deep=False, **properties):
        super().__init__()
        self.__mol = mol_str_filter(value)
        self.__is_deep = is_deep
        self.__properties = properties
        self.__build()

    def __repr__(self):
        """Return the list repr with atoms shown as `<Atom SYMBOL>`."""
        return str([f'<Atom {component.GetSymbol()}>'
                    if isinstance(component, rdchem.Atom)
                    else component for component in self])

    def __str__(self):
        """Return the SMILES (or DeepSMILES) string of the molecule."""
        return deep_filter(self.__is_deep, Chem.MolToSmiles(
            self.__mol, **self.__properties))

    def __build(self):
        """Rebuild the list contents from the current molecule and mode."""
        # Start from scratch: without this, every setter call would append a
        # second copy of the components to the existing ones.
        self.clear()

        mol = self.__mol
        explicit = deep_filter(self.__is_deep, Chem.MolToSmiles(
            mol, allBondsExplicit=True, allHsExplicit=True))

        # The atoms appear in the generated string in the order in which
        # MolToSmiles wrote them, which need not be the atom index order.
        # RDKit records that order in the `_smilesAtomOutputOrder` property;
        # fall back to index order if the property is not available.
        if mol.HasProp('_smilesAtomOutputOrder'):
            output_order = [int(index) for index in re.findall(
                r'\d+', mol.GetProp('_smilesAtomOutputOrder'))]
        else:
            output_order = list(range(mol.GetNumAtoms()))
        atoms = [mol.GetAtomWithIdx(index) for index in output_order]

        # In DeepSMILES mode a run of closing parentheses is one token.
        if self.__is_deep:
            parenthesis_pattern = r'\)+'
        else:
            parenthesis_pattern = r'[\(\)]'
        pattern = r'(?:\[.*?\])|[\-/\\:=#\d]|%\d{2}|' + parenthesis_pattern

        atom_position = 0
        for component in re.findall(pattern, explicit):
            # With allHsExplicit=True every atom is written in brackets, so a
            # leading '[' identifies an atom token.
            if component[0] == '[':
                self.append(atoms[atom_position])
                atom_position += 1
            else:
                self.append(component)

    def to_vec(self, *feature_list):

        """Convert SMIList to customizable feature vector (numpy.ndarray).

        Each component becomes one row of `len(feature_list) + 2` entries:

        * atom: the values of the feature functions, then two zeros;
        * bond symbol: the symbol in the first column, zeros elsewhere;
        * run of ')': its length in the second-to-last column;
        * ring number: the digits (without '%') in the last column.

        Args:
            *feature_list: callables invoked as `feature(atom, ring_sizes)`,
                where `ring_sizes` lists the sizes of the rings that contain
                the atom.

        Returns:
            A 2-D array with one row per component, or the single 1-D row
            when the list has exactly one component.

        Raises:
            ValueError: if the list is empty (raised by `np.vstack`).
        """

        ring_info = self.__mol.GetRingInfo().AtomRings()
        feature_count = len(feature_list)
        rows = []

        for component in self:

            if isinstance(component, rdchem.Atom):
                atom = component
                # Ring tuples hold molecule atom indices, so look the atom up
                # by its own index rather than by its position in the list.
                atom_index = atom.GetIdx()
                smallest_rings = [len(atom_tuple) for atom_tuple in ring_info
                                  if atom_index in atom_tuple]
                atom_features = np.array([feature(atom, smallest_rings)
                                          for feature in feature_list])
                all_features = np.hstack((atom_features,
                                          np.zeros(2, dtype=int)))

            elif component[0] in ('-', '/', '\\', ':', '=', '#', '.'):
                all_features = np.hstack((np.array([component]),
                                          np.zeros(feature_count+1,
                                                   dtype=int)))

            elif component[0] == ')':
                all_features = np.hstack((np.zeros(feature_count,
                                                   dtype=int),
                                          np.array([len(component)]),
                                          np.zeros(1, dtype=int)))

            elif component[0].isdigit() or component[0] == '%':

                if component[0] == '%':
                    component = component[1:]

                all_features = np.hstack((np.zeros(feature_count+1,
                                                   dtype=int),
                                          np.array([component])))

            # NOTE(review): a component matching none of the branches above
            # (e.g. '(' when `is_deep` is false) leaves `all_features`
            # unchanged, so the previous row is repeated. The intended
            # encoding is not visible here -- confirm and handle explicitly.
            rows.append(all_features)

        # Stack once at the end instead of re-copying the array per component.
        if len(rows) == 1:
            return rows[0]

        return np.vstack(rows)


def symbol(atom, *_):
    """Give the element symbol of `atom`, lower-cased for aromatic atoms.

    Any further positional arguments are accepted and ignored.
    """
    element = atom.GetSymbol()
    return element.lower() if atom.GetIsAromatic() else element


def atomic_num(atom, *_):
    """Give the atomic number of `atom` as reported by `GetAtomicNum()`.

    Any further positional arguments are accepted and ignored.
    """
    number = atom.GetAtomicNum()
    return number


def aromatic(atom, *_):
    """Give the aromaticity flag of `atom` converted to int (1 or 0).

    Any further positional arguments are accepted and ignored.
    """
    flag = atom.GetIsAromatic()
    return int(flag)


def chiral(atom, *_):
    """Give the chiral tag of `atom` converted to int.

    Any further positional arguments are accepted and ignored.
    """
    tag = atom.GetChiralTag()
    return int(tag)


def D(atom, *_):
    """Give D (explicit degree) of `atom`, i.e. the result of `GetDegree()`.

    Any further positional arguments are accepted and ignored.
    """
    degree = atom.GetDegree()
    return degree


def H(atom, *_):
    """Give H (total number of hydrogens) of `atom` via `GetTotalNumHs()`.

    Any further positional arguments are accepted and ignored.
    """
    hydrogen_count = atom.GetTotalNumHs()
    return hydrogen_count


def R(atom, smallest_rings):
    """Give R (ring membership): the number of entries in `smallest_rings`.

    `atom` is not used; it is part of the signature shared by the feature
    functions.
    """
    ring_count = len(smallest_rings)
    return ring_count


def r(atom, smallest_rings):
    """Give r (smallest ring size): the minimum of `smallest_rings`.

    Returns 0 when `smallest_rings` is empty. `atom` is not used; it is part
    of the signature shared by the feature functions.
    """
    return min(smallest_rings, default=0)


def v(atom, *_):
    """Give v (total valence) of `atom` via `GetTotalValence()`.

    Any further positional arguments are accepted and ignored.
    """
    valence = atom.GetTotalValence()
    return valence


def X(atom, *_):
    """Give X (total connections) of `atom` via `GetTotalDegree()`.

    Any further positional arguments are accepted and ignored.
    """
    connections = atom.GetTotalDegree()
    return connections


def charge(atom, *_):
    """Give the formal charge of `atom` via `GetFormalCharge()`.

    Any further positional arguments are accepted and ignored.
    """
    formal_charge = atom.GetFormalCharge()
    return formal_charge


def deep_filter(is_deep, smiles):
    """Encode `smiles` as DeepSMILES when `is_deep` is truthy.

    When `is_deep` is falsy the input string is returned unchanged. The
    encoding uses a `deepsmiles.Converter` created with both `branches` and
    `rings` enabled.
    """
    if not is_deep:
        return smiles

    return deepsmiles.Converter(branches=True, rings=True).encode(smiles)


def mol_str_filter(value):

    """Convert SMILES to rdchem.Mol or do nothing.

    Args:
        value: a SMILES string or an `rdchem.Mol` instance (subclasses of
            either are accepted).

    Returns:
        The result of `Chem.MolFromSmiles(value)` for a string, `value`
        itself for a molecule object, and None for any other type.
    """

    if isinstance(value, str):

        return Chem.MolFromSmiles(value)

    # isinstance (rather than an exact type comparison) also lets subclasses
    # of rdchem.Mol pass through instead of silently turning into None.
    if isinstance(value, rdchem.Mol):

        return value

    return None


# Quick check: build a SMIList in DeepSMILES mode from a SMILES string with
# stereo annotations. The bare `smi` expression on the last line makes the
# notebook display the object's repr.
smi = SMIList('O[C@H]1[C@H](C(C)C)CC[C@@H](C)C1', is_deep=True)
smi