# SMILearn vs TOX21
Evaluation of toxicity prediction models based on SMILearn vectors

## Prerequisites

### Installations (Google Colab)
These are example installation steps that allow this notebook to work properly in the Google Colab environment.  
In other environments, it is recommended to follow the instructions in the `README.md` file and skip the cell below.

In [None]:
# Save the output as `installation_log` variable
%%capture installation_log

# Install Miniconda
!wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!bash Miniconda3-latest-Linux-x86_64.sh -bfp /usr/local
!rm Miniconda3-latest-Linux-x86_64.sh

# Install RDKit
!conda install -qyc conda-forge rdkit

# Install DeepSMILES unofficial GitHub fork
!pip install git+git://github.com/mateuszrezler/deepsmiles@master

# Update system path
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages')

### Imports

In [None]:
import deepsmiles
import numpy as np
import pandas as pd
import re
from rdkit import Chem
from rdkit.Chem import rdchem

## Classes and functions

In [None]:
class SMIList(list):

    """SMILES as list of rdchem.Atom objects and other chars.

    The list is filled from the RDKit output SMILES of the molecule, written
    with all bonds and all hydrogens explicit. Every bracket atom becomes the
    matching `rdchem.Atom` object; bond symbols, ring-closure digits and
    parentheses stay as plain strings.

    Parameters
    ----------
    value : str or rdchem.Mol
        SMILES string or an already built molecule.
    is_deep : bool
        If true, the SMILES is converted to DeepSMILES before tokenization
        and in `str()` output.
    **properties
        Keyword arguments forwarded to `Chem.MolToSmiles` by `__str__`.

    Raises
    ------
    ValueError
        If `value` cannot be turned into a molecule.
    """

    def __init__(self, value, is_deep=False, **properties):

        super().__init__()
        mol = mol_str_filter(value)

        # Fail early with a clear message instead of an AttributeError on
        # `None` later in `__build`.
        if mol is None:
            raise ValueError(
                f'{value!r} is not a valid SMILES string or rdchem.Mol')

        self.__mol = mol
        self.__is_deep = is_deep
        self.__properties = properties
        self.__build()

    def __repr__(self):

        return str([f'<Atom {component.GetSymbol()}>'
                    if isinstance(component, rdchem.Atom)
                    else component for component in self])

    def __str__(self):

        return deep_filter(self.__is_deep, Chem.MolToSmiles(
            self.__mol, **self.__properties))

    def __build(self):

        """Tokenize the explicit (Deep)SMILES and append the components."""

        mol = self.__mol
        explicit_smiles = Chem.MolToSmiles(
            mol, allBondsExplicit=True, allHsExplicit=True)

        # Atoms are written in the SMILES output order, which in general
        # differs from the atom index order of the molecule. RDKit stores
        # the order used by the last `MolToSmiles` call in the private
        # computed property `_smilesAtomOutputOrder`.
        mol_props = mol.GetPropsAsDict(
            includePrivate=True, includeComputed=True)
        atoms = [mol.GetAtomWithIdx(int(atom_index))
                 for atom_index in mol_props['_smilesAtomOutputOrder']]

        explicit = deep_filter(self.__is_deep, explicit_smiles)

        # In deep mode only runs of closing parentheses are matched (as one
        # token); otherwise each single parenthesis is a token.
        if self.__is_deep:
            parenthesis_pattern = r'\)+'
        else:
            parenthesis_pattern = r'[\(\)]'

        # Tokens: bracket atoms, bond symbols and single digits, two-digit
        # `%NN` numbers, parentheses.
        # NOTE(review): any character not matched by this pattern (for
        # example '.') is silently skipped -- confirm this is intended.
        pattern = r'(?:\[.*?\])|[\-/\\:=#\d]|%\d{2}|' + parenthesis_pattern
        atom_position = 0

        for component in re.findall(pattern, explicit):

            if component[0] == '[':
                # All atoms are bracketed because of `allHsExplicit=True`,
                # so the n-th bracket token is the n-th atom written.
                self.append(atoms[atom_position])
                atom_position += 1

            else:
                self.append(component)


def deep_filter(is_deep, smiles):

    """Return `smiles` encoded as DeepSMILES when `is_deep` is truthy.

    When `is_deep` is falsy the input is handed back untouched.
    """

    if not is_deep:

        return smiles

    # Encoder configured with both branch and ring handling enabled
    encoder = deepsmiles.Converter(branches=True, rings=True)
    deep_smiles = encoder.encode(smiles)

    return deep_smiles


def mol_str_filter(value):

    """Convert SMILES to rdchem.Mol or pass an rdchem.Mol through.

    Parameters
    ----------
    value : str or rdchem.Mol
        SMILES string to parse, or a molecule that is returned unchanged.
        Subclasses of both types are accepted as well.

    Returns
    -------
    rdchem.Mol or None
        The result of `Chem.MolFromSmiles` for strings (RDKit is documented
        to return `None` for SMILES it cannot parse), the molecule itself
        for `rdchem.Mol` input, and `None` for any other input type.
    """

    if isinstance(value, str):

        return Chem.MolFromSmiles(value)

    if isinstance(value, rdchem.Mol):

        return value

    # Unsupported input type: keep the "return None" behavior explicit
    return None


# Smoke test: build a DeepSMILES-mode SMIList from a stereo-annotated SMILES
# string; in a notebook the bare `smi` expression ending the cell shows its repr.
smi = SMIList('O[C@H]1[C@H](C(C)C)CC[C@@H](C)C1', is_deep=True)
smi