# SMILearn
Feature engineering module for machine learning models based on the SMILES representation of chemical compounds.  
This is a demo of the customizable SMILearn featurizer's capabilities, together with an example of its use in a benchmark machine learning task.

## Prerequisites

### Installations (Google Colab only)

In [None]:
%%time
try:
    import google.colab
    !git clone -q https://github.com/mateuszrezler/smilearn.git
    %cd -q smilearn
    from src import colab_setup
except ModuleNotFoundError:
    pass

In [None]:
# reference featurizer
# Load the dataset from 'datasets/tox21.csv'.
data = read_csv('datasets/tox21.csv')
# Wrap the 'smiles' column in its own DataFrame; it is the input of
# Xpipeline below.
X_raw = DataFrame(data['smiles'])
# Wrap the 'NR-AR' column the same way; it is the input of ypipeline below.
y_raw = DataFrame(data['NR-AR'])
# Atom-level feature functions, handed to SmilesVectorizer as
# `atom_functions`. Each entry is a callable; the two lambdas take
# (mol, index) and always return 0.
# NOTE(review): the list order presumably fixes the position of each feature
# in the resulting vector -- confirm against SmilesVectorizer before
# reordering anything here.
af = [
    *(has_symbol(symbol) for symbol in 'HCON'),
    has_no_symbols('HCON'),
    get_num_hs(maxval=8),
    get_degree(maxval=4),
    get_charge(maxval=8),
    get_valence(maxval=8),
    is_in_ring(),
    is_aromatic(),
    lambda mol, index: 0,  # constant-zero slot
    *(has_chiral_tag(tag)
      for tag in ('CHI_TETRAHEDRAL_CW', 'CHI_TETRAHEDRAL_CCW')),
    lambda mol, index: 0,  # constant-zero slot
    *(has_hybridization(kind)
      for kind in ('S', 'SP', 'SP2', 'SP3', 'SP3D', 'SP3D2')),
]
# Structural-character feature functions, handed to SmilesVectorizer as
# `struct_functions`: one `is_char` callable per character of the string.
sf = list(map(is_char, '()[].:=#\\/@+-234567<>'))
# Feature pipeline applied to the SMILES frame; steps are listed in the
# order they are given to Pipeline.
Xpipeline = Pipeline(
    [
        # 1. Rebuild the 'smiles' column (non-kekulized, isomeric). No
        #    `save_as` is given -- presumably the column is overwritten.
        (
            'rebuild_smiles',
            SmilesRebuilder(
                columns=['smiles'],
                kekuleSmiles=False,
                isomericSmiles=True,
            ),
        ),
        # 2. Rebuild 'smiles' again, kekulized this time, and store the
        #    result under 'isomer_smiles'.
        (
            'build_isomer_smiles',
            SmilesRebuilder(
                columns=['smiles'],
                kekuleSmiles=True,
                isomericSmiles=True,
                save_as=['isomer_smiles'],
            ),
        ),
        # 3. Insert ring tags into 'isomer_smiles'.
        (
            'insert_ring_tags',
            RingTagInserter(columns=['isomer_smiles']),
        ),
        # 4. Tokenize 'isomer_smiles' with the regex `.`, which matches one
        #    character at a time.
        (
            'tokenize_smiles',
            RegexTokenizer(columns=['isomer_smiles'], regex=r'.'),
        ),
        # 5. Vectorize using both columns and the `af` / `sf` function
        #    lists. Lower-case letters match `ignore_regex`, upper-case
        #    letters match `atom_regex`.
        #    NOTE(review): exact meaning of `max_len` and `h_vector` is
        #    defined by SmilesVectorizer -- check there.
        (
            'vectorize_smiles',
            SmilesVectorizer(
                smiles_column='smiles',
                tokens_column='isomer_smiles',
                ignore_regex=r'[a-z]',
                atom_regex=r'[A-Z]',
                atom_functions=af,
                struct_functions=sf,
                max_len=400,
                h_vector=True,
            ),
        ),
    ],
    verbose=True,
)
# Target pipeline: a NanFiller step followed by a ToArrayConverter step,
# both constructed with their default arguments.
ypipeline = Pipeline(
    [
        ('fill_nans', NanFiller()),
        ('convert_to_array', ToArrayConverter()),
    ],
    verbose=True,
)
# Fit each pipeline on its raw frame and keep the transformed output:
# X from the SMILES frame, y from the 'NR-AR' frame.
X = Xpipeline.fit_transform(X_raw)
y = ypipeline.fit_transform(y_raw)