# Data Prep

Read several SMILES files and build NumPy datasets for training and for manual testing.

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

from rdkit import Chem # conda install -c rdkit rdkit

from src.features.smiles import SmilesTokenizer, cleanup_list_smiles, encode_list_smiles

In [None]:
# Tokenizer instance; the padding cells below call st.zero() for the pad value.
st = SmilesTokenizer()

# Each raw file is read as a header-less CSV exposing one "smiles" column.
df0, df1, df2, df3, df4 = [
    pd.read_csv(raw_path, names=["smiles"])
    for raw_path in (
        'data/raw/hydroxychloroquine.smi',
        'data/raw/dataset.smi',
        'data/raw/hiv_inhibitors.smi',
        'data/raw/known_TRPM8-inhibitors.smi',
        'data/raw/manual_testing.smi',
    )
]

# The first four frames make up the training pool.
df_train = pd.concat([df0, df1, df2, df3])
print('Training set (original):', len(df_train))

# Keep only rows whose SMILES string is at most 200 characters long.
df_train = df_train.loc[df_train['smiles'].str.len().le(200)]
print('Training set (trimmed at 200 chars):', len(df_train))

# The manual-testing file is used as the test set without length trimming.
df_test = df4
print('Test set:', len(df_test))

## Clean, encode, pad, export training set

In [None]:
def window_data(data):
    """Expand every sequence into its growing prefixes.

    For each sequence in ``data`` the slices ``seq[:2]``, ``seq[:3]``, ...
    up to the full sequence are emitted, in that order. A sequence with
    fewer than two elements contributes nothing.

    Args:
        data: Iterable of sliceable sequences that support ``len()``.

    Returns:
        A flat list of prefixes, grouped by source sequence in input order.
    """
    return [
        seq[:stop]
        for seq in data
        for stop in range(2, len(seq) + 1)
    ]

# Clean the raw training strings, encode them, then expand each encoded
# sequence into its prefixes via window_data.
smiles = cleanup_list_smiles(df_train['smiles'].tolist())
encoded_smiles = window_data(encode_list_smiles(smiles))

# Left-pad every window with the tokenizer's zero value; with maxlen=None the
# width is taken from the longest window.
dataset_train = pad_sequences(
    encoded_smiles,
    maxlen=None,
    dtype='float32',
    padding='pre',
    value=st.zero(),
)
print(dataset_train.shape)

# Persist the padded array for later use.
np.save('data/interim/smiles_train.npy', dataset_train)

## Clean, encode, pad, export manual testing set

In [None]:
# Same pipeline as the training set: clean, encode, expand into prefixes.
smiles = cleanup_list_smiles(df_test['smiles'].tolist())
encoded_smiles = window_data(encode_list_smiles(smiles))

# Left-pad with the tokenizer's zero value.
# NOTE(review): maxlen=None sizes this array by the longest window in the test
# set, so its width may differ from dataset_train's — confirm that downstream
# code tolerates that.
dataset_test = pad_sequences(
    encoded_smiles,
    maxlen=None,
    dtype='float32',
    padding='pre',
    value=st.zero(),
)
print(dataset_test.shape)

# Persist the padded array for later use.
np.save('data/interim/smiles_test.npy', dataset_test)