# Data Prep

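Load the SMILES molecule files, drop entries longer than `MAX_LENGTH` characters, then tokenize, embed, and pad the remaining molecules and save the training and test sets as `.npy` arrays.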

In [None]:
# Install the third-party packages into the kernel's environment, one %pip call per package.
# NOTE(review): no versions are pinned, so the resolved versions depend on when this cell
# is run — consider pinning them for reproducibility.
# NOTE(review): only numpy, pandas and tensorflow are imported directly in the cells visible
# here; the remaining packages are presumably needed by src.features or later steps — TODO confirm.
%pip install numpy
%pip install pandas
%pip install tensorflow
%pip install tensorboard
%pip install tensorrt
%pip install matplotlib
%pip install rdkit
%pip install openbabel

In [None]:
import os

import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.sequence import pad_sequences

from src.features.smiles import SmilesTokenizer, cleanup_list_smiles, encode_list_smiles

In [None]:
# Upper bound on the SMILES string length (in characters) kept for training/testing.
MAX_LENGTH = 200


def _read_smiles(path):
    """Read a headerless file into a single-column DataFrame named "smiles"."""
    return pd.read_csv(path, names=["smiles"])


def _drop_long_smiles(frame):
    """Return the rows of `frame` whose "smiles" string has at most MAX_LENGTH characters."""
    fits = frame['smiles'].str.len() <= MAX_LENGTH
    return frame.loc[fits]


# Raw inputs, one DataFrame per source file.
dataset = _read_smiles("data/external/dataset.smi")
hiv_inhibitors = _read_smiles("data/external/hiv_inhibitors.smi")
# NOTE(review): hydroxychloroquine is loaded but not used in the cells visible here.
hydroxychloroquine = _read_smiles("data/external/hydroxychloroquine.smi")
known_TRPM8_inhibitors = _read_smiles("data/external/known_TRPM8-inhibitors.smi")
manual_testing = _read_smiles("data/external/manual_testing.smi")

# Training set: the general dataset plus both inhibitor lists, length-filtered.
df_train = _drop_long_smiles(
    pd.concat([dataset, hiv_inhibitors, known_TRPM8_inhibitors])
)
display(df_train)

# Test set: the manually curated molecules, filtered the same way.
df_test = _drop_long_smiles(manual_testing)
display(df_test)

In [None]:
# NOTE(review): this whole cell is commented out and does nothing when run.
# The disabled code would expand every encoded sequence t into all of its prefixes of
# length >= 2 (t[0:2], t[0:3], ..., t[0:len(t)]) after cleaning and encoding df_train.
# def window_data(data):
#     X = []
#     for t in data:
#         #print(t)
#         for i in range(len(t) - 1):
#             m = t[0:i+2]
#             #print(m)
#             X.append(m)
#     return X

# smiles = cleanup_list_smiles(list(df_train['smiles']))
# encoded_smiles = encode_list_smiles(smiles)

# encoded_smiles = window_data(encoded_smiles)

In [None]:
def encode_and_pad(smiles, tokenizer, max_length):
    """Tokenize, embed and front-pad a collection of SMILES strings.

    Args:
        smiles: Iterable of SMILES strings.
        tokenizer: Object exposing ``tokenize``, ``embeddings`` and ``zero``
            (here a ``SmilesTokenizer``).
        max_length: Number of positions every sequence is padded to.

    Returns:
        The float32 array produced by ``pad_sequences``; its first two
        dimensions are (number of molecules, ``max_length``).
    """
    encoded = [tokenizer.embeddings(tokenizer.tokenize(s)) for s in smiles]
    # Padding is inserted in front of each sequence ('pre') and uses the
    # tokenizer's own zero() value as filler.
    return pad_sequences(
        encoded, maxlen=max_length, dtype='float32', padding='pre', value=tokenizer.zero()
    )


st = SmilesTokenizer()

# np.save does not create missing parent directories, so make sure the
# output folder exists before writing (e.g. on a fresh checkout).
os.makedirs('data/interim', exist_ok=True)

# `dataset` is rebound to the padded array (it held the raw DataFrame in the
# loading cell); the name is kept so any later cell relying on it still works.
dataset = encode_and_pad(df_train['smiles'], st, MAX_LENGTH)
print(dataset.shape)

np.save('data/interim/training_dataset.npy', dataset)

# Same pipeline for the held-out test molecules.
dataset = encode_and_pad(df_test['smiles'], st, MAX_LENGTH)
print(dataset.shape)

np.save('data/interim/test_dataset.npy', dataset)