
Inspired by the notebook [BELKA 1DCNN Starter with all data](https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data)

and the paper [Convolutional neural network based on SMILES representation of compounds for detecting chemical motif](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2523-5).

# Encoding

In [221]:
import pandas as pd
import numpy as np

import joblib
from tqdm import tqdm

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split



In [222]:
import tensorflow as tf

# Choose a tf.distribute strategy for the rest of the notebook.
# A TPU is tried first; if that attempt raises ValueError we fall back to
# MirroredStrategy, which covers single- and multi-GPU machines.
# Other options that were considered:
#   tf.distribute.get_strategy()                            -> default strategy (CPU / single GPU)
#   tf.distribute.experimental.MultiWorkerMirroredStrategy()  -> clusters of multi-GPU machines
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.MirroredStrategy()

print("Number of accelerators: ", strategy.num_replicas_in_sync)

Number of accelerators:  1


Only the first 120,000 rows are loaded, because reading the whole dataset exhausts the available RAM.

In [223]:
# Read only the head of the training file to keep memory usage manageable.
train_raw = pd.read_csv(
    '/kaggle/input/leash-BELKA/train.csv',
    nrows=120000,
)

In [224]:
# Preview the first rows; protein_name takes the values BRD4, HSA and sEH.
train_raw.head(n=5)

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,0,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,BRD4,0
1,1,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,HSA,0
2,2,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,sEH,0
3,3,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,BRD4,0
4,4,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,HSA,0


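We extract `molecule_smiles` for a single `protein_name` (BRD4). The assertions below check that the HSA and sEH rows list the same molecules in the same order, so one copy of the SMILES strings is enough.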

In [225]:
# Take the SMILES strings from the BRD4 rows as the reference list of molecules.
is_brd4 = train_raw['protein_name'] == 'BRD4'
smiles = train_raw.loc[is_brd4, 'molecule_smiles'].values

# The rows of the other two proteins must list the same molecules in the same
# order, otherwise the per-protein labels could not be placed side by side.
for other_protein in ('HSA', 'sEH'):
    other_smiles = train_raw.loc[train_raw['protein_name'] == other_protein, 'molecule_smiles'].values
    assert (smiles != other_smiles).sum() == 0

In [226]:
# Character -> integer code for every SMILES symbol the model knows.
# Codes run from 1 to 36; 0 is not assigned here because it is used as padding.
enc = {
    'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6,
    'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12,
    'o': 13, '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18,
    ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24,
    '4': 25, '=': 26, '1': 27, 'O': 28, '[': 29, 'D': 30,
    'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36,
}

In [227]:
def encode_smile(smile, max_len=142):
    """Turn a SMILES string into a fixed-length list of integer codes.

    Each character is looked up in the module-level ``enc`` table. The result
    is right-padded with zeros up to ``max_len``; a string longer than
    ``max_len`` is cut to its first ``max_len`` characters so that every
    encoded row has exactly the same length.

    Args:
        smile: SMILES string to encode.
        max_len: Length of the returned list.

    Returns:
        A list of ``max_len`` ints.

    Raises:
        KeyError: If ``smile`` contains a character that is missing from ``enc``.
    """
    # Truncate first so the output can never be longer than max_len
    # (ragged rows cannot be stacked into a 2-D array afterwards).
    encoded = [enc[char] for char in smile[:max_len]]

    # Pad with zeros when the encoded string is shorter than max_len.
    encoded += [0] * (max_len - len(encoded))
    return encoded

In [228]:
# Encode every SMILES string in parallel (n_jobs=-2 is passed to joblib as before),
# then stack the fixed-length rows into a 2-D integer array.
encode_jobs = (joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
encoded_smiles = np.array(joblib.Parallel(n_jobs=-2)(encode_jobs))

# One column per character position: enc0 ... enc141.
feature_names = [f'enc{i}' for i in range(142)]
data_train = pd.DataFrame(encoded_smiles, columns=feature_names)

  pid = os.fork()
100%|██████████| 40/40 [00:00<00:00, 60.75it/s]


In [229]:
# Attach one binary label column per protein. The rows of each protein are in
# the same molecule order as `smiles` (asserted when `smiles` was extracted).
for label_col, protein in (('bind1', 'BRD4'), ('bind2', 'HSA'), ('bind3', 'sEH')):
    protein_rows = train_raw['protein_name'] == protein
    data_train[label_col] = train_raw.loc[protein_rows, 'binds'].values

# Model

In [230]:
def make_model():
    """Build and compile the 1D-CNN that predicts binding to the three proteins.

    Architecture: embedding -> two (Conv1D + MaxPooling1D) stages ->
    global max pooling -> three dense layers with dropout -> 3 sigmoid outputs.

    The three outputs are independent binary labels (bind1, bind2, bind3), so
    the head uses ``sigmoid`` together with ``binary_crossentropy``. A
    ``softmax`` head would force the three probabilities to sum to 1, which
    cannot represent a molecule that binds to none (or several) of the proteins.

    Returns:
        A compiled ``tf.keras`` model taking int32 inputs of shape (batch, 142).
    """
    # Must match the length produced by encode_smile and the enc0..enc141 columns.
    seq_len = 142
    # Codes in `enc` start at 1 and 0 is the padding value, so the embedding
    # needs max_code + 1 rows; a smaller table makes the largest code
    # an out-of-range index.
    vocab_size = max(enc.values()) + 1

    with strategy.scope():
        inputs = tf.keras.Input(shape=(seq_len,), dtype='int32')
        # `input_length` is deprecated in Keras 3 and therefore not passed.
        # `mask_zero` is not set: the next layer is a Conv1D, which (to our
        # knowledge) does not consume Keras masks, so it would have no effect.
        x = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128)(inputs)

        # 1st convolutional stage
        x = Conv1D(filters=32, kernel_size=51, strides=1,
                   padding='same', activation='relu')(x)
        x = MaxPooling1D(pool_size=51, strides=1, padding='same')(x)

        # 2nd convolutional stage
        x = Conv1D(filters=64, kernel_size=51, strides=1,
                   padding='same', activation='relu')(x)
        x = MaxPooling1D(pool_size=51, strides=1, padding='same')(x)

        # Collapse the sequence axis to one feature vector per molecule.
        x = GlobalMaxPooling1D()(x)

        x = tf.keras.layers.Dense(64, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.15)(x)
        x = tf.keras.layers.Dense(64, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.15)(x)
        x = tf.keras.layers.Dense(32, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.15)(x)

        # One independent probability per protein (multi-label head).
        outputs = Dense(3, activation='sigmoid')(x)

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, weight_decay=0.01),
                      loss='binary_crossentropy',
                      metrics=[tf.keras.metrics.Precision()])

    return model


model = make_model()



In [231]:
# Feature columns are the 142 encoded character positions; targets are the
# three per-protein binding labels.
X_cols = ['enc' + str(i) for i in range(142)]
y_cols = ['bind1', 'bind2', 'bind3']

# Hold out 20% of the molecules; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_train[X_cols],
    data_train[y_cols],
    test_size=0.2,
    random_state=42,
)

In [232]:
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# The held-out split doubles as the validation set monitored by early stopping.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=2048,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

Epoch 1/1000




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - loss: 0.6942 - precision_10: 0.0000e+00



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 0.6942 - precision_10: 0.0000e+00 - val_loss: 0.6341 - val_precision_10: 0.0000e+00
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 812ms/step - loss: 0.6403 - precision_10: 0.0000e+00 - val_loss: 0.5109 - val_precision_10: 0.0000e+00
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 379ms/step - loss: 0.5308 - precision_10: 0.0000e+00 - val_loss: 0.3253 - val_precision_10: 0.0000e+00
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 619ms/step - loss: 0.3568 - precision_10: 0.0000e+00 - val_loss: 0.1351 - val_precision_10: 0.0000e+00
Epoch 5/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383ms/step - loss: 0.2365 - precision_10: 0.0000e+00 - val_loss: 0.0325 - val_precision_10: 0.0000e+00
Epoch 6/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 618ms/step - loss: 0.0941 - precision_10: 0.0000e+0

In [234]:
# Load the competition test set and encode its SMILES strings exactly like the
# training data (same encoder, same 142 columns).
test_raw = pd.read_csv('/kaggle/input/leash-BELKA/test.csv')
smiles = test_raw['molecule_smiles'].values

encode_jobs = (joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
encoded_smiles = np.array(joblib.Parallel(n_jobs=-2)(encode_jobs))

feature_names = [f'enc{i}' for i in range(142)]
test = pd.DataFrame(encoded_smiles, columns=feature_names)


  pid = os.fork()

  0%|          | 3/1674896 [00:00<37:15:07, 12.49it/s][A
  0%|          | 6/1674896 [00:00<43:48:54, 10.62it/s][A
  0%|          | 3075/1674896 [00:00<04:00, 6957.92it/s][A
  1%|          | 9219/1674896 [00:00<01:23, 19944.29it/s][A
  1%|          | 18435/1674896 [00:00<00:46, 35602.33it/s][A
  2%|▏         | 25404/1674896 [00:01<00:37, 43878.95it/s][A
  2%|▏         | 36867/1674896 [00:01<00:28, 56720.89it/s][A
  3%|▎         | 49155/1674896 [00:01<00:25, 62577.97it/s][A
  4%|▎         | 61443/1674896 [00:01<00:27, 58991.66it/s][A
  4%|▍         | 73731/1674896 [00:01<00:27, 58657.67it/s][A
  5%|▌         | 86019/1674896 [00:01<00:27, 58784.12it/s][A
  6%|▌         | 98307/1674896 [00:02<00:27, 58144.57it/s][A
  7%|▋         | 110595/1674896 [00:02<00:26, 58107.87it/s][A
  7%|▋         | 122883/1674896 [00:02<00:26, 57913.09it/s][A
  8%|▊         | 135171/1674896 [00:02<00:27, 56813.69it/s][A
  9%|▉         | 147459/1674896 [00:03<00:26, 57130.12it/s

In [235]:
# Score every test row; each row of `prediction` holds the three model outputs.
with strategy.scope():
    prediction = model.predict(
        test,
        batch_size=4096,
    )
    
    



[1m 11/818[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m25:53[0m 2s/step

 59%|█████▉    | 989357/1674896 [03:14<02:15, 5077.18it/s] 


KeyboardInterrupt: 

In [None]:
# Build the submission: every test row asks about one protein, so pick the
# model output column that belongs to that row's protein.
test = pd.read_csv('/kaggle/input/leash-BELKA/test.csv')

# `prediction` is indexed with masks derived from `test`, so both must have
# one entry per test row.
assert len(prediction) == len(test), "prediction and test.csv row counts differ"

# Start from a float column: the model outputs are probabilities, and writing
# floats into an int column relies on a silent upcast that pandas deprecates.
test['binds'] = 0.0

# Output column order follows the training labels: bind1=BRD4, bind2=HSA, bind3=sEH.
for column_index, protein in enumerate(('BRD4', 'HSA', 'sEH')):
    protein_rows = (test['protein_name'] == protein).values
    test.loc[protein_rows, 'binds'] = prediction[protein_rows, column_index]

test[['id', 'binds']].to_csv('submission.csv', index=False)