
Inspired by the notebook [BELKA 1DCNN Starter with all data](https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data)

and the paper [Convolutional neural network based on SMILES representation of compounds for detecting chemical motif](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2523-5)

# Encoding

In [None]:
import pandas as pd
import numpy as np

import joblib
from tqdm import tqdm

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Layer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score as APS


In [None]:
import tensorflow as tf

# Pick a tf.distribute strategy for the rest of the notebook: try to connect
# to a TPU first and fall back to MirroredStrategy when the TPU resolver
# raises ValueError. `strategy` is later used by make_model() via
# strategy.scope().
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError: # detect GPUs
    strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    #strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

# Report how many replicas the chosen strategy keeps in sync.
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Only part of the rows is imported, because loading the whole dataset uses up all the RAM.

In [None]:
# Read only the first 2,400,000 rows of the training CSV (nrows caps the read).
train_raw = pd.read_csv('/kaggle/input/leash-BELKA/train.csv', nrows = 2400000)

We extract the `molecule_smiles` values for a single `protein_name`.

In [None]:
# Keep the SMILES strings of the BRD4 rows only, as a NumPy array.
is_brd4 = train_raw['protein_name'] == 'BRD4'
smiles = train_raw.loc[is_brd4, 'molecule_smiles'].values


In [None]:
# Character -> integer code table for SMILES strings. Codes are assigned by
# position in the alphabet below, starting at 1; encode_smile() pads with 0,
# so 0 never appears as a character code.
_SMILES_ALPHABET = (
    "ly@3HSFCrs"
    "/co+I5(2)9"
    "i#684=1O[D"
    "B]N7n-"
)
enc = {symbol: code for code, symbol in enumerate(_SMILES_ALPHABET, start=1)}

In [None]:
def encode_smile(smile, max_len=142, vocab=None):
    """Encode a SMILES string as a fixed-length list of integer codes.

    Each character is looked up in ``vocab``. The result is right-padded with
    zeros, or truncated, so that it always has exactly ``max_len`` entries.

    Args:
        smile: SMILES string to encode.
        max_len: Length of the returned list.
        vocab: Mapping from character to integer code. Defaults to the
            module-level ``enc`` table.

    Returns:
        A list of ``max_len`` integers.

    Raises:
        KeyError: If one of the first ``max_len`` characters of ``smile`` is
            missing from ``vocab``.
    """
    table = enc if vocab is None else vocab

    # Truncate before the lookup: without it an over-long string would produce
    # a list longer than max_len (a negative pad count adds nothing), and the
    # rows could no longer be stacked into a rectangular array.
    encoded = [table[char] for char in smile[:max_len]]

    # Pad with zeros if the encoded string is shorter than max_len.
    encoded.extend([0] * (max_len - len(encoded)))
    return encoded

In [None]:
# Encode every training SMILES string with encode_smile() through joblib,
# then stack the rows into a single array.
encode_task = joblib.delayed(encode_smile)
parallel_runner = joblib.Parallel(n_jobs=-2)
encoded_smiles = np.array(parallel_runner(encode_task(smile) for smile in tqdm(smiles)))

# One column per sequence position: enc0 ... enc141.
feature_names = [f'enc{position}' for position in range(142)]
data_train = pd.DataFrame(encoded_smiles, columns=feature_names)

In [None]:
# Attach one label column per protein. The `binds` values of each protein's
# rows are copied positionally, so this relies on every protein having the
# same number of rows as data_train, in matching order.
for label_column, protein in (('bind1', 'BRD4'), ('bind2', 'HSA'), ('bind3', 'sEH')):
    protein_rows = train_raw['protein_name'] == protein
    data_train[label_column] = train_raw.loc[protein_rows, 'binds'].values

In [None]:

# Load the test set and encode its SMILES strings the same way as the
# training data.
test_raw = pd.read_csv('/kaggle/input/leash-BELKA/test.csv')
smiles = test_raw['molecule_smiles'].values

encode_jobs = (joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
encoded_smiles = np.array(joblib.Parallel(n_jobs=-2)(encode_jobs))

# Same enc0 ... enc141 column layout as the training frame.
test_feature_names = [f'enc{position}' for position in range(142)]
test = pd.DataFrame(encoded_smiles, columns=test_feature_names)

# Model

In [None]:
class RoundLayer(Layer):
    """Keras layer whose forward pass applies ``tf.round`` to its input.

    NOTE(review): not referenced anywhere else in the visible part of this
    notebook; confirm whether it is still needed.
    """

    def call(self, inputs):
        # Forward pass: return the rounded tensor.
        return tf.round(inputs)


def make_model(seq_len=142, vocab_size=37):
    """Build and compile the 1D-CNN binding classifier inside ``strategy.scope()``.

    Architecture: token embedding -> three ``Conv1D`` layers (32, 64, 96
    filters, kernel size 3) -> global max pooling -> three dense layers
    (1024, 1024, 512 units, each followed by 10% dropout) -> 3 sigmoid units.
    The three outputs are independent probabilities (multi-label), trained
    with binary cross-entropy and monitored with PR-AUC.

    Args:
        seq_len: Number of integer tokens per encoded SMILES string.
        vocab_size: Number of rows in the embedding table. It must equal the
            largest token id plus one: the ``enc`` table uses ids 1..36 and
            0 is the padding id, hence the default of 37.

    Returns:
        A compiled ``tf.keras`` model mapping ``(batch, seq_len)`` int32
        tokens to ``(batch, 3)`` probabilities.
    """
    with strategy.scope():
        inputs = tf.keras.Input(shape=(seq_len,), dtype='int32')

        # The sequence length comes from the Input layer, so the deprecated
        # `input_length` argument is not passed.
        # NOTE(review): mask_zero=True is kept from the original; confirm the
        # downstream Conv1D layers actually honour the mask.
        x = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=128, mask_zero=True)(inputs)

        # Convolutional feature extractor.
        for filters, padding in ((32, 'valid'), (64, 'valid'), (96, 'same')):
            x = Conv1D(filters=filters, kernel_size=3, strides=1,
                       padding=padding, activation='relu')(x)

        # Collapse the sequence axis: keep the strongest response per filter.
        x = GlobalMaxPooling1D()(x)

        # Fully connected head with light dropout.
        for units in (1024, 1024, 512):
            x = tf.keras.layers.Dense(units, activation='relu')(x)
            x = tf.keras.layers.Dropout(0.1)(x)

        # One sigmoid unit per target column (three binary labels).
        outputs = tf.keras.layers.Dense(3, activation='sigmoid')(x)

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, weight_decay=0.01),
            loss='binary_crossentropy',
            metrics=[tf.keras.metrics.AUC(curve='PR', name='avg_precision')],
        )

        return model


# Build one model instance up front. The cross-validation loop further down
# calls make_model() again for every fold and rebinds `model`.
model = make_model()

In [None]:
# Feature columns (enc0 ... enc141) and the three label columns.
X_cols = ['enc' + str(position) for position in range(142)]
y_cols = [f'bind{target}' for target in range(1, 4)]

# Stop training once val_loss has not improved for 5 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode='min',
    patience=5,
    verbose=1,
)

# 7 shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# Collects one test-set prediction array per fold.
all_preds = []

For training, the data is divided into 7 folds and a model is trained for each fold. Each model then makes a prediction on the test set, and at the end the 7 predictions are averaged.

In [None]:
# Cross-validation: for every fold, train a fresh model, score it on the
# held-out part, and store its predictions for the test set in all_preds.
# Folds are stratified on the per-row sum of the three label columns.
for fold,(train_id, test_id) in enumerate(skf.split(data_train, data_train[y_cols].sum(1))):
    
    # NOTE(review): train_id/test_id come from skf.split, while .loc looks
    # rows up by index label; presumably fine because data_train has the
    # default integer index - confirm.
    X_train = data_train.loc[train_id, X_cols]
    y_train = data_train.loc[train_id, y_cols]
    X_val = data_train.loc[test_id, X_cols]
    y_val = data_train.loc[test_id, y_cols]
    
    # Save weights only, and only when val_loss improves, to a per-fold file.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        monitor='val_loss', filepath=f"/kaggle/working/model_{fold}.weights.h5",
        save_best_only=True, save_weights_only=True,
        mode='min')

    # Scale the learning rate by factor=0.05 when val_loss plateaus.
    reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.05, patience=5, verbose=1)
    # New, untrained model for this fold.
    model = make_model()
    history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=35,
            callbacks=[checkpoint, reduce_lr_loss, early_stopping],
            batch_size=4096,
            verbose=1,
        )
    
    # Load the best weight and use them for prediction
    model.load_weights(f"/kaggle/working/model_{fold}.weights.h5")
    # Out-of-fold predictions, scored with micro-averaged average precision.
    oof = model.predict(X_val, batch_size = 8192)
    print('fold :', fold, 'CV score =', APS(y_val, oof, average = 'micro'))
    
    # Predict the encoded test set with this fold's best weights.
    preds = model.predict(test, batch_size = 8192)
    all_preds.append(preds)

In [None]:
# Average the per-fold test predictions and keep them as probabilities.
# The notebook scores with average precision, which ranks predictions by
# score; rounding to 0/1 would discard that ranking.
prediction = np.mean(all_preds, axis=0)

In [None]:
# Build the submission file. `prediction` is indexed with one row per test
# row and one column per protein, in BRD4, HSA, sEH order; each test row
# takes the column that matches its own protein.
test = pd.read_csv('/kaggle/input/leash-BELKA/test.csv')

# Create the column as float from the start so that assigning float
# predictions does not require a dtype upcast of an integer column.
test['binds'] = 0.0
for column, protein in enumerate(('BRD4', 'HSA', 'sEH')):
    protein_rows = (test['protein_name'] == protein).values
    test.loc[protein_rows, 'binds'] = prediction[protein_rows, column]

test[['id', 'binds']].to_csv('submission.csv', index=False)