In this notebook we will train a deep learning model using all the data available !
* preprocessing : I encoded the smiles of all the train & test set and saved it [here](https://www.kaggle.com/datasets/ahmedelfazouan/belka-enc-dataset) , this may take up to 1 hour on TPU.
* Training & Inference : I used a simple 1dcnn model trained on 20 epochs.

How to improve :
* Try a different architecture : I'm able to get an LB score of 0.604 with minor changes on this architecture.
* Try another model like Transformer, or LSTM.
* Train for more epochs.
* Add more features like a one hot encoding of bb2 or bb3.
* And of course ensembling with GBDT models.

In [7]:
!pip install fastparquet -q

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import gc
import os
import pickle
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score as APS

In [3]:
class CFG:

    PREPROCESS = False
    EPOCHS = 20
    BATCH_SIZE = 4096
    LR = 1e-3
    WD = 0.05

    NBR_FOLDS = 15
    SELECTED_FOLDS = [0]

    SEED = 2024

In [4]:
import tensorflow as tf
def set_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

set_seeds(seed=CFG.SEED)

D0805 14:07:57.898085871      15 config.cc:119]                        gRPC EXPERIMENT tcp_frame_size_tuning               OFF (default:OFF)
D0805 14:07:57.898109521      15 config.cc:119]                        gRPC EXPERIMENT tcp_rcv_lowat                       OFF (default:OFF)
D0805 14:07:57.898113045      15 config.cc:119]                        gRPC EXPERIMENT peer_state_based_framing            OFF (default:OFF)
D0805 14:07:57.898115792      15 config.cc:119]                        gRPC EXPERIMENT flow_control_fixes                  ON  (default:ON)
D0805 14:07:57.898118184      15 config.cc:119]                        gRPC EXPERIMENT memory_pressure_controller          OFF (default:OFF)
D0805 14:07:57.898120764      15 config.cc:119]                        gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)
D0805 14:07:57.898123180      15 config.cc:119]                        gRPC EXPERIMENT new_hpack_huffman_decoder           ON  (default:ON)
D0805 14:07:57.

In [5]:
import tensorflow as tf

# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu="local") # "local" for 1VM TPU
    strategy = tf.distribute.TPUStrategy(tpu)
    print("Running on TPU")
    print("REPLICAS: ", strategy.num_replicas_in_sync)
except tf.errors.NotFoundError:
    print("Not on TPU")

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU

# Preprocessing

In [25]:
from tqdm import tqdm
import pandas as pd
import numpy as np

# train_raw = pd.read_parquet('/kaggle/input/leash-BELKA/train.parquet')
# smiles = train_raw[train_raw['protein_name']=='BRD4']['molecule_smiles'].values
# assert (smiles!=train_raw[train_raw['protein_name']=='HSA']['molecule_smiles'].values).sum() == 0
# assert (smiles!=train_raw[train_raw['protein_name']=='sEH']['molecule_smiles'].values).sum() == 0
import pandas as pd

df = pd.read_csv('/kaggle/input/dataset/train.csv')
df2 = pd.read_csv('/kaggle/input/dataset/test.csv')
def encode_smile(smile):
    tmp = [enc[i] for i in smile]
    tmp = tmp + [0]*(145-len(tmp))
    return np.array(tmp).astype(np.uint8)


df = df[['Smiles', 'pIC50']]
smiles = df['Smiles'].values
smiles2 = df2['Smiles'].values
enc = {}
for i in df['Smiles']:
    for j in list(i):
        if enc.get(j, None) == None:
            enc[j] = len(enc) + 1
            
smiles_enc = [encode_smile(smile) for smile in tqdm(smiles)]
smiles_enc = np.stack(smiles_enc)
smiles_enc2 = [encode_smile(smile) for smile in tqdm(smiles2)]
smiles_enc2 = np.stack(smiles_enc2)
train = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(145)])
test = pd.DataFrame(smiles_enc2, columns = [f'enc{i}' for i in range(145)])
train['pIC50'] = df['pIC50']

100%|██████████| 1952/1952 [00:00<00:00, 49255.10it/s]
100%|██████████| 113/113 [00:00<00:00, 47744.17it/s]


# Modeling

In [114]:
def my_model():
    def log_cosh_loss(y_true, y_pred):
        return tf.reduce_mean(tf.math.log(tf.math.cosh(y_pred - y_true)))
    
    with strategy.scope():
        INP_LEN = 145
        NUM_FILTERS = 32
        hidden_dim = 128

        inputs = tf.keras.layers.Input(shape=(INP_LEN,), dtype='int32')
        x = tf.keras.layers.Embedding(input_dim=len(enc), output_dim=hidden_dim, input_length=INP_LEN, mask_zero=True)(inputs)
        x = tf.keras.layers.Conv1D(filters=NUM_FILTERS, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
        x = tf.keras.layers.Conv1D(filters=NUM_FILTERS*2, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
        x = tf.keras.layers.Conv1D(filters=NUM_FILTERS*3, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
        x = tf.keras.layers.GlobalMaxPooling1D()(x)

        x = tf.keras.layers.Dense(1024, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.1)(x)
        x = tf.keras.layers.Dense(1024, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.1)(x)
        x = tf.keras.layers.Dense(512, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.1)(x)

        outputs = tf.keras.layers.Dense(1)(x)

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        optimizer = tf.keras.optimizers.Adam(learning_rate=CFG.LR, weight_decay=CFG.WD)
        loss = 'mean_absolute_error'  # 회귀 문제에 적합한 손실 함수
        # mean_squared_logarithmic_error
        # mean_squared_error
        # log_cosh_loss
        # huber_loss
        # mean_absolute_error
        model.compile(
            loss=loss,
            optimizer=optimizer,
            metrics=['mean_absolute_error']  # 회귀 문제에 적합한 평가 지표
        )
        return model

# Train & Inference

In [107]:
import numpy as np

def pIC50_to_IC50(pic50_values):
    return 10 ** (9 - pic50_values)

def calculate_score(y_true, y_pred):
    y_true = pIC50_to_IC50(y_true)
    y_pred = pIC50_to_IC50(y_pred)
    
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    norm_rmse = rmse / np.ptp(y_true)  # np.ptp(y_true)는 y_true의 최대값 - 최소값
    
    def ic50_to_pic50(ic50):
        return -np.log10(ic50 * 1e-9)  # IC50(nM) -> IC50(M) 변환 후 log10

    # Correct Ratio (B) 계산
    y_true_pic50 = ic50_to_pic50(y_true)
    y_pred_pic50 = ic50_to_pic50(y_pred)
    correct = np.abs(y_true_pic50 - y_pred_pic50) <= 0.5
    ratio = np.mean(correct)
    
    # 최종 Score 계산
    score = 0.5 * (1 - min(norm_rmse, 1)) + 0.5 * ratio
    return score

In [115]:
from sklearn.model_selection import KFold

# KFold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)


all_preds = []
avg_score = 0
for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
    X_train = train.loc[train_idx, FEATURES]
    y_train = train.loc[train_idx, TARGETS]
    X_val = train.loc[valid_idx, FEATURES]
    y_val = train.loc[valid_idx, TARGETS]

    es = tf.keras.callbacks.EarlyStopping(patience=5, monitor="val_loss", mode='min', verbose=1)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(monitor='val_loss', filepath=f"model-{fold}.h5",
                                                        save_best_only=True, save_weights_only=True,
                                                    mode='min')
    reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.05, patience=3, verbose=1)
    model = my_model()
    history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=20,
            callbacks=[checkpoint, reduce_lr_loss, es],
            batch_size=CFG.BATCH_SIZE,
            verbose=1,
        )
    model.load_weights(f"model-{fold}.h5")
    oof = model.predict(X_val, batch_size = 2*CFG.BATCH_SIZE)
    cv_score = calculate_score(y_val, oof)
    avg_score += cv_score
    print('fold :', fold, 'CV score =', cv_score)
    
    preds = model.predict(test, batch_size = 2*CFG.BATCH_SIZE)
    all_preds.append(preds)

preds = np.mean(all_preds, 0)

Epoch 1/20


2024-08-05 16:22:04.287378: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:22:04.483697: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.




2024-08-05 16:22:08.441740: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:22:08.563228: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 5.0000002374872565e-05.
Epoch 10/20
Epoch 11/20
Epoch 11: early stopping


2024-08-05 16:22:16.892767: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:22:16.966758: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


fold : 0 CV score = 0.6299192224897021
Epoch 1/20


2024-08-05 16:22:22.288014: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:22:22.424521: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.




2024-08-05 16:22:26.203811: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:22:26.309392: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 5.0000002374872565e-05.
Epoch 10/20
Epoch 11/20
Epoch 11: early stopping


2024-08-05 16:22:34.481718: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:22:34.552727: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


fold : 1 CV score = 0.5584891176702351
Epoch 1/20


2024-08-05 16:22:39.844781: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:22:39.979973: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.




2024-08-05 16:22:43.749816: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:22:43.848782: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 5.0000002374872565e-05.
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 19: ReduceLROnPlateau reducing learning rate to 2.5000001187436284e-06.
Epoch 20/20


2024-08-05 16:23:00.201765: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:23:00.306203: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


fold : 2 CV score = 0.6218833566197322
Epoch 1/20


2024-08-05 16:23:05.841535: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:23:05.999047: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.




2024-08-05 16:23:09.866162: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:23:09.973942: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 5.0000002374872565e-05.
Epoch 10/20
Epoch 11/20
Epoch 11: early stopping


2024-08-05 16:23:18.244611: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:23:18.323316: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


fold : 3 CV score = 0.5652009599195482
Epoch 1/20


2024-08-05 16:23:23.585648: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:23:23.718664: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.




2024-08-05 16:23:27.445886: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:23:27.544765: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 12: ReduceLROnPlateau reducing learning rate to 5.0000002374872565e-05.
Epoch 13/20
Epoch 14/20
Epoch 14: early stopping


2024-08-05 16:23:37.443530: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2024-08-05 16:23:37.513242: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


fold : 4 CV score = 0.6887225569780718


In [116]:
print(avg_score / 5)
# mse 0.5927130819266196 0.6423098446674433_fold5
# log_cosh_loss 0.5821399477622217
# mean_squared_logarithmic_error 0.5633181990480502
# huber_loss 0.576905494263006 0.6219190091656438_fold5
# mean_absolute_error 0.5704392483575251 0.6440441485596432_fold5

0.6128430427354579


In [117]:
sub = pd.read_csv('/kaggle/input/dataset/sample_submission.csv')
sub['IC50_nM'] = pIC50_to_IC50(preds)
sub.to_csv('sub.csv', index=False)
sub

Unnamed: 0,ID,IC50_nM
0,TEST_000,247.044418
1,TEST_001,258.041046
2,TEST_002,140.176926
3,TEST_003,210.592743
4,TEST_004,234.703018
...,...,...
108,TEST_108,251.611298
109,TEST_109,187.687195
110,TEST_110,43.605057
111,TEST_111,217.942398


# Submission