# Leash Bio

- [DataSet](https://www.kaggle.com/datasets/ahmedelfazouan/belka-enc-dataset)
- 分子全体のsmilesをembeddingしたものを特徴量として使用
- A simple 1D-CNN model, trained for up to 30 epochs (previously 20) with early stopping.

## ref
- https://www.kaggle.com/code/yyyu54/pytorch-version-belka-1dcnn-starter-with-all-data
- https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data/notebook

- Note: unlike TensorFlow's embedding layer, PyTorch's `nn.Embedding` has no `mask_zero` option, so I set the number of embeddings to 37 (the 36 SMILES tokens plus index 0 for padding) to make it work. Please let me know if there is a better way to implement this!

In [1]:
# Experiment identifier; used to name the local output directory (output/exp{exp_no}).
exp_no = '001'
# When True, train and test are subsampled to 100000 rows each for a quick run.
DEBUG = False

In [2]:
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python -m pip install --upgrade pip


In [3]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from sklearn.metrics import average_precision_score as APS
from sklearn.model_selection import StratifiedKFold
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

In [4]:
import os
from pathlib import Path

def is_kaggle_kernel():
    """Return True when the Kaggle working directory exists on this machine."""
    kaggle_working_dir = '/kaggle/working'
    return os.path.exists(kaggle_working_dir)

# Resolve the base, data and output directories for the current environment:
# the fixed Kaggle layout on a Kaggle kernel, otherwise paths relative to the
# parent of the current working directory.
on_kaggle = is_kaggle_kernel()

BASE_DIR = Path("/kaggle") if on_kaggle else Path(os.getcwd()) / './../'
DATA_DIR = BASE_DIR / ("input" if on_kaggle else "data")
OUTPUT_DIR = BASE_DIR / ("working" if on_kaggle else f"output/exp{exp_no}")

if on_kaggle:
    print('on kaggle notebook')

# set device: start from the CPU fallback and upgrade to MPS, then CUDA,
# in that order of preference.
device = "cpu"
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"

# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print('Using', torch.cuda.device_count(), 'GPU(s)')
print('pytorch:', torch.__version__)

Using 1 GPU(s)
pytorch: 2.0.0


In [5]:
class CFG:
    """Run configuration shared by the preprocessing and training cells."""

    # Reproducibility
    SEED = 2024  # value handed to set_seeds

    # Data preparation
    PREPROCESS = False  # True: encode the raw parquet files; False: load the pre-encoded ones

    # Optimisation
    EPOCHS = 30  # upper bound on training epochs (previously 20)
    BATCH_SIZE = 4096  # batch size for the data loaders and for batched prediction
    LR = 1e-3  # learning rate passed to the model
    WD = 0.05  # weight decay passed to the model

    # Cross-validation
    NBR_FOLDS = 15  # number of stratified folds
    SELECTED_FOLDS = [0]  # fold indices selected for training (presumably a subset of range(NBR_FOLDS))

class paths:
    """Filesystem locations used by the notebook.

    The names on the right-hand side resolve to the module-level DATA_DIR and
    OUTPUT_DIR. Evaluating this class body creates the output directory
    (including missing parents) as a side effect.
    """

    # Output location; created eagerly so later writes cannot fail on a missing directory.
    OUTPUT_DIR = OUTPUT_DIR
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Input locations.
    DATA_DIR = DATA_DIR
    TRAIN_PATH = DATA_DIR / "train.parquet"
    TEST_PATH = DATA_DIR / "test.parquet"

In [6]:
# import tensorflow as tf
import torch
def set_seeds(seed):
    """Seed Python's `random`, PyTorch and NumPy with the same value.

    Also exports the seed as PYTHONHASHSEED; that variable is read at
    interpreter start-up, so it only influences processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same order as before: random, torch, numpy.
    for seeder in (random.seed, torch.manual_seed, np.random.seed):
        seeder(seed)

# Seed every RNG once, before any data sampling or model initialisation happens.
set_seeds(seed=CFG.SEED)

# Preprocessing

In [7]:
if CFG.PREPROCESS:
    # Character-level vocabulary for SMILES strings. Codes start at 1 because
    # 0 is the padding value appended by encode_smile.
    enc = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
           '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
           '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}
    max_len = 142  # fixed encoded length; shorter SMILES are right-padded with 0

    def encode_smile(smile):
        """Encode one SMILES string as a uint8 vector of length `max_len`.

        Each character is looked up in `enc` (an unknown character raises
        KeyError) and the result is right-padded with zeros.

        Raises:
            ValueError: if the string has more than `max_len` characters.
        """
        tokens = [enc[ch] for ch in smile]
        if len(tokens) > max_len:
            # Fail here with a clear message instead of later, as a shape
            # mismatch when the encoded rows are stacked.
            raise ValueError(f'SMILES has {len(tokens)} characters, more than the supported {max_len}')
        tokens.extend([0] * (max_len - len(tokens)))
        return np.array(tokens, dtype=np.uint8)

    def encode_all(smiles_values):
        """Encode every SMILES string in parallel and stack the rows into one 2-D array."""
        # n_jobs=-1 uses all available cores instead of a machine-specific worker count.
        rows = joblib.Parallel(n_jobs=-1)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles_values))
        return np.stack(rows)

    enc_columns = [f'enc{i}' for i in range(max_len)]

    train_raw = pd.read_parquet('/kaggle/input/leash-BELKA/train.parquet')

    # One boolean row mask per protein, computed once and reused for both the
    # SMILES consistency check and the label columns.
    protein_rows = {name: train_raw['protein_name'] == name for name in ('BRD4', 'HSA', 'sEH')}

    smiles = train_raw.loc[protein_rows['BRD4'], 'molecule_smiles'].values
    # The three per-protein slices must list the same molecules in the same
    # order, because their labels are placed side by side below.
    for other in ('HSA', 'sEH'):
        other_smiles = train_raw.loc[protein_rows[other], 'molecule_smiles'].values
        if len(other_smiles) != len(smiles) or (smiles != other_smiles).any():
            raise ValueError(f'molecule_smiles for {other} do not line up with those for BRD4')

    smiles_enc = encode_all(smiles)
    train = pd.DataFrame(smiles_enc, columns = enc_columns)
    for target, protein in zip(('bind1', 'bind2', 'bind3'), ('BRD4', 'HSA', 'sEH')):
        train[target] = train_raw.loc[protein_rows[protein], 'binds'].values
    train.to_parquet('train_enc.parquet')

    test_raw = pd.read_parquet('/kaggle/input/leash-BELKA/test.parquet')
    smiles = test_raw['molecule_smiles'].values

    smiles_enc = encode_all(smiles)
    test = pd.DataFrame(smiles_enc, columns = enc_columns)
    test.to_parquet('test_enc.parquet')

else:
    # Load the pre-encoded frames instead of re-encoding the raw data.
    train = pd.read_parquet(paths.DATA_DIR / 'belka-enc-dataset/train_enc.parquet')
    test = pd.read_parquet(paths.DATA_DIR / 'belka-enc-dataset/test_enc.parquet')


if DEBUG:
    # Work on a small random subset; cap at the frame length so that frames
    # with fewer rows do not make sample() raise.
    # NOTE(review): the sampled test rows no longer line up with test.parquet,
    # which the submission cell reads in full - confirm before submitting from a DEBUG run.
    debug_rows = 100000
    train = train.sample(min(debug_rows, len(train))).reset_index(drop=True)
    test = test.sample(min(debug_rows, len(test))).reset_index(drop=True)

# Modeling

In [8]:
class MyModel(pl.LightningModule):
    """1D-CNN over integer-encoded token sequences with a multi-label head.

    Tokens are embedded, passed through three unpadded Conv1d layers (kernel
    size 3), max-pooled over the sequence axis and fed to a three-layer MLP
    with dropout. The final layer emits `output_dim` raw logits; no sigmoid is
    applied in `forward`, the loss is `binary_cross_entropy_with_logits`.

    Args:
        input_dim: sequence length. Stored in `hparams` only; no layer reads it.
        input_dim_embedding: number of rows in the embedding table. Index 0 is
            the padding index.
        hidden_dim: embedding dimension, i.e. the input channels of `conv1`.
        num_filters: base filter count; the convolutions output 1x, 2x and 3x
            this number of channels.
        output_dim: number of logits produced per sample.
        lr: learning rate for the optimizer.
        weight_decay: `weight_decay` argument of `torch.optim.Adam`.
    """

    def __init__(self, input_dim=142, input_dim_embedding=37, hidden_dim=128, num_filters=32, output_dim=3, lr=1e-3, weight_decay=1e-6):
        super().__init__()
        # Records every constructor argument in self.hparams (and in checkpoints),
        # which is what load_from_checkpoint uses to rebuild the model.
        self.save_hyperparameters()

        self.embedding = nn.Embedding(num_embeddings=self.hparams.input_dim_embedding, embedding_dim=self.hparams.hidden_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(in_channels=self.hparams.hidden_dim, out_channels=self.hparams.num_filters, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv1d(in_channels=self.hparams.num_filters, out_channels=self.hparams.num_filters*2, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv1d(in_channels=self.hparams.num_filters*2, out_channels=self.hparams.num_filters*3, kernel_size=3, stride=1, padding=0)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(self.hparams.num_filters*3, 1024)
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.output = nn.Linear(512, self.hparams.output_dim)

    def forward(self, x):
        """Return raw logits of shape (batch, output_dim) for integer token ids `x` of shape (batch, seq_len)."""
        # Embedding yields (batch, seq_len, hidden_dim); Conv1d wants channels first.
        x = self.embedding(x).permute(0, 2, 1)
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        # Collapse the sequence axis to a single maximum per channel.
        x = self.global_max_pool(x).squeeze(2)
        for fc in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(F.relu(fc(x)))
        return self.output(x)

    def _bce_loss(self, batch):
        """Compute the binary cross-entropy (with logits) for one `(x, y)` batch."""
        x, y = batch
        logits = self(x)
        # Targets may arrive in a narrower float dtype than the logits (e.g.
        # float16); cast them so the loss is computed in the logits' dtype
        # rather than relying on implicit mixed-dtype handling.
        return F.binary_cross_entropy_with_logits(logits, y.to(logits.dtype))

    def training_step(self, batch, batch_idx):
        """Compute, log (`train_loss`) and return the loss for a training batch."""
        loss = self._bce_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (`val_loss`) and return the loss for a validation batch."""
        loss = self._bce_loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters, configured from `hparams`."""
        # Note: torch.optim.Adam applies weight_decay as an L2 penalty on the
        # gradient, not as AdamW-style decoupled decay.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        return optimizer

# Train & Inference

In [9]:
def predict_data_in_batches(model, df, FEATURES, batch_size, device):
    """Run `model` over the `FEATURES` columns of `df` in fixed-size batches.

    The model is moved to `device` and switched to eval mode. Rows are
    processed in their original order without gradient tracking, and the
    per-batch outputs are returned as one NumPy array concatenated along
    axis 0. The outputs are exactly what `model(inputs)` returns; no
    activation is applied here.
    """
    model.to(device)
    model.eval()

    features = torch.tensor(df[FEATURES].values, dtype=torch.int)
    loader = DataLoader(TensorDataset(features), batch_size=batch_size, shuffle=False)

    with torch.no_grad():
        # Each batch is a one-element sequence because the dataset wraps a single tensor.
        batch_outputs = [model(batch[0].to(device)).cpu().numpy() for batch in loader]

    return np.concatenate(batch_outputs, axis=0)

In [None]:
FEATURES = [f'enc{i}' for i in range(142)]
TARGETS = ['bind1', 'bind2', 'bind3']

skf = StratifiedKFold(n_splits=CFG.NBR_FOLDS, shuffle=True, random_state=42)
all_preds = []

# Folds are stratified on the number of positive targets per row.
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[TARGETS].sum(1))):

    # Train only the folds listed in the config; the remaining folds are
    # skipped before any tensors are built for them.
    if fold not in CFG.SELECTED_FOLDS:
        continue

    # Convert pandas dataframes to PyTorch tensors.
    # The split indices are positional; .loc works here because `train` is
    # presumably carrying a default RangeIndex - verify if the loading code changes.
    X_train = torch.tensor(train.loc[train_idx, FEATURES].values, dtype=torch.int)
    y_train = torch.tensor(train.loc[train_idx, TARGETS].values, dtype=torch.float16)
    X_val = torch.tensor(train.loc[valid_idx, FEATURES].values, dtype=torch.int)
    y_val = torch.tensor(train.loc[valid_idx, TARGETS].values, dtype=torch.float16)

    # Create TensorDatasets
    train_dataset = TensorDataset(X_train, y_train)
    valid_dataset = TensorDataset(X_val, y_val)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=CFG.BATCH_SIZE, shuffle=True, num_workers=8)
    valid_loader = DataLoader(valid_dataset, batch_size=CFG.BATCH_SIZE, num_workers=8)

    model = MyModel(lr=CFG.LR, weight_decay=CFG.WD)

    # Stop after 5 epochs without a val_loss improvement and keep only the
    # checkpoint with the lowest val_loss for this fold.
    early_stop_callback = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=True)
    checkpoint_callback = ModelCheckpoint(monitor="val_loss",
                                          dirpath=paths.OUTPUT_DIR,
                                          filename=f"model-{fold}",
                                          save_top_k=1,
                                          mode="min")
    lr_monitor = LearningRateMonitor(logging_interval='epoch')

    trainer = pl.Trainer(
        max_epochs=CFG.EPOCHS,
        callbacks=[early_stop_callback, checkpoint_callback, lr_monitor],
        devices=1,
        accelerator = device,  # Adjust based on your hardware
        enable_progress_bar=True,
    )

    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

    # Evaluate and predict with the best checkpoint, not the last epoch's weights.
    model = MyModel.load_from_checkpoint(checkpoint_callback.best_model_path)

    oof = predict_data_in_batches(model, train.loc[valid_idx, FEATURES], FEATURES, CFG.BATCH_SIZE, device)
    print('fold :', fold, 'CV score =', APS(y_val.numpy(), oof, average='micro'))

    # pred for test data
    preds = predict_data_in_batches(model, test, FEATURES, CFG.BATCH_SIZE, device)
    all_preds.append(preds)

if not all_preds:
    # Averaging an empty list would silently yield NaN.
    raise ValueError(f'no fold was trained: CFG.SELECTED_FOLDS={CFG.SELECTED_FOLDS} selects none of the {CFG.NBR_FOLDS} folds')

# Average the test predictions over the trained folds.
preds = np.mean(all_preds, 0)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/working/output/exp001 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params
------------------------------------------------------
0 | embedding       | Embedding         | 4.7 K 
1 | conv1           | Conv1d            | 12.3 K
2 | conv2           | Conv1d            | 6.2 K 
3 | conv3           | Conv1d            | 18.5 K
4 | global_max_pool | AdaptiveMaxPool1d | 0     
5 | fc1             | Linear            | 99.3 K
6 | dropout         | Dropout           | 0     
7 | fc2             | Linear            | 1.0 M 
8 | fc3             | Linear            | 524 K 
9 | output          | Linear            | 1.5 K 
--------------------------------------------

Epoch 0: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 22426/22426 [07:59<00:00, 46.74it/s, v_num=14]
Validation: |                                                                                                                                   | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                                                                                                            | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                               | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                      | 1/1602 [00:00<00:14, 113.60it/s][A
Validation DataLoader 0:   0%|▏                                                                                                     | 2/1602 [00:00<00:11

Metric val_loss improved. New best score: 0.044


Epoch 1: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 22426/22426 [08:12<00:00, 45.55it/s, v_num=14]
Validation: |                                                                                                                                   | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                                                                                                            | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                               | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                      | 1/1602 [00:00<00:09, 161.53it/s][A
Validation DataLoader 0:   0%|▏                                                                                                     | 2/1602 [00:00<00:11

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.043


Epoch 2: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 22426/22426 [08:14<00:00, 45.38it/s, v_num=14]
Validation: |                                                                                                                                   | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                                                                                                            | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                               | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                      | 1/1602 [00:00<00:11, 133.44it/s][A
Validation DataLoader 0:   0%|▏                                                                                                     | 2/1602 [00:00<00:10

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.042


Epoch 4: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 22426/22426 [08:13<00:00, 45.47it/s, v_num=14]
Validation: |                                                                                                                                   | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                                                                                                            | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                               | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                      | 1/1602 [00:00<00:10, 150.26it/s][A
Validation DataLoader 0:   0%|▏                                                                                                     | 2/1602 [00:00<00:10

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.040


Epoch 5: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 22426/22426 [08:14<00:00, 45.39it/s, v_num=14]
Validation: |                                                                                                                                   | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                                                                                                            | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                               | 0/1602 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                      | 1/1602 [00:00<00:13, 117.86it/s][A
Validation DataLoader 0:   0%|▏                                                                                                     | 2/1602 [00:00<00:11

# Submission

In [None]:
# Build the submission: every test row takes the prediction column that
# matches its protein (0 -> BRD4, 1 -> HSA, 2 -> sEH).
tst = pd.read_parquet(DATA_DIR / 'test.parquet')
if len(preds) != len(tst):
    # Row-wise alignment between preds and the test file is assumed below.
    raise ValueError(f'preds has {len(preds)} rows but the test file has {len(tst)}')

# Start from a float column so assigning float predictions does not rely on an
# implicit int -> float upcast.
tst['binds'] = 0.0
for column, protein in enumerate(('BRD4', 'HSA', 'sEH')):
    rows = (tst['protein_name'] == protein).values
    tst.loc[rows, 'binds'] = preds[rows, column]

# NOTE(review): values are written exactly as produced by the prediction step;
# if those are logits rather than probabilities, confirm that is acceptable.
tst[['id', 'binds']].to_csv(paths.OUTPUT_DIR / 'submission.csv', index = False)

In [None]:
import os

# Get the number of CPU cores reported by the OS (os.cpu_count() may return None).
cpu_count = os.cpu_count()