# Leash Bio

- [DataSet](https://www.kaggle.com/datasets/ahmedelfazouan/belka-enc-dataset)
- 分子全体のsmilesをembeddingしたものを特徴料で使用
- simple 1dcnn model trained on 20 epochs.

## ref
- https://www.kaggle.com/code/yyyu54/pytorch-version-belka-1dcnn-starter-with-all-data
- https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data/notebook

- Notes: the embedding layer in pytorch is different than tensorflow, in which it doesn't have the mask_zero option, so I had to change the num of embedding to 37 to make it work. Please let me know if there's a better way to implement it!

In [1]:
exp_no = '002'
DEBUG = True

In [2]:
# !pip install fastparquet -q
# ! pip install --quiet "ipython[notebook]>=8.0.0, <8.12.0" "lightning>=2.0.0rc0" "setuptools==67.4.0" "torch>=1.8.1, <1.14.0" "torchvision" "pytorch-lightning>=1.4, <2.0.0" "torchmetrics>=0.7, <0.12"
# ! pip install -U torch_xla -q

In [5]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
from sklearn.metrics import average_precision_score as APS
from sklearn.model_selection import StratifiedKFold

import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

from pytorch_lightning import LightningModule
from pytorch_lightning import LightningDataModule, Trainer, seed_everything
from pytorch_lightning.callbacks import (
    ModelCheckpoint, 
    EarlyStopping,
    TQDMProgressBar,
    LearningRateMonitor,
    ModelCheckpoint,
    RichModelSummary,
    RichProgressBar,
)
from pytorch_lightning.loggers import TensorBoardLogger

In [6]:
import os
from pathlib import Path

def is_kaggle_kernel():
    return os.path.exists('/kaggle/working')

if is_kaggle_kernel():

    BASE_DIR = Path("/kaggle")
    DATA_DIR = BASE_DIR / "input"
    OUTPUT_DIR = BASE_DIR / "working"
    print('on kaggle notebook')

else:
    BASE_DIR = Path(os.getcwd()) / './../'
    DATA_DIR = BASE_DIR / "data"
    OUTPUT_DIR = BASE_DIR / "output/exp{exp_no}"
    
# set device
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():    
    device = "cuda"
else:
    device = "cpu"
    
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print('Using', torch.cuda.device_count(), 'GPU(s)')
print('pytorch:', torch.__version__)

Using 0 GPU(s)
pytorch: 2.3.0


In [38]:
class config:
    SEED = 2024
    
    PREPROCESS = False
    EPOCHS = 30 #20
    BATCH_SIZE = 4096
    NUM_WORKERS = 16
    
    LR = 1e-3
    WEIGHT_DECAY = 1e-6
    MIXED_PRECISION = True
    
    NUM_FOLDS = 5    
    USE_NUM_FOLD = 1
    
class paths:    
    DATA_DIR = DATA_DIR
    TRAIN_PATH = DATA_DIR / "train.parquet"
    TEST_PATH = DATA_DIR / "test.parquet"
    OUTPUT_DIR = OUTPUT_DIR
    MODEL_WEIGHTS_DIR = OUTPUT_DIR / "bio-models-exp{exp_no}"
    
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [40]:
print('fix seed')
seed_everything(config.SEED, workers=True)

FEATURES = [f'enc{i}' for i in range(142)]
TARGETS = ['bind1', 'bind2', 'bind3']

Seed set to 2024


fix seed


# **Loda Data**

In [9]:
df_train = pd.read_parquet(paths.DATA_DIR / 'belka-enc-dataset/train_enc.parquet')
df_test = pd.read_parquet(paths.DATA_DIR / 'belka-enc-dataset/test_enc.parquet')
    
df_train = df_train.sample(100000).reset_index(drop=True)
df_test = df_test.sample(100000).reset_index(drop=True)

In [18]:
# set fold
skf = StratifiedKFold(n_splits=config.NUM_FOLDS, shuffle=True, random_state=42)
folds_list = []
for fold, (train_idx, valid_idx) in enumerate(skf.split(df_train, df_train[TARGETS].sum(1))):
    folds_list.append((train_idx, valid_idx))


# **Dataset & DataModule**

In [10]:
class BioDataset(torch.utils.data.Dataset):
    
    def __init__(
        self,
        array: np.ndarray,
        mode: str,
    ):
        super().__init__()
        self.array = array
        self.mode = mode
        
    def __len__(self):
        return len(self.array)
    
    def __getitem__(self, index):
        
        if self.mode=='train':
            X = self.array[index, :-3]
            y = self.array[index, -3:]
        else:
            X = self.array[index, :]
            y = np.zeros(3)
        
        output = {
            'X': torch.tensor(X, dtype=torch.float32),
            'y': torch.tensor(y, dtype=torch.float32),
        }        
        return output

In [13]:
# Check Dataset
dataset = BioDataset(df_train.values, 'train')
dataset[0]

{'X': tensor([ 8., 12., 27., 12., 12., 17., 33., 12., 18., 35., 12., 17., 33., 12.,
          4., 12., 12., 12., 35., 12.,  4.,  8., 19., 35., 12., 17., 33., 12.,
          4., 12., 17.,  7., 19., 12., 12., 12., 17., 31.,  9., 19., 12.,  4.,
          8., 17., 26., 28., 19., 33., 29., 30.,  2., 32., 19., 35., 18., 19.,
         35., 29., 35.,  5., 32., 27.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.]),
 'y': tensor([0., 0., 0.])}

In [39]:
# lightning data module
class BioDataModule(LightningDataModule):
    def __init__(self, df, train_idx, valid_idx):
        super().__init__()
        
        self.train_df = df.iloc[train_idx, :]
        self.valid_df = df.iloc[valid_idx, :]

    def train_dataloader(self):
        train_dataset = BioDataset(self.train_df, 'train')
        train_dataloader = torch.utils.data.DataLoader(
                                train_dataset,
                                batch_size=config.BATCH_SIZE,
                                shuffle=True,
                                num_workers=config.NUM_WORKERS,
                                pin_memory=True,
                                persistent_workers=True
                            )
        return train_dataloader

    def val_dataloader(self):
        valid_dataset = BioDataset(self.valid_df, 'valid')
        valid_dataloader = torch.utils.data.DataLoader(
                                            valid_dataset,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=False,
                                            num_workers=config.NUM_WORKERS,
                                            pin_memory=True,
                                            persistent_workers=True
                                        )
        return valid_dataloader

# **Model**

In [26]:
class BioModel(nn.Module):
    def __init__(self, 
                 input_dim=142, 
                 input_dim_embedding=37, 
                 hidden_dim=128, 
                 num_filters=32, 
                 output_dim=3):
        super(BioModel, self).__init__()
        
        self.input_dim = input_dim
        self.input_dim_embedding = input_dim_embedding
        self.hidden_dim = hidden_dim
        self.num_filters = num_filters
        self.output_dim = output_dim

        self.embedding = nn.Embedding(num_embeddings=self.input_dim_embedding, embedding_dim=self.hidden_dim, padding_idx=0)
        
        self.conv1 = nn.Conv1d(in_channels=self.hidden_dim, out_channels=self.num_filters, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv1d(in_channels=self.num_filters, out_channels=self.num_filters*2, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv1d(in_channels=self.num_filters*2, out_channels=self.num_filters*3, kernel_size=3, stride=1, padding=0)
        
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        
        self.fc1 = nn.Linear(self.num_filters*3, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        
        self.dropout = nn.Dropout(0.1)
        
        self.output = nn.Linear(512, self.output_dim)

    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)
        
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        
        x = self.global_max_pool(x).squeeze(2)
        
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        
        x = self.output(x)
        
        return x

In [33]:
# check model
# test model
dummy_model = BioModel()

total_params = sum(p.numel() for p in dummy_model.parameters())
print(f"Total number of parameters: {total_params}")

dummy_input = torch.randint(0, 37, (42, 142), dtype=torch.long)
output = dummy_model(dummy_input)
print(output.shape)

Total number of parameters: 1717059
torch.Size([42, 3])


# **Lightning Module**

In [36]:
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

In [8]:
class BioModel(LightningModule):
    def __init__(self, lr=1e-3, weight_decay=1e-6):
        
        super(BioModel, self).__init__()
       
        self.model = BioModel()
        self.lr = lr
        
        self.loss_fn = F.binary_cross_entropy_with_logits()
        
        self.validation_step_outputs = []
        
    def forward(self, X):
        pred = self.model(X)
        return pred
    
    def configure_optimizers(self):
        
        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=config.LR,
            weight_decay=config.WEIGHT_DECAY
        )
        # == define learning rate scheduler ==
        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer,
            T_0=config.EPOCHS,
            T_mult=1,
            eta_min=1e-6,
            last_epoch=-1
        )
        
        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'val_loss',
                'frequency': 1
            }
        }
        
    def training_step(self, batch, batch_idx):
        
        X, y = batch.pop('X'), batch.pop('y')
        logits = self(X)
        train_loss = self.loss_fn(logits, y)
        
        self.log('train_loss', train_loss,  on_step=True, on_epoch=True, prog_bar=True, logger=True)
        
        return train_loss

    def validation_step(self, batch, batch_idx):
        
        X, y = batch.pop('X'), batch.pop('y')
        logits = self(X)
        valid_loss = self.loss_fn(logits, y)
        
        self.log('valid_loss', valid_loss, on_step=True, on_epoch=False, prog_bar=True, logger=True)
        
        self.validation_step_outputs.append({"valid_loss":valid_loss})
        
        return valid_loss
    
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        return optimizer
    
    def on_validation_epoch_end(self):
        
        outputs = self.validation_step_outputs
        
        # 各iterationごとのlossを平均
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        self.log("val_loss_epoch", avg_loss, prog_bar=True, logger=True)   
        
        self.validation_step_outputs.clear()
        
        return {'val_loss': avg_loss}

# Train & Inference

In [42]:
def predict_in_batches(model, df):
    
    model.to(device)
    model.eval()
    
    test_dataset = TensorDataset(torch.tensor(df[FEATURES].values, dtype=torch.int))
    test_loader = DataLoader(test_dataset,
                             batch_size=config.BATCH_SIZE, 
                             shuffle=False, 
                             num_workers=config.NUM_WORKERS, 
                             pin_memory=True)
    
    all_preds = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch[0].to(device)
            preds = model(inputs)
            all_preds.append(preds.cpu().numpy())
    
    return np.concatenate(all_preds, axis=0)

In [46]:
def run_training(fold_id, folds_list, df):
    print('================================================================')
    print(f"==== Running training for fold {fold_id} ====")
    
    # == init data module and model ==
    train_idx, valid_idx = folds_list[fold_id]
    model = BioModel()
    datamodule = BioDataModule(df, train_idx, valid_idx)
    
    # == init callback ==
    checkpoint_callback = ModelCheckpoint(monitor='valid_loss',
                                          dirpath=paths.MODEL_WEIGHTS_DIR,
                                          save_top_k=1,
                                          save_last=False,
                                          save_weights_only=True,
                                          filename=f"fold_{fold_id}",
                                          mode='min')
    early_stop_callback = EarlyStopping(monitor="valid_loss", mode="min", patience=5, verbose=True)
    callbacks_to_use = [checkpoint_callback,
                        early_stop_callback,
                        RichModelSummary(),
                        RichProgressBar(),
#                         TQDMProgressBar(refresh_rate=1)
                       ]

    # == init trainer ==
    trainer = Trainer(
        max_epochs=config.EPOCHS,
        callbacks=callbacks_to_use,
        accelerator=config.DEVICE,
        deterministic=True,
        gradient_clip_val=10,
        precision='16-mixed' if config.MIXED_PRECISION else 32,
        logger=TensorBoardLogger('lightning_logs', name=f'exp{exp_no}_fold{fold_id}'),
    )
    
    # == Training ==
    trainer.fit(model, datamodule=datamodule)
    
    # == Prediction by best model==
    weights = torch.load(checkpoint_callback.best_model_path)['state_dict']
    model.load_state_dict(weights)
    
    valid_df = datamodule.valid_df
    
    preds_oof = predict_in_batches(valid_df, model)
    y_oof = valid_df[TARGETS].values
    score = APS(y_oof, preds_oof, average='micro')
    
    print(f'fold:{fold} | CV score = {score}')
    
    preds_test = predict_in_batches(df_test, model)
    
    del model, datamodule, trainer, preds_oof, y_oof
    gc.collect()
    
    return score, preds_test

In [None]:
# training
# torch.set_float32_matmul_precision('high')

all_preds = []
score_list = []
for fold_id in range(config.FOLDS):
    score, preds_test = run_training(fold_id, folds_list, df_train)
    
    score_list.append(score)
    all_preds.append(preds_test)
    
    # ファイルに書き込み
    score_list = [str(loss) for loss in score_list]
    with open(paths.OUTPUT_DIR / 'cv_result.txt', 'w') as file:
        file.write(', '.join(score_list))
        
preds = np.mean(all_preds, 0)

# **Submission**

In [47]:
test = pd.read_parquet(paths.DATA_DIR / 'test.parquet')
test['binds'] = 0
test.loc[test['protein_name']=='BRD4', 'binds'] = preds[(test['protein_name']=='BRD4').values, 0]
test.loc[test['protein_name']=='HSA', 'binds'] = preds[(test['protein_name']=='HSA').values, 1]
test.loc[test['protein_name']=='sEH', 'binds'] = preds[(test['protein_name']=='sEH').values, 2]
test[['id', 'binds']].to_csv(paths.OUTPUT_DIR / 'submission.csv', index = False)

NameError: name 'preds' is not defined