# Leash Bio

- [DataSet](https://www.kaggle.com/datasets/ahmedelfazouan/belka-enc-dataset)
- 各building blockごとにembeddingして特徴量として使用
- Simple 1D-CNN model trained for 30 epochs.

## ref
- https://www.kaggle.com/code/yyyu54/pytorch-version-belka-1dcnn-starter-with-all-data
- https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data/notebook

- Notes: the embedding layer in PyTorch differs from the one in TensorFlow: it has no `mask_zero` option, so I had to change the number of embeddings to 37 to make it work. Please let me know if there is a better way to implement it!

In [31]:
# Experiment identifier; it is interpolated into the output directory and logger names.
exp_no = "003"
# Debug switch: when true, later cells subsample the data and train for a single epoch.
DEBUG = False
# Fraction of the loaded training rows that is kept for this run.
data_ratio = 1 / 3

In [32]:
import psutil

# Ask psutil for both CPU core counts: logical cores first, then physical cores.
logical_cores, physical_cores = (
    psutil.cpu_count(logical=True),
    psutil.cpu_count(logical=False),
)

# Report the two counts (the labels read "logical core count" / "physical core count").
print(f"論理コア数: {logical_cores}")
print(f"物理コア数: {physical_cores}")

論理コア数: 48
物理コア数: 24


In [33]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
from sklearn.metrics import average_precision_score as APS
from sklearn.model_selection import StratifiedKFold

import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

from pytorch_lightning import LightningModule
from pytorch_lightning import LightningDataModule, Trainer
# seed_everything
from pytorch_lightning.callbacks import (
    ModelCheckpoint, 
    EarlyStopping,
    TQDMProgressBar,
    LearningRateMonitor,
    ModelCheckpoint,
    RichModelSummary,
    RichProgressBar,
)
from pytorch_lightning.loggers import TensorBoardLogger
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import warnings
warnings.simplefilter('ignore')

from funcs.tokenize import tokenize_smiles

In [34]:
import os
from pathlib import Path

def is_kaggle_kernel():
    """Return True when the path '/kaggle/working' exists on this machine."""
    kaggle_working_dir = '/kaggle/working'
    return os.path.exists(kaggle_working_dir)

# Resolve the project directories: fixed locations on Kaggle, otherwise paths
# below the parent of the current working directory.
if not is_kaggle_kernel():
    BASE_DIR = Path(os.getcwd()) / './../'
    DATA_DIR, OUTPUT_DIR = BASE_DIR / "data", BASE_DIR / f"output/exp{exp_no}"
else:
    BASE_DIR = Path("/kaggle")
    DATA_DIR, OUTPUT_DIR = BASE_DIR / "input", BASE_DIR / "working"
    print('on kaggle notebook')

# Pick the compute device: MPS is preferred, then CUDA, with CPU as the fallback.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# Report the number of CUDA devices and the installed PyTorch version.
print('Using', torch.cuda.device_count(), 'GPU(s)')
print('pytorch:', torch.__version__)

Using 1 GPU(s)
pytorch: 2.0.0


In [35]:
class config:
    """Training hyper-parameters and run settings read by the cells below."""

    # Reproducibility
    SEED = 2024

    # Data pipeline
    PREPROCESS = False
    BATCH_SIZE = 4096
    NUM_WORKERS = 16

    # Optimisation (a single epoch is used when DEBUG is on)
    EPOCHS = 1 if DEBUG else 30
    LR = 1e-3
    WEIGHT_DECAY = 1e-6
    MIXED_PRECISION = True

    # Cross-validation
    NUM_FOLDS = 5
    # NOTE(review): USE_NUM_FOLD is not referenced by any cell visible here — confirm
    # whether the fold loop is supposed to honour it.
    USE_NUM_FOLD = 1
    
class paths:
    """Filesystem locations used by the experiment.

    The output directory is created as a side effect of defining this class.
    """

    # Root directories, copied from the module-level values of the same name.
    DATA_DIR = DATA_DIR
    OUTPUT_DIR = OUTPUT_DIR

    # Input files and folders below the data directory.
    TRAIN_PATH = DATA_DIR.joinpath("train.parquet")
    TEST_PATH = DATA_DIR.joinpath("test.parquet")
    SHRUNKEN_DATA_DIR = DATA_DIR.joinpath("shrunken-train-set")

    # Directory handed to the checkpoint callback for this experiment's weights.
    MODEL_WEIGHTS_DIR = OUTPUT_DIR.joinpath(f"bio-models-exp{exp_no}")

    # Create the output directory (and any missing parents) at class-definition time.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [36]:
# Progress message printed before the random seeds are fixed.
print('fix seed')

def my_seed_everything(seed: int):
    """Seed Python's `random`, NumPy and PyTorch, and export PYTHONHASHSEED.

    Args:
        seed: Value handed to every seeding call and stored (as a string) in
            the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    
# Lightning's seed_everything is left disabled; the local helper is used instead.
# seed_everything(config.SEED, workers=True)
my_seed_everything(config.SEED)

fix seed


# **Load Data**

In [37]:
# Column names: the three building-block columns and the three binding targets.
bb_cols = [
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
]
TARGETS = ['binds_BRD4', 'binds_HSA', 'binds_sEH']

# Load only the building-block and target columns of the shrunken training set.
df_train = pd.read_parquet(
    paths.DATA_DIR / 'shrunken-train-set/train.parquet',
    columns=[*bb_cols, *TARGETS],
)

# In debug mode, first cut the frame down to 100000 randomly sampled rows.
if DEBUG:
    df_train = df_train.sample(n=100000).reset_index(drop=True)

# Keep a random `data_ratio` share of the remaining rows and renumber the index.
len_train = int(data_ratio * len(df_train))
df_train = df_train.sample(n=len_train).reset_index(drop=True)

In [38]:
# building block smiles
# NOTE: the index <-> SMILES mapping does not seem to match between train and test.
def _load_pickle(path):
    """Unpickle and return the object stored at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _invert(mapping):
    """Return a new dict with the keys and values of `mapping` swapped."""
    return {val: key for key, val in mapping.items()}


# Building-block dictionaries shipped with the shrunken training set.
train_dicts_bb1 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_1.p')
train_dicts_bb2 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_2.p')
train_dicts_bb3 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'train_dicts/BBs_dict_reverse_3.p')

# The matching dictionaries for the test set.
test_dicts_bb1 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_1_test.p')
test_dicts_bb2 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_2_test.p')
test_dicts_bb3 = _load_pickle(paths.SHRUNKEN_DATA_DIR / 'test_dicts/BBs_dict_reverse_3_test.p')

# Value -> key views of the test dictionaries.
test_dicts_bb1_reverse = _invert(test_dicts_bb1)
test_dicts_bb2_reverse = _invert(test_dicts_bb2)
test_dicts_bb3_reverse = _invert(test_dicts_bb3)

In [39]:
# Read the test set and discard the full-molecule SMILES column.
df_test = pd.read_parquet(paths.DATA_DIR / 'test.parquet').drop(columns=['molecule_smiles'])

# Replace every building-block value through the matching inverted test dictionary.
for _bb_col, _bb_lookup in (
    ('buildingblock1_smiles', test_dicts_bb1_reverse),
    ('buildingblock2_smiles', test_dicts_bb2_reverse),
    ('buildingblock3_smiles', test_dicts_bb3_reverse),
):
    df_test[_bb_col] = df_test[_bb_col].map(_bb_lookup)

In [40]:
# set fold
# Shuffled stratified K-fold split; the stratification label is the row-wise
# sum of the TARGETS columns.
skf = StratifiedKFold(n_splits=config.NUM_FOLDS, shuffle=True, random_state=42)
folds_list = []  # one (train_idx, valid_idx) tuple per fold, in fold order
# NOTE(review): `fold`, `train_idx` and `valid_idx` stay bound at module level
# after this loop; keep them as long as any later cell still reads them.
for fold, (train_idx, valid_idx) in enumerate(skf.split(df_train, df_train[TARGETS].sum(1))):
    folds_list.append((train_idx, valid_idx))

# **Make Features**

In [41]:
# tokenize smiles
# Run tokenize_smiles over every building-block dictionary: the three train
# dictionaries first, then the three test dictionaries.
df_train_bb1, df_train_bb2, df_train_bb3 = (
    tokenize_smiles(bb_dict)
    for bb_dict in (train_dicts_bb1, train_dicts_bb2, train_dicts_bb3)
)
df_test_bb1, df_test_bb2, df_test_bb3 = (
    tokenize_smiles(bb_dict)
    for bb_dict in (test_dicts_bb1, test_dicts_bb2, test_dicts_bb3)
)


# **Dataset & DataModule**

In [42]:
# Notebook display of the test frame (its building-block columns were remapped earlier).
df_test

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,protein_name
0,295246830,0,17,17,BRD4
1,295246831,0,17,17,HSA
2,295246832,0,17,17,sEH
3,295246833,0,17,87,BRD4
4,295246834,0,17,87,HSA
...,...,...,...,...,...
1674891,296921721,340,1051,292,HSA
1674892,296921722,340,1051,292,sEH
1674893,296921723,340,1051,940,BRD4
1674894,296921724,340,1051,940,HSA


In [43]:
class BioDataset(torch.utils.data.Dataset):
    """Map-style dataset that assembles one token sequence per row of `df`.

    The first three columns of a row are used as row positions into `df_bb1`,
    `df_bb2` and `df_bb3`; the three looked-up rows are concatenated into the
    model input `X`. In 'train' mode the last three columns of the row become
    the target `y`; in every other mode `y` is a zero vector of length 3.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        df_bb1: pd.DataFrame,
        df_bb2: pd.DataFrame,
        df_bb3: pd.DataFrame,
        mode = 'train'
    ):
        super().__init__()
        self.mode = mode
        # Keep plain NumPy arrays so __getitem__ avoids pandas indexing.
        self.df = df.values
        self.bb1_array, self.bb2_array, self.bb3_array = (
            frame.values for frame in (df_bb1, df_bb2, df_bb3)
        )

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return {'X': int32 token tensor, 'y': float16 target tensor} for one row."""
        record = self.df[index]
        lookup_tables = (self.bb1_array, self.bb2_array, self.bb3_array)

        # Column i of the record selects the row of the i-th building-block table.
        tokens = np.concatenate(
            [table[record[pos]] for pos, table in enumerate(lookup_tables)]
        )

        # Only 'train' mode exposes the labels stored in the last three columns.
        labels = record[-3:] if self.mode == 'train' else np.zeros(3)

        return {
            'X': torch.tensor(tokens, dtype=torch.int),
            'y': torch.tensor(labels, dtype=torch.float16),
        }

In [44]:
# Check Dataset
# Debug-only smoke test: build a dataset in 'valid' mode and print the shapes
# of the input and target tensors of its first item.
if DEBUG:
    dataset = BioDataset(df_train, df_train_bb1, df_train_bb2, df_train_bb3, mode='valid')
    first_item = dataset[0]
    X, y = first_item['X'], first_item['y']
    for tensor in (X, y):
        print(tensor.shape)

In [45]:
# lightning data module
class BioDataModule(LightningDataModule):
    """Lightning data module that serves one cross-validation fold.

    Args:
        df_train: Training frame that is split into the two parts of the fold.
        train_idx: Positional row indices of the training part.
        valid_idx: Positional row indices of the validation part.
    """

    def __init__(self, df_train, train_idx, valid_idx):
        super().__init__()

        self.train_df = df_train.iloc[train_idx, :]
        self.valid_df = df_train.iloc[valid_idx, :]
        # Copy of the module-level test frame; not read inside this class.
        self.test_df = df_test.copy()

    @staticmethod
    def _make_loader(dataset, *, shuffle, drop_last):
        """Build a DataLoader with the batch size and worker settings from `config`."""
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=config.BATCH_SIZE,
            shuffle=shuffle,
            num_workers=config.NUM_WORKERS,
            pin_memory=True,
            # persistent_workers=True is rejected by DataLoader when num_workers == 0.
            persistent_workers=config.NUM_WORKERS > 0,
            drop_last=drop_last,
        )

    def train_dataloader(self):
        """Return the shuffled training loader (incomplete last batch dropped)."""
        train_dataset = BioDataset(self.train_df, df_train_bb1, df_train_bb2, df_train_bb3, mode='train')
        return self._make_loader(train_dataset, shuffle=True, drop_last=True)

    def val_dataloader(self):
        """Return the ordered validation loader, including the real labels."""
        # BioDataset only returns the stored labels in 'train' mode; any other
        # mode yields all-zero targets, which would make the validation loss
        # (and the checkpoint / early-stopping callbacks that monitor it)
        # measure distance to "never binds" instead of the true labels.
        valid_dataset = BioDataset(self.valid_df, df_train_bb1, df_train_bb2, df_train_bb3, mode='train')
        return self._make_loader(valid_dataset, shuffle=False, drop_last=False)

# **Model**

In [46]:
class BioModel(nn.Module):
    """1D-CNN over embedded token sequences that emits one logit per output.

    Pipeline: embedding -> three unpadded Conv1d + ReLU layers -> global max
    pooling over the sequence axis -> three Linear + ReLU + Dropout layers ->
    final Linear output layer (no activation).
    """

    def __init__(self, 
                 input_dim=240, 
                 input_dim_embedding=37, 
                 hidden_dim=128, 
                 num_filters=32, 
                 output_dim=3):
        super().__init__()

        # Keep the constructor arguments available as attributes.
        self.input_dim = input_dim
        self.input_dim_embedding = input_dim_embedding
        self.hidden_dim = hidden_dim
        self.num_filters = num_filters
        self.output_dim = output_dim

        # Token id 0 is the padding index, so its embedding stays at zero.
        self.embedding = nn.Embedding(input_dim_embedding, hidden_dim, padding_idx=0)

        # Three convolutions with kernel size 3, stride 1 and no padding.
        conv_kwargs = dict(kernel_size=3, stride=1, padding=0)
        self.conv1 = nn.Conv1d(hidden_dim, num_filters, **conv_kwargs)
        self.conv2 = nn.Conv1d(num_filters, num_filters * 2, **conv_kwargs)
        self.conv3 = nn.Conv1d(num_filters * 2, num_filters * 3, **conv_kwargs)

        self.global_max_pool = nn.AdaptiveMaxPool1d(1)

        # Fully connected head.
        self.fc1 = nn.Linear(num_filters * 3, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)

        self.dropout = nn.Dropout(0.1)

        self.output = nn.Linear(512, output_dim)

    def forward(self, x):
        """Map a (batch, sequence) tensor of token ids to (batch, output_dim) logits."""
        # (batch, seq) -> (batch, seq, hidden) -> (batch, hidden, seq) for Conv1d.
        feats = self.embedding(x).permute(0, 2, 1)

        for conv in (self.conv1, self.conv2, self.conv3):
            feats = F.relu(conv(feats))

        # Collapse the sequence axis: (batch, channels, 1) -> (batch, channels).
        feats = self.global_max_pool(feats).squeeze(2)

        for dense in (self.fc1, self.fc2, self.fc3):
            feats = self.dropout(F.relu(dense(feats)))

        return self.output(feats)

In [47]:
# check model
# Debug-only smoke test: count the parameters and push one random batch of
# token ids through a freshly built model.
if DEBUG:
    dummy_model = BioModel()
    total_params = 0
    for param in dummy_model.parameters():
        total_params += param.numel()
    print(f"Total number of parameters: {total_params}")

    dummy_input = torch.randint(0, 37, (1, 240), dtype=torch.long)
    output = dummy_model(dummy_input)
    for shown in (output.shape, output):
        print(shown)

# **Lightning Module**

In [48]:
class BioModule(LightningModule):
    """LightningModule that trains BioModel with a BCE-with-logits loss.

    Logged metrics:
        train_loss: per step and per epoch.
        valid_loss: per validation step.
        val_loss_epoch: batch-size-weighted mean of the validation losses of
            one validation epoch.
    """

    def __init__(self):

        super(BioModule, self).__init__()

        self.model = BioModel()
        # Per-batch validation results, collected until on_validation_epoch_end.
        self.validation_step_outputs = []

    def forward(self, X):
        """Return the raw logits of the wrapped model for input `X`."""
        pred = self.model(X)
        return pred

    def configure_optimizers(self):
        """Return Adam plus a cosine-annealing-with-warm-restarts schedule, stepped per epoch."""

        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=config.LR,
            weight_decay=config.WEIGHT_DECAY
        )
        # == define learning rate scheduler ==
        # T_0 equals the number of epochs, so one cosine cycle spans the whole run.
        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer,
            T_0=config.EPOCHS,
            T_mult=1,
            eta_min=1e-6,
            last_epoch=-1
        )
        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'val_loss_epoch',
                'frequency': 1
            }
        }

    @staticmethod
    def _bce_loss(logits, y):
        """Binary cross-entropy on logits, always evaluated on float32 tensors."""
        # The batches may carry float16 targets; casting both operands keeps the
        # loss in full precision whatever precision the trainer runs in.
        return F.binary_cross_entropy_with_logits(logits.float(), y.float())

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss of one batch."""

        X, y = batch.pop('X'), batch.pop('y')
        logits = self(X)
        train_loss = self._bce_loss(logits, y)

        self.log('train_loss', train_loss,  on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=X.size(0))

        return train_loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss of one batch and remember it for the epoch mean."""

        X, y = batch.pop('X'), batch.pop('y')
        logits = self(X)
        valid_loss = self._bce_loss(logits, y)

        self.log('valid_loss', valid_loss, on_step=True, on_epoch=False, prog_bar=True, logger=True, batch_size=X.size(0))

        # Store the batch size too, so the epoch mean can weight every sample equally.
        self.validation_step_outputs.append({
            "valid_loss": valid_loss.detach(),
            "batch_size": X.size(0),
        })

        return valid_loss

    # NOTE(review): `_train_dataloader` / `_validation_dataloader` are not
    # assigned anywhere in this class, and `validation_dataloader` is not a
    # Lightning hook name (the hook is `val_dataloader`). Both methods are kept
    # unchanged for interface compatibility — confirm whether they can go.
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader

    def on_validation_epoch_end(self):
        """Log `val_loss_epoch` for the finished validation epoch.

        Returns:
            {'val_loss': mean loss}, or None when no validation batch was seen.
        """

        outputs = self.validation_step_outputs
        if not outputs:
            # torch.stack would fail on an empty list; there is nothing to log.
            return None

        # Weight every batch loss by its size: the last batch can be shorter,
        # and a plain mean over batches would over-weight its samples.
        losses = torch.stack([x['valid_loss'] for x in outputs])
        weights = torch.tensor(
            [x.get('batch_size', 1) for x in outputs],
            dtype=losses.dtype,
            device=losses.device,
        )
        avg_loss = (losses * weights).sum() / weights.sum()
        self.log("val_loss_epoch", avg_loss, prog_bar=True, logger=True)

        self.validation_step_outputs.clear()

        return {'val_loss': avg_loss}

# Train & Inference

In [49]:
def predict_in_batches(model, df, df_bb1, df_bb2, df_bb3):
    """Run `model` over every row of `df` and return sigmoid probabilities.

    The model is moved to the module-level `device` and left in eval mode.

    Args:
        model: Callable module that maps a batch of token ids to logits.
        df: Frame whose first three columns select rows of the building-block tables.
        df_bb1: Token table for building block 1.
        df_bb2: Token table for building block 2.
        df_bb3: Token table for building block 3.

    Returns:
        NumPy array with one row of probabilities per row of `df`, in `df` order.
    """

    model.to(device)
    model.eval()

    # 'valid' mode: the dataset supplies dummy zero targets, which are ignored here.
    dataset = BioDataset(df, df_bb1, df_bb2, df_bb3, mode='valid')
    dataloader = torch.utils.data.DataLoader(
                                        dataset,
                                        batch_size=config.BATCH_SIZE,
                                        shuffle=False,
                                        num_workers=config.NUM_WORKERS,
                                        pin_memory=True,
                                        # The loader is iterated exactly once, so there is
                                        # nothing to keep alive between epochs, and
                                        # persistent_workers=True is rejected when
                                        # num_workers == 0.
                                        persistent_workers=False,
                                        drop_last=False,
                                    )

    all_preds = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['X'].to(device)
            logits = model(inputs)
            preds = torch.sigmoid(logits)
            all_preds.append(preds.cpu().numpy())

    return np.concatenate(all_preds, axis=0)

In [50]:
def run_training(fold_id, folds_list, df):
    """Train one fold, score its out-of-fold predictions and predict the test set.

    Args:
        fold_id: Position of the fold inside `folds_list`.
        folds_list: Sequence of (train_idx, valid_idx) index pairs.
        df: Training frame that the fold indices refer to.

    Returns:
        Tuple (score, preds_test): the micro-averaged average-precision score on
        the validation part of the fold and the test-set predictions of the best
        checkpoint.
    """
    print('================================================================')
    print(f"==== Running training for fold {fold_id} ====")

    # == init data module and model ==
    train_idx, valid_idx = folds_list[fold_id]
    model = BioModule()
    datamodule = BioDataModule(df, train_idx, valid_idx)

    # == init callback ==
    # Keep only the weights of the epoch with the lowest monitored validation loss.
    checkpoint_callback = ModelCheckpoint(monitor='val_loss_epoch',
                                          dirpath=paths.MODEL_WEIGHTS_DIR,
                                          save_top_k=1,
                                          save_last=False,
                                          save_weights_only=True,
                                          filename=f"fold_{fold_id}",
                                          mode='min')
    early_stop_callback = EarlyStopping(monitor="val_loss_epoch", mode="min", patience=5, verbose=True)
    callbacks_to_use = [checkpoint_callback,
                        early_stop_callback,
                        RichModelSummary(),
                        RichProgressBar(),
#                         TQDMProgressBar(refresh_rate=1)
                       ]

    # == init trainer ==
    trainer = Trainer(
        max_epochs=config.EPOCHS,
        callbacks=callbacks_to_use,
        accelerator=device,
        deterministic=False,
#         gradient_clip_val=10,
        precision='16-mixed' if config.MIXED_PRECISION else 32,
        logger=TensorBoardLogger('lightning_logs', name=f'exp{exp_no}_fold{fold_id}'),
    )

    # == Training ==
    trainer.fit(model, datamodule=datamodule)

    # == Prediction by best model==
    weights = torch.load(checkpoint_callback.best_model_path)['state_dict']
    model.load_state_dict(weights)

    valid_df = datamodule.valid_df

    preds_oof = predict_in_batches(model, valid_df, df_train_bb1, df_train_bb2, df_train_bb3)
    y_oof = valid_df[TARGETS].values
    score = APS(y_oof, preds_oof, average='micro')

    # Report the fold that was actually trained (the function argument), not a
    # loop variable that happens to exist at module level.
    print(f'fold:{fold_id} | CV score = {score}')

    # The `id` column is dropped so the building-block columns come first.
    preds_test = predict_in_batches(model, df_test.drop('id',axis=1), df_test_bb1, df_test_bb2, df_test_bb3)

    # Release the per-fold objects before the next fold starts.
    del model, datamodule, trainer, preds_oof, y_oof
    gc.collect()

    return score, preds_test

In [None]:
# training
# torch.set_float32_matmul_precision('high')

# Silence the tokenizers parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

all_preds = []
score_list = []

# Column of the prediction matrix that belongs to each protein (same order as TARGETS).
protein_to_col = {'BRD4': 0, 'HSA': 1, 'sEH': 2}

# NOTE(review): every fold is trained; config.USE_NUM_FOLD is not consulted here —
# confirm whether the loop should be limited to that many folds.
for fold_id in range(config.NUM_FOLDS):
    score, preds_test = run_training(fold_id, folds_list, df_train)

    score_list.append(score)
    all_preds.append(preds_test)

    # Rewrite the CV summary after every fold; score_list itself stays numeric.
    with open(paths.OUTPUT_DIR / 'cv_result.txt', 'w') as file:
        file.write(', '.join(str(fold_score) for fold_score in score_list))

    # Average the test predictions of all folds finished so far.
    preds = np.mean(all_preds, 0)

    # test = pd.read_parquet(paths.DATA_DIR / 'test.parquet')
    # Start from a float column so the probabilities are stored without
    # upcasting an integer column.
    df_test['binds'] = 0.0
    for protein, col in protein_to_col.items():
        mask = (df_test['protein_name'] == protein).to_numpy()
        df_test.loc[mask, 'binds'] = preds[mask, col]
    df_test[['id', 'binds']].to_csv(paths.OUTPUT_DIR / f'submission_fold{fold_id}.csv', index = False)

# **Submission**

In [None]:
# df_test['binds'] = 0
# df_test.loc[df_test['protein_name']=='BRD4', 'binds'] = preds[df_test['protein_name']=='BRD4', 0]
# df_test.loc[df_test['protein_name']=='HSA', 'binds'] = preds[df_test['protein_name']=='HSA', 1]
# df_test.loc[df_test['protein_name']=='sEH', 'binds'] = preds[df_test['protein_name']=='sEH', 2]
# df_test[['id', 'binds']].to_csv(paths.OUTPUT_DIR / f'submission_fold{fold_id}.csv', index = False)