# Leash Bio

- positive data多めに使う
- EMAアルゴリズム
- EPOCH100, patience10

## ref
- https://www.kaggle.com/code/yyyu54/pytorch-version-belka-1dcnn-starter-with-all-data
- https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data/notebook

In [1]:
exp_no = '059'
DEBUG = True
data_ratio = 1/5

infer_only=False
fold_list=[0,1,2,3,4]

In [2]:
# !pip install rdkit
# !pip install mordred
!pip install timm

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
from sklearn.metrics import average_precision_score as APS
from sklearn.model_selection import StratifiedKFold

import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

from pytorch_lightning import LightningModule
from pytorch_lightning import LightningDataModule, Trainer
# seed_everything
from pytorch_lightning.callbacks import (
    ModelCheckpoint, 
    EarlyStopping,
    ModelCheckpoint,
    RichModelSummary,
    RichProgressBar,
)
from pytorch_lightning.loggers import TensorBoardLogger
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import timm
from timm.utils import ModelEmaV2

from funcs.utils import find_latest_ckpt_path, del_old_ckpt_path
from funcs.calc_descriptor import calc_rdkit_descriptors, calc_ecfp4_descriptors
from funcs.tokenize import tokenize_smiles
from funcs.tokenize import tokenize_ChemBEATa

import warnings
warnings.simplefilter('ignore')

In [4]:
import os
from pathlib import Path

def is_kaggle_kernel():
    return os.path.exists('/kaggle/working')

if is_kaggle_kernel():

    BASE_DIR = Path("/kaggle")
    DATA_DIR = BASE_DIR / "input"
    OUTPUT_DIR = BASE_DIR / "working"
    print('on kaggle notebook')

else:
    BASE_DIR = Path(os.getcwd()) / './../'
    DATA_DIR = BASE_DIR / "data"
    OUTPUT_DIR = BASE_DIR / f"output/exp{exp_no}"
    
# set device
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():    
    device = "cuda"
else:
    device = "cpu"
    
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print('Using', torch.cuda.device_count(), 'GPU(s)')
print('pytorch:', torch.__version__)

Using 0 GPU(s)
pytorch: 2.3.1


In [5]:
class config:
    SEED = 2024
    
    PREPROCESS = False
    EPOCHS = 20 #20
    PATIENCE = 10 #20
    BATCH_SIZE = 4096
    NUM_WORKERS = 16
    
    USE_EMA = False
    
    LR = 1e-3
    WEIGHT_DECAY = 1e-6
    MIXED_PRECISION = True
    
    NUM_FOLDS = 5    
    USE_NUM_FOLD = 1
    
class paths:    
    DATA_DIR = DATA_DIR
    OUTPUT_DIR = OUTPUT_DIR
    MODEL_WEIGHTS_DIR = OUTPUT_DIR / f"bio-models-exp{exp_no}"
    
    SHRUNKEN_DATA_DIR = DATA_DIR / "shrunken-data-capping"

    TRAIN_PATH = SHRUNKEN_DATA_DIR / "train.parquet"
    TEST_PATH = SHRUNKEN_DATA_DIR / "test.parquet"
    SUB_PATH = SHRUNKEN_DATA_DIR / "sub.parquet"
    
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [6]:
print('fix seed')

def my_seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# seed_everything(config.SEED, workers=True)
my_seed_everything(config.SEED)

fix seed


# **Loda Data**

In [7]:
bb_cols = ['buildingblock1_smiles', 'buildingblock2_smiles','buildingblock3_smiles', 
           'buildingblock1_smiles_scaffold', "buildingblock2_smiles_scaffold", "buildingblock3_smiles_scaffold",
           'fold']

TARGETS = ['binds_BRD4', 'binds_HSA','binds_sEH']

df_train = pd.read_parquet(paths.TRAIN_PATH, columns=bb_cols + TARGETS)

if DEBUG:
    df_train = df_train.sample(100000).reset_index(drop=True)
else:
    # 全てのpositiveサンプルとnegativeサンプルをあわせて、希望の数のdatasetができる様にする
    positive = df_train[(df_train[TARGETS]>0).any(axis=1)]
    negative = df_train[(df_train[TARGETS]==0).all(axis=1)]

    len_train = int(len(df_train)*data_ratio)
    use_negative_sample = len_train - len(positive)

    df_train = pd.concat([negative.sample(use_negative_sample, random_state=config.SEED), positive],axis=0).reset_index(drop=True)

In [8]:
# ソフトラベリングを用意する
bb1_mean = df_train.groupby('buildingblock1_smiles')[TARGETS].mean()
bb2_mean = df_train.groupby('buildingblock2_smiles')[TARGETS].mean()
bb3_mean = df_train.groupby('buildingblock3_smiles')[TARGETS].mean()

for target in TARGETS:
    df_train[f'{target}_bb1'] = df_train['buildingblock1_smiles'].map(bb1_mean[target].to_dict())
    df_train[f'{target}_bb2'] = df_train['buildingblock2_smiles'].map(bb2_mean[target].to_dict())
    df_train[f'{target}_bb3'] = df_train['buildingblock3_smiles'].map(bb3_mean[target].to_dict())
    
df_train['binds_BRD4'] = df_train['binds_BRD4'] + df_train['binds_BRD4_bb1'] + df_train['binds_BRD4_bb2'] + df_train['binds_BRD4_bb3']
df_train['binds_HSA'] = df_train['binds_HSA'] + df_train['binds_HSA_bb1'] + df_train['binds_HSA_bb2'] + df_train['binds_HSA_bb3']
df_train['binds_sEH'] = df_train['binds_sEH'] + df_train['binds_sEH_bb1'] + df_train['binds_sEH_bb2'] + df_train['binds_sEH_bb3']

df_train[TARGETS] = df_train[TARGETS].clip(0, 1)

df_train.drop(columns=[f'{target}_bb1' for target in TARGETS], inplace=True)
df_train.drop(columns=[f'{target}_bb2' for target in TARGETS], inplace=True)
df_train.drop(columns=[f'{target}_bb3' for target in TARGETS], inplace=True)

In [9]:
# submitt用のデータ
df_test = pd.read_parquet(paths.SUB_PATH)
df_test.head()

# preudolabeling用zw
cols = ['buildingblock1_smiles', 'buildingblock2_smiles',
       'buildingblock3_smiles', 'buildingblock1_smiles_scaffold',
       'buildingblock2_smiles_scaffold', 'buildingblock3_smiles_scaffold']
df_pseudo = df_test[cols].drop_duplicates().reset_index(drop=True)
df_pseudo.head()

Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,buildingblock1_smiles_scaffold,buildingblock2_smiles_scaffold,buildingblock3_smiles_scaffold
0,0,58,58,91,507,507
1,0,58,160,91,507,776
2,0,58,171,91,507,541
3,0,58,372,91,507,907
4,0,58,561,91,507,543


In [10]:
# 変換用辞書を読み込む
with open(paths.SHRUNKEN_DATA_DIR / 'bb1_smiles2idx.pickle', mode='rb') as f:
    bb1_smiles2idx = pickle.load(f)
with open(paths.SHRUNKEN_DATA_DIR / 'bb23_smiles2idx.pickle', mode='rb') as f:
    bb23_smiles2idx = pickle.load(f)
with open(paths.SHRUNKEN_DATA_DIR / 'bb1_scaffold_smiles2idx.pickle', mode='rb') as f:
    bb1_scaffold_smiles2idx = pickle.load(f)
with open(paths.SHRUNKEN_DATA_DIR / 'bb23_scaffold_smiles2idx.pickle', mode='rb') as f:
    bb23_scaffold_smiles2idx = pickle.load(f)
    
bb1_idx2smiles = {v:k for k,v in bb1_smiles2idx.items()}
bb23_idx2smiles = {v:k for k,v in bb23_smiles2idx.items()}
bb1_scaffold_idx2smiles = {v:k for k,v in bb1_scaffold_smiles2idx.items()}
bb23_scaffold_idx2smiles = {v:k for k,v in bb23_scaffold_smiles2idx.items()}

# **Make Features**

In [11]:
# 標準化
from sklearn.preprocessing import StandardScaler

def standardization(df_list):
    # 複数のdfをまとめて標準化
    df_all = pd.concat(df_list,axis=0)
    df_all.drop_duplicates(inplace=True)
    df_all.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # 標準偏差が0の列を削除
    df_all = df_all.loc[:, df_all.std() != 0]

    # standard scaling
    scaler = StandardScaler()
    scaler.fit(df_all)

    standardized_df_list = []
    for df_temp in df_list:
        df_temp = df_temp.loc[:, df_all.columns]
        df_temp_std = pd.DataFrame(scaler.transform(df_temp), 
                                index=df_temp.index, 
                                columns=df_temp.columns)
        standardized_df_list.append(df_temp_std)
        
    return standardized_df_list


def remove_std0(df_list):
    # 標準偏差が0の列を削除
    df_all = pd.concat(df_list,axis=0)
    df_all.drop_duplicates(inplace=True)
    df_all = df_all.loc[:, df_all.std() != 0]
    
    standardized_df_list = []
    for df_temp in df_list:
        df_temp = df_temp.loc[:, df_all.columns]
        standardized_df_list.append(df_temp)
        
    return standardized_df_list

In [19]:
# tokenize smiles
df_bb1_token = tokenize_ChemBEATa(bb1_idx2smiles)
df_bb23_token = tokenize_ChemBEATa(bb23_idx2smiles)
df_bb1_scf_token = tokenize_ChemBEATa(bb1_scaffold_idx2smiles)
df_bb23_scf_token = tokenize_ChemBEATa(bb23_scaffold_idx2smiles)


Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at DeepCh

In [32]:
# # Rdkit記述子をまとめて標準化
# df_list_rdkit = [
#             df_bb1_rdkit,
#             df_bb23_rdkit, 
#             df_bb1_scf_rdkit, 
#             df_bb23_scf_rdkit,
#             ]
# df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit = standardization(df_list_rdkit)
        
# # ECFP4記述子をまとめて標準化
# df_list_ecfp4 = [
#             df_bb1_ecfp4,
#             df_bb23_ecfp4, 
#             df_bb1_scf_ecfp4, 
#             df_bb23_scf_ecfp4,
#             ]
# df_bb1_ecfp4,df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4 = remove_std0(df_list_ecfp4)



In [22]:
# len_rdkit = df_bb1_rdkit.shape[1]
# len_ecfp4 = df_bb1_ecfp4.shape[1]
len_token = df_bb1_token.shape[1]
print(len_token)
# print(len_rdkit, len_ecfp4)

384


# **Dataset & DataModule**

In [23]:
class BioDataset(torch.utils.data.Dataset):
    
    def __init__(
        self,
        df: pd.DataFrame,
        df_bb1_token: pd.DataFrame,
        df_bb23_token: pd.DataFrame,
        df_bb1_scf_token: pd.DataFrame,
        df_bb23_scf_token: pd.DataFrame,
        mode = 'train'
    ):
        super().__init__()
        
        assert mode in ['train', 'valid', 'test']
        self.mode = mode
        
        meta_cols = ["buildingblock1_smiles", # 0
                     "buildingblock2_smiles", # 1
                     "buildingblock3_smiles", # 2
                     "buildingblock1_smiles_scaffold", # 3
                     "buildingblock2_smiles_scaffold", # 4
                     "buildingblock3_smiles_scaffold", # 5    
                     ]
        
        if (self.mode == 'train') or (self.mode == 'valid'):
            meta_cols += TARGETS
            
        df = df[meta_cols]
        self.df = df[meta_cols].values

        self.bb1_token = df_bb1_token.values
        self.bb23_token = df_bb23_token.values
        self.bb1_scf_token = df_bb1_scf_token.values
        self.bb23_scf_token = df_bb23_scf_token.values

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        
        row = self.df[index, :]
        
        bb1_idx = int(row[0])
        bb2_idx = int(row[1])
        bb3_idx = int(row[2])
        bb1_scf_idx = int(row[3])
        bb2_scf_idx = int(row[4])
        bb3_scf_idx = int(row[5])
        
        # augmentation
        if self.mode == 'train':
            if np.random.rand() < 0.5:
                bb2_idx, bb3_idx, bb2_scf_idx, bb3_scf_idx = bb3_idx, bb2_idx, bb3_scf_idx, bb2_scf_idx

        # bb1_desc = self.bb1_desc[bb1_idx, :]
        # bb2_desc = self.bb23_desc[bb2_idx, :]
        # bb3_desc = self.bb23_desc[bb3_idx, :]
        # bb1_scf_desc = self.bb1_scf_desc[bb1_scf_idx, :]
        # bb2_scf_desc = self.bb23_scf_desc[bb2_scf_idx, :]
        # bb3_scf_desc = self.bb23_scf_desc[bb3_scf_idx, :]
        
        # bb1_ecfp = self.bb1_ecfp[bb1_idx, :]
        # bb2_ecfp = self.bb23_ecfp[bb2_idx, :]
        # bb3_ecfp = self.bb23_ecfp[bb3_idx, :]
        # bb1_scf_ecfp = self.bb1_scf_ecfp[bb1_scf_idx, :]
        # bb2_scf_ecfp = self.bb23_scf_ecfp[bb2_scf_idx, :]
        # bb3_scf_ecfp = self.bb23_scf_ecfp[bb3_scf_idx, :]

        bb1_token = self.bb1_token[bb1_idx, :]
        bb2_token = self.bb23_token[bb2_idx, :]
        bb3_token = self.bb23_token[bb3_idx, :]
        bb1_scf_token = self.bb1_scf_token[bb1_scf_idx, :]
        bb2_scf_token = self.bb23_scf_token[bb2_scf_idx, :]
        bb3_scf_token = self.bb23_scf_token[bb3_scf_idx, :]
        
        # desc1 = np.concatenate([bb1_desc, bb2_desc, bb3_desc, bb1_scf_desc, bb2_scf_desc, bb3_scf_desc])
        # desc2 = np.concatenate([bb1_ecfp, bb2_ecfp, bb3_ecfp, bb1_scf_ecfp, bb2_scf_ecfp, bb3_scf_ecfp])
        token= np.concatenate([bb1_token, bb2_token, bb3_token, bb1_scf_token, bb2_scf_token, bb3_scf_token])
                
        if (self.mode == 'train') or (self.mode == 'valid'):
            y = row[-3:]
        else:
            y = np.zeros(3)
        
        output = {
            # 'desc1': torch.tensor(desc1, dtype=torch.float32),
            # 'desc2': torch.tensor(desc2, dtype=torch.float32),
            'token': torch.tensor(token, dtype=torch.int),
            'y': torch.tensor(y, dtype=torch.float16)
        }        
        return output

In [26]:
# Check Dataset
if DEBUG:
    dataset = BioDataset(df_train, 
                            # df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
                            # df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
                            df_bb1_token, df_bb23_token, df_bb1_scf_token, df_bb23_scf_token,
                            mode='train')
    # desc1 = dataset[0]['desc1']
    # desc2 = dataset[0]['desc2']
    token = dataset[0]['token']
    y = dataset[0]['y']
    # print(desc1.shape)
    # print(desc2.shape)
    print(token.shape)
    print(y.shape)

torch.Size([2304])
torch.Size([3])


In [27]:
# lightning data module
class BioDataModule(LightningDataModule):
    def __init__(self, df_train, fold_id):
        super().__init__()
        
        self.train_df = df_train[df_train['fold'] != fold_id]
        self.valid_df = df_train[df_train['fold'] == fold_id]

    def train_dataloader(self):
        train_dataset = BioDataset(self.train_df, 
                                # df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
                                # df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
                                 df_bb1_token, df_bb23_token, df_bb1_scf_token, df_bb23_scf_token,
                                   mode='train')
        train_dataloader = torch.utils.data.DataLoader(
                                train_dataset,
                                batch_size=config.BATCH_SIZE,
                                shuffle=True,
                                num_workers=config.NUM_WORKERS,
                                pin_memory=True,
                                persistent_workers=True,
                                drop_last=True,
                            )
        return train_dataloader

    def val_dataloader(self):
        valid_dataset = BioDataset(self.valid_df, 
                                # df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
                                # df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
                                 df_bb1_token, df_bb23_token, df_bb1_scf_token, df_bb23_scf_token,
                                   mode='valid')
        valid_dataloader = torch.utils.data.DataLoader(
                                            valid_dataset,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=False,
                                            num_workers=config.NUM_WORKERS,
                                            pin_memory=True,
                                            persistent_workers=True,
                                            drop_last=False,
                                        )
        return valid_dataloader

# **Model**

In [47]:
len_token

384

In [96]:
class BioModel(nn.Module):
    def __init__(self):
        
        super(BioModel, self).__init__()
        self.gru = nn.GRU(384, 128, num_layers=2, batch_first=True)
        self.head = nn.Sequential(
            nn.Linear(768, 1024),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(512, 3),
            )

    def forward(self, x):
        x = x.reshape(x.shape[0], 4, 384)
        out, hn = self.gru(x)
        out = out.reshape(out.shape[0], -1)
        
        out = torch.cat([out, hn.reshape(hn.shape[1], -1)], dim=1)
        out = self.head(out)
        
        return out


In [97]:
# check model
if DEBUG:
    dummy_model = BioModel()
    total_params = sum(p.numel() for p in dummy_model.parameters())
    print(f"Total number of parameters: {total_params}")

    token = torch.rand((64, len_token*4), dtype=torch.float32)
    output = dummy_model(token)
    print(output.shape)

Total number of parameters: 1610243
torch.Size([64, 3])


# **Lightning Module**

In [72]:
def calc_score(y_preds, y_true):
    
    y_true[y_true < 1] = 0
    
    score_BRD4 = APS(y_true[:,0], y_preds[:,0])
    score_HSA = APS(y_true[:,1], y_preds[:,1])
    score_sEH = APS(y_true[:,2], y_preds[:,2])
    score = (score_BRD4 + score_HSA + score_sEH) / 3
    
    return score_BRD4, score_HSA, score_sEH, score

In [78]:
class BioModule(LightningModule):
    def __init__(self):
        
        super(BioModule, self).__init__()
       
        self.model = BioModel()
        
        if config.USE_EMA:
            self.ema = ModelEmaV2(self.model, decay=0.999)
        
        self.validation_step_outputs = []
        self.loss_func = nn.BCEWithLogitsLoss()
        
    def forward(self, token):
        pred = self.model(token)
        return pred
    
    def configure_optimizers(self):
        
        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=config.LR,
            weight_decay=config.WEIGHT_DECAY
        )
        # == define learning rate scheduler ==
        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer,
            T_0=config.EPOCHS,
            T_mult=1,
            eta_min=1e-6,
            last_epoch=-1
        )
        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'valid_loss_epoch',
                'frequency': 1
            }
        }
        
    def training_step(self, batch, batch_idx):
        
        # desc1, desc2, token, y = batch.pop('desc1'), batch.pop('desc2'), batch.pop('token'), batch.pop('y')
        token, y = batch.pop('token'), batch.pop('y')
        logits = self(token)
        train_loss = self.loss_func(logits, y)
        
        self.log('train_loss', train_loss,  on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=token.size(0))
        
        # EMAの更新
        if config.USE_EMA:
            self.ema.update(self.model)
        
        return train_loss

    def validation_step(self, batch, batch_idx):
        
        # desc1, desc2, token, y = batch.pop('desc1'), batch.pop('desc2'), batch.pop('token'), batch.pop('y')
        token, y = batch.pop('token'), batch.pop('y')
        logits = self(token)
        preds = torch.sigmoid(logits)
        
        valid_loss = self.loss_func(logits, y)
        
        self.log('valid_loss', valid_loss, on_step=True, on_epoch=False, prog_bar=True, logger=True, batch_size=token.size(0))
        
        self.validation_step_outputs.append({"valid_loss":valid_loss, "preds":preds, "targets":y})
        
        return valid_loss

    
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader
    
    def calc_score(self, y_preds, y_true):
        return calc_score(y_preds, y_true)

    
    def on_validation_epoch_end(self):
        
        outputs = self.validation_step_outputs
        
        # 各iterationごとのlossを平均
        avg_loss = torch.stack([x['valid_loss'] for x in outputs]).mean()
        self.log("valid_loss_epoch", avg_loss, prog_bar=True, logger=True)
        
        # scoreを計算
        y_preds = torch.cat([x['preds'] for x in outputs]).detach().cpu().numpy()
        y_true = torch.cat([x['targets'] for x in outputs]).detach().cpu().numpy()
        
        score = self.calc_score(y_preds, y_true)[-1]
        self.log("valid_score", score, prog_bar=True, logger=True)
        
        self.validation_step_outputs.clear()
        
        return {'valid_loss_epoch': avg_loss, "valid_score":score}

# Train & Inference

In [79]:
def predict_in_batches(model, df, 
                    #    df_bb1_1, df_bb23_1, df_bb1_scf_1, df_bb23_scf_1,
                    #    df_bb1_2, df_bb23_2, df_bb1_scf_2, df_bb23_scf_2,
                       df_bb1_token, df_bb23_token, df_bb1_scf_token, df_bb23_scf_token,
                       mode):
    
    model.to(device)
    model.eval()
    
    dataset = BioDataset(df, 
                    #    df_bb1_1, df_bb23_1, df_bb1_scf_1, df_bb23_scf_1,
                    #    df_bb1_2, df_bb23_2, df_bb1_scf_2, df_bb23_scf_2,
                       df_bb1_token, df_bb23_token, df_bb1_scf_token, df_bb23_scf_token,
                         mode=mode)
    dataloader = torch.utils.data.DataLoader(
                                        dataset,
                                        batch_size=config.BATCH_SIZE,
                                        shuffle=False,
                                        num_workers=config.NUM_WORKERS,
                                        pin_memory=True,
                                        persistent_workers=True,
                                        drop_last=False,
                                    )

    all_preds = []
    with torch.no_grad():
        for batch in dataloader:
            # desc1 = batch['desc1'].to(device)
            # desc2 = batch['desc2'].to(device)
            token = batch['token'].to(device)
            # logits = model(desc1, desc2, token)
            logits = model(token)
            preds = torch.sigmoid(logits)
            all_preds.append(preds.cpu().numpy())
    
    return np.concatenate(all_preds, axis=0)

In [80]:
def run_training(fold_id, df):
    print(f"======== Running training for fold {fold_id} =============")
    
    # == init data module and model ==
    model = BioModule()
    datamodule = BioDataModule(df, fold_id)
    
    # == init callback ==
    checkpoint_callback = ModelCheckpoint(
                                        monitor='valid_score',
                                          dirpath=paths.MODEL_WEIGHTS_DIR,
                                          save_top_k=1,
                                          save_last=False,
                                          save_weights_only=True,
                                          filename=f"fold_{fold_id}",
                                          mode='max'
                                          )
    early_stop_callback = EarlyStopping(
        monitor='valid_score',
        mode="max", 
        patience=config.PATIENCE,
        verbose=True
        )
    callbacks_to_use = [checkpoint_callback,
                        early_stop_callback,
                        RichModelSummary(),
                        RichProgressBar(),
                       ]

    # == init trainer ==
    trainer = Trainer(
        max_epochs=config.EPOCHS,
        callbacks=callbacks_to_use,
        accelerator=device,
        devices=-1,  # 全ての利用可能なGPUを使用
        deterministic=False,
        precision='16-mixed' if config.MIXED_PRECISION else 32,
        logger=TensorBoardLogger('lightning_logs', name=f'exp{exp_no}_fold{fold_id}'),
    )
    

    # == Training ==
    trainer.fit(model, datamodule=datamodule)
    
    del model, datamodule, trainer
    gc.collect()


def run_inference(fold_id, df):
    print(f"======== Inference for fold {fold_id} =============")

    # == init data module and model ==
    model = BioModule()
    datamodule = BioDataModule(df, fold_id)

    # infer only
    ckpt_path = find_latest_ckpt_path(fold_id, paths.MODEL_WEIGHTS_DIR) 
    weights = torch.load(ckpt_path)['state_dict']

    model.load_state_dict(weights)
    
    valid_df = datamodule.valid_df
    
    preds_oof = predict_in_batches(model, valid_df, 
                                #   df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
                                # df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
                                    df_bb1_token, df_bb23_token, df_bb1_scf_token, df_bb23_scf_token,
                                mode='valid')
    y_oof = valid_df[TARGETS].values
    
    score_BRD4, score_HSA, score_sEH, score = calc_score(preds_oof, y_oof)
    
    valid_df[[f'{target}_pred' for target in TARGETS]] = preds_oof
    
    print(f'fold:{fold_id} | CV score = {score}')
    
    df_test_temp = df_test.drop(['id'], axis=1)
    preds_test = predict_in_batches(model, df_test_temp, 
                                # df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
                                # df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
                                df_bb1_token, df_bb23_token, df_bb1_scf_token, df_bb23_scf_token,
                                mode='test')
    
    del model, datamodule, preds_oof, y_oof
    gc.collect()
    
    score_dict = {
        'BRD4':score_BRD4,
        "HSA":score_HSA,
        "sEH":score_sEH,
        "all":score
    }
    
    return preds_test, score_dict, valid_df

In [81]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

all_preds = []
score_list = []
score_list_BRD4 = []
score_list_HSA = []
score_list_sEH = []

def save_list_by_text(score_list, filename):
    # ファイルに書き込み
    score_list_txt = [str(loss) for loss in score_list]
    with open(paths.OUTPUT_DIR / f'{filename}.txt', 'w') as file:
        file.write(', '.join(score_list_txt))

# training
if not infer_only:
    for fold_id in fold_list:
        run_training(fold_id, df_train)

# inference
for fold_id in [0,1,2,3,4]:
    preds_test, score_dict, df_oof = run_inference(fold_id, df_train)
    
    # save score
    score_list_BRD4.append(score_dict['BRD4'])
    score_list_HSA.append(score_dict['HSA'])
    score_list_sEH.append(score_dict['sEH'])
    score_list.append(score_dict['all'])
    
    save_list_by_text(score_list, 'cv_all')
    save_list_by_text(score_list_BRD4, 'cv_BRD4')
    save_list_by_text(score_list_HSA, 'cv_HSA')
    save_list_by_text(score_list_sEH, 'cv_sEH')
    
    # save preds（foldごと）
    all_preds.append(preds_test) 
    
    df_oof.to_parquet(paths.OUTPUT_DIR / f"oof_fold_{fold_id}.parquet")
    
    del df_oof
    gc.collect()
    

df_oof_all = pd.DataFrame()
for fold_id in range(config.NUM_FOLDS):
    df_temp = pd.read_parquet(paths.OUTPUT_DIR / f"oof_fold_{fold_id}.parquet")
    df_oof_all = pd.concat([df_oof_all, df_temp], axis=0)

df_oof_all.to_parquet(paths.OUTPUT_DIR / f"oof_all.parquet")

# 古いckpt pathを削除
for fold in range(0, 5): 
    del_old_ckpt_path(fold, paths.MODEL_WEIGHTS_DIR)
    oof_path = paths.OUTPUT_DIR / f'oof_fold_{fold}.parquet'
    oof_path.unlink()



ModuleNotFoundError: `RichModelSummary` requires `rich` to be installed. Install it by running `pip install -U rich`.

In [76]:
preds = np.mean(all_preds, 0)

df_test['binds'] = 0
df_test.loc[df_test['protein_name']=='BRD4', 'binds'] = preds[df_test['protein_name']=='BRD4', 0]
df_test.loc[df_test['protein_name']=='HSA', 'binds'] = preds[df_test['protein_name']=='HSA', 1]
df_test.loc[df_test['protein_name']=='sEH', 'binds'] = preds[df_test['protein_name']=='sEH', 2]
df_test[['id', 'binds']].to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_1st.csv', index = False)

IndexError: invalid index to scalar variable.

In [77]:
# split sharedbb, nonsharedbb
df_sub = pd.read_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_1st.csv')

# load parquet dict data
with open(paths.DATA_DIR / 'my-data/test_id_dict.p', 'rb') as file:
    test_id_dict = pickle.load(file)
    
df_shared = df_sub.copy()
df_non_shared = df_sub.copy()

df_shared.loc[~df_shared['id'].isin(test_id_dict['shared_bb']), 'binds'] = 0
df_non_shared.loc[~df_shared['id'].isin(test_id_dict['non_shared_bb']), 'binds'] = 0

df_shared.to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_shared_bb_1st.csv', index = False)
df_non_shared.to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_non_shared_bb_1st.csv', index = False)

FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/KaggleLeashBio/notebooks/../output/exp059/exp059_submission_1st.csv'

In [None]:
raise Exception('end')

# **Pseudo labeling**

In [None]:
def pseudo_labeling(fold_id, df_pseudo):
    print(f"======== Running training for fold {fold_id} =============")
    
    df_pseudo = df_pseudo.copy()
    
    # load weight
    model = BioModule()
    ckpt_path = find_latest_ckpt_path(fold_id, paths.MODEL_WEIGHTS_DIR) 
    weights = torch.load(ckpt_path)['state_dict']
    model.load_state_dict(weights)
    
    preds_oof = predict_in_batches(model, df_pseudo, 
                                    df_test_bb1_rdkit,df_test_bb2_rdkit, df_test_bb3_rdkit, df_test_bb1_scf_rdkit,
                                    df_test_bb1_ecfp4,df_test_bb2_ecfp4, df_test_bb3_ecfp4, df_test_bb1_scf_ecfp4,
                                    mode='test')
    
    df_pseudo[TARGETS] = preds_oof
    
    df_pseudo.to_parquet(paths.OUTPUT_DIR / f"test_pseudo_label_fold_{fold_id}.parquet") 
    
    del model, weights, df_pseudo, preds_oof
    gc.collect()

In [None]:
for fold_id in [0,1,2,3,4]:
    pseudo_labeling(fold_id, df_pseudo)

# **Train with Pseudo-label**

In [None]:
# trainとtestの記述子をまとめる
df_bb1_rdkit = pd.concat([df_train_bb1_rdkit, df_test_bb1_rdkit], axis=0).reset_index(drop=True)
df_bb1_ecfp4 = pd.concat([df_train_bb1_ecfp4, df_test_bb1_ecfp4], axis=0).reset_index(drop=True)
df_bb2_rdkit = pd.concat([df_train_bb2_rdkit, df_test_bb2_rdkit], axis=0).reset_index(drop=True)
df_bb2_ecfp4 = pd.concat([df_train_bb2_ecfp4, df_test_bb2_ecfp4], axis=0).reset_index(drop=True)
df_bb3_rdkit = pd.concat([df_train_bb3_rdkit, df_test_bb3_rdkit], axis=0).reset_index(drop=True)
df_bb3_ecfp4 = pd.concat([df_train_bb3_ecfp4, df_test_bb3_ecfp4], axis=0).reset_index(drop=True)
df_bb1_scf_rdkit = pd.concat([df_train_bb1_scf_rdkit, df_test_bb1_scf_rdkit], axis=0).reset_index(drop=True)
df_bb1_scf_ecfp4 = pd.concat([df_train_bb1_scf_ecfp4, df_test_bb1_scf_ecfp4], axis=0).reset_index(drop=True)

# train, testを結合した分、testのidxにオフセットを加える
bb1_offset = len(df_train_bb1_rdkit)
bb2_offset = len(df_train_bb2_rdkit)
bb3_offset = len(df_train_bb3_rdkit)
bb1_scf_offset = len(df_train_bb1_scf_rdkit)

In [None]:
class BioPseudoLabelDataModule(LightningDataModule):
    def __init__(self, df_train, fold_id):
        super().__init__()
        
        self.train_df = df_train[df_train['fold'] != fold_id]
        self.valid_df = df_train[df_train['fold'] == fold_id]

    def train_dataloader(self):
        train_dataset = BioDataset(self.train_df, 
                                   df_bb1_rdkit,df_bb2_rdkit, df_bb3_rdkit, df_bb1_scf_rdkit,
                                    df_bb1_ecfp4,df_bb2_ecfp4, df_bb3_ecfp4, df_bb1_scf_ecfp4,
                                   mode='train')
        train_dataloader = torch.utils.data.DataLoader(
                                train_dataset,
                                batch_size=config.BATCH_SIZE,
                                shuffle=True,
                                num_workers=config.NUM_WORKERS,
                                pin_memory=True,
                                persistent_workers=True,
                                drop_last=True,
                            )
        return train_dataloader

    def val_dataloader(self):
        valid_dataset = BioDataset(self.valid_df, 
                                   df_bb1_rdkit,df_bb2_rdkit, df_bb3_rdkit, df_bb1_scf_rdkit,
                                    df_bb1_ecfp4,df_bb2_ecfp4, df_bb3_ecfp4, df_bb1_scf_ecfp4,
                                   mode='valid')
        valid_dataloader = torch.utils.data.DataLoader(
                                            valid_dataset,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=False,
                                            num_workers=config.NUM_WORKERS,
                                            pin_memory=True,
                                            persistent_workers=True,
                                            drop_last=False,
                                        )
        return valid_dataloader

In [None]:
def add_offset_to_idx(df_pseudo):
    # train_dataの分だけtest datanのidxにオフセットを加える
    df_pseudo_fold = df_pseudo.copy()
    df_pseudo_fold['buildingblock1_smiles'] += bb1_offset
    df_pseudo_fold['buildingblock2_smiles'] += bb2_offset
    df_pseudo_fold['buildingblock3_smiles'] += bb3_offset
    df_pseudo_fold['bb1_scaffold_idx'] += bb1_scf_offset

    return df_pseudo_fold

In [None]:
def run_training_with_pseudolabel(fold_id, df_train):
    print(f"======== Running training for fold {fold_id} =============")
    
    # pseudo_label付テストデータを読み込む
    df_pseudo_fold = pd.read_parquet(paths.OUTPUT_DIR / f"test_pseudo_label_fold_{fold_id}.parquet")
    df_pseudo_fold = add_offset_to_idx(df_pseudo_fold)
    df_pseudo_fold['fold'] = -1
    
    df = pd.concat([df_train, df_pseudo_fold], axis=0).reset_index(drop=True)
    
    # == init data module and model ==
    model = BioModule()
    datamodule = BioPseudoLabelDataModule(df, fold_id)
    
    # == init callback ==
    checkpoint_callback = ModelCheckpoint(
                                        monitor='valid_score',
                                          dirpath=paths.MODEL_WEIGHTS_DIR,
                                          save_top_k=1,
                                          save_last=False,
                                          save_weights_only=True,
                                          filename=f"fold_{fold_id}_2nd",
                                          mode='max'
                                          )
    early_stop_callback = EarlyStopping(
        monitor='valid_score',
        mode="max", 
        patience=config.PATIENCE,
        verbose=True
        )
    callbacks_to_use = [checkpoint_callback,
                        # early_stop_callback,
                        RichModelSummary(),
                        RichProgressBar(),
                       ]

    # == init trainer ==
    trainer = Trainer(
        max_epochs=config.EPOCHS,
        callbacks=callbacks_to_use,
        accelerator=device,
        devices=-1,  # 全ての利用可能なGPUを使用
        deterministic=False,
        precision='16-mixed' if config.MIXED_PRECISION else 32,
        logger=TensorBoardLogger('lightning_logs', name=f'exp{exp_no}_fold{fold_id}_2nd'),
    )

    # == Training ==
    trainer.fit(model, datamodule=datamodule)
    weights = torch.load(checkpoint_callback.best_model_path)['state_dict']
        
    model.load_state_dict(weights)
    
    valid_df = datamodule.valid_df
    
    preds_oof = predict_in_batches(model, valid_df, 
                                    df_bb1_rdkit,df_bb2_rdkit, df_bb3_rdkit, df_bb1_scf_rdkit,
                                    df_bb1_ecfp4,df_bb2_ecfp4, df_bb3_ecfp4, df_bb1_scf_ecfp4,
                                   mode='valid')
    y_oof = valid_df[TARGETS].values
    
    score_BRD4, score_HSA, score_sEH, score = calc_score(preds_oof, y_oof)
    
    valid_df[[f'{target}_pred' for target in TARGETS]] = preds_oof
    
    print(f'fold:{fold_id} | CV score = {score}')
    
    df_test_temp = df_test.drop(['id'], axis=1)
    preds_test = predict_in_batches(model, df_test_temp, 
                                      df_test_bb1_rdkit,df_test_bb2_rdkit, df_test_bb3_rdkit, df_test_bb1_scf_rdkit,
                                    df_test_bb1_ecfp4,df_test_bb2_ecfp4, df_test_bb3_ecfp4, df_test_bb1_scf_ecfp4,
                                    mode='test')
    
    del model, datamodule, trainer, preds_oof, y_oof
    gc.collect()
    
    score_dict = {
        'BRD4':score_BRD4,
        "HSA":score_HSA,
        "sEH":score_sEH,
        "all":score
    }
    
    return preds_test, score_dict, valid_df

In [None]:
all_preds = []
score_list = []
score_list_BRD4 = []
score_list_HSA = []
score_list_sEH = []

def save_list_by_text(score_list, filename):
    # ファイルに書き込み
    score_list_txt = [str(loss) for loss in score_list]
    with open(paths.OUTPUT_DIR / f'{filename}.txt', 'w') as file:
        file.write(', '.join(score_list_txt))
    

for fold_id in range(config.NUM_FOLDS):
    
    preds_test, score_dict, df_oof = run_training_with_pseudolabel(fold_id, df_train)
    
    # save score
    score_list_BRD4.append(score_dict['BRD4'])
    score_list_HSA.append(score_dict['HSA'])
    score_list_sEH.append(score_dict['sEH'])
    score_list.append(score_dict['all'])
    
    save_list_by_text(score_list, 'cv_all_2nd')
    save_list_by_text(score_list_BRD4, 'cv_BRD4_2nd')
    save_list_by_text(score_list_HSA, 'cv_HSA_2nd')
    save_list_by_text(score_list_sEH, 'cv_sEH_2nd')
    
    # save preds（foldごと）
    all_preds.append(preds_test) 
    
    df_oof.to_parquet(paths.OUTPUT_DIR / f"oof_fold_{fold_id}_2nd.parquet")
    
    del df_oof
    gc.collect()
    

df_oof_all = pd.DataFrame()
for fold_id in range(config.NUM_FOLDS):
    df_temp = pd.read_parquet(paths.OUTPUT_DIR / f"oof_fold_{fold_id}_2nd.parquet")
    df_oof_all = pd.concat([df_oof_all, df_temp], axis=0)

df_oof_all.to_parquet(paths.OUTPUT_DIR / f"oof_all.parquet")

# **Submission**

In [None]:
df_test

In [None]:
preds = np.mean(all_preds, 0)

df_test['binds'] = 0
df_test.loc[df_test['protein_name']=='BRD4', 'binds'] = preds[df_test['protein_name']=='BRD4', 0]
df_test.loc[df_test['protein_name']=='HSA', 'binds'] = preds[df_test['protein_name']=='HSA', 1]
df_test.loc[df_test['protein_name']=='sEH', 'binds'] = preds[df_test['protein_name']=='sEH', 2]
df_test[['id', 'binds']].to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_2nd.csv', index = False)


In [None]:

# split sharedbb, nonsharedbb
df_sub = pd.read_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_2nd.csv')

# load parquet dict data
with open(paths.DATA_DIR / 'my-data/test_id_dict.p', 'rb') as file:
    test_id_dict = pickle.load(file)
    
df_shared = df_sub.copy()
df_non_shared = df_sub.copy()

df_shared.loc[~df_shared['id'].isin(test_id_dict['shared_bb']), 'binds'] = 0
df_non_shared.loc[~df_shared['id'].isin(test_id_dict['non_shared_bb']), 'binds'] = 0

df_shared.to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_shared_bb_2nd.csv', index = False)
df_non_shared.to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_non_shared_bb_2nd.csv', index = False)

In [None]:
# 古いckpt pathを削除
for fold in range(0, 5): 
    del_old_ckpt_path(fold, paths.MODEL_WEIGHTS_DIR)
    
    oof_path = paths.OUTPUT_DIR / f'oof_fold_{fold}.parquet'
    oof_path.unlink()