# Leash Bio

- positive data多めに使う
- EMAアルゴリズム
- EPOCH100, patience10

## ref
- https://www.kaggle.com/code/yyyu54/pytorch-version-belka-1dcnn-starter-with-all-data
- https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data/notebook

In [1]:
# Experiment identifier; it is embedded in the output directory and logger names.
exp_no = "073"
# When True the notebook trains on a small random sample and runs the sanity-check cells.
DEBUG = False
# Requested size of the training table relative to the full table.
data_ratio = 1

# Fold ids to (re)train in this run; an empty list skips the training loop.
fold_list = list()

In [2]:
# !pip install rdkit
# !pip install mordred
!pip install timm

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
from sklearn.metrics import average_precision_score as APS
from sklearn.model_selection import StratifiedKFold

import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

from pytorch_lightning import LightningModule
from pytorch_lightning import LightningDataModule, Trainer
# seed_everything
from pytorch_lightning.callbacks import (
    ModelCheckpoint, 
    EarlyStopping,
    ModelCheckpoint,
    RichModelSummary,
    RichProgressBar,
)
from pytorch_lightning.loggers import TensorBoardLogger
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import timm
from timm.utils import ModelEmaV2

from funcs.utils import find_latest_ckpt_path, del_old_ckpt_path
from funcs.calc_descriptor import calc_rdkit_descriptors, calc_ecfp4_descriptors, calc_original_descriptors
from funcs.tokenize import tokenize_smiles

import warnings
warnings.simplefilter('ignore')

In [4]:
import os
from pathlib import Path

def is_kaggle_kernel():
    """Return True when the Kaggle working directory exists on this machine."""
    kaggle_workdir = '/kaggle/working'
    return os.path.exists(kaggle_workdir)

# Resolve the directory layout: fixed Kaggle paths on a Kaggle kernel,
# otherwise paths relative to the parent of the current working directory.
if is_kaggle_kernel():

    BASE_DIR = Path("/kaggle")
    DATA_DIR = BASE_DIR / "input"
    OUTPUT_DIR = BASE_DIR / "working"
    print('on kaggle notebook')

else:
    BASE_DIR = Path(os.getcwd()) / './../'
    DATA_DIR = BASE_DIR / "data"
    # Local runs get one output directory per experiment number.
    OUTPUT_DIR = BASE_DIR / f"output/exp{exp_no}"
    
# Pick the compute device by name; MPS is checked before CUDA, CPU is the fallback.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():    
    device = "cuda"
else:
    device = "cpu"
    
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# NOTE(review): this always reports the CUDA device count, even when `device`
# resolved to "mps" or "cpu".
print('Using', torch.cuda.device_count(), 'GPU(s)')
print('pytorch:', torch.__version__)

Using 1 GPU(s)
pytorch: 2.0.0


In [5]:
class config:
    """Hyper-parameters shared by the training and inference cells."""

    # -- reproducibility / data preparation --
    SEED = 2024
    PREPROCESS = False

    # -- cross-validation layout --
    NUM_FOLDS = 5
    USE_NUM_FOLD = 1

    # -- training loop --
    EPOCHS = 20
    PATIENCE = 10  # early-stopping patience passed to the EarlyStopping callback
    BATCH_SIZE = 4096
    NUM_WORKERS = 16
    MIXED_PRECISION = True  # selects precision '16-mixed' for the Trainer
    USE_EMA = False  # when True a timm ModelEmaV2 copy of the model is maintained

    # -- optimizer --
    LR = 1e-3
    WEIGHT_DECAY = 1e-6
    
# Namespace of filesystem locations used by the notebook.
# The first two assignments copy the module-level DATA_DIR / OUTPUT_DIR into the
# class namespace; the following lines then build on those class-level names.
class paths:    
    DATA_DIR = DATA_DIR
    OUTPUT_DIR = OUTPUT_DIR
    # Directory the ModelCheckpoint callback writes to and inference reads from.
    MODEL_WEIGHTS_DIR = OUTPUT_DIR / f"bio-models-exp{exp_no}"
    
    # Directory holding the parquet tables and the pickled SMILES lookups.
    SHRUNKEN_DATA_DIR = DATA_DIR / "shrunken-data-capping"

    TRAIN_PATH = SHRUNKEN_DATA_DIR / "train.parquet"
    TEST_PATH = SHRUNKEN_DATA_DIR / "test.parquet"
    SUB_PATH = SHRUNKEN_DATA_DIR / "sub.parquet"
    
    # Side effect at class-definition time: make sure the output directory exists.
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [6]:
print('fix seed')

def my_seed_everything(seed: int):
    """Seed Python's `random`, NumPy and PyTorch, and export PYTHONHASHSEED.

    Args:
        seed: value handed unchanged to every seeding function.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    
# seed_everything(config.SEED, workers=True)
my_seed_everything(config.SEED)

fix seed


# **Load Data**

In [7]:
# Non-target columns read from the training parquet: the three building-block
# ids, the three scaffold ids and the cross-validation fold id.
bb_cols = ['buildingblock1_smiles', 'buildingblock2_smiles','buildingblock3_smiles', 
           'buildingblock1_smiles_scaffold', "buildingblock2_smiles_scaffold", "buildingblock3_smiles_scaffold",
           'fold']

TARGETS = ['binds_BRD4', 'binds_HSA','binds_sEH']


def soft_labeling(df_train, targets=None):
    """Return a copy of `df_train` whose target columns hold soft labels.

    For every target, the soft label of a row is its original label plus the
    mean of that target over all rows sharing the row's building block 1,
    plus the same mean for building block 2 and for building block 3, clipped
    to the range [0, 1]. All means are computed from the original labels.

    Args:
        df_train: frame containing the three ``buildingblock{1,2,3}_smiles``
            columns and the target columns. It is not modified.
        targets: target column names to soften. Defaults to the module-level
            ``TARGETS``; columns not listed are returned unchanged.

    Returns:
        A new DataFrame with the same columns and index as `df_train`.
    """
    target_cols = list(TARGETS if targets is None else targets)
    block_cols = ('buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles')

    df_soft = df_train.copy()
    soft = df_train[target_cols].copy()

    for block_col in block_cols:
        # Per-building-block binding rate of every target (from the hard labels).
        block_mean = df_train.groupby(block_col)[target_cols].mean()
        for target in target_cols:
            soft[target] = soft[target] + df_train[block_col].map(block_mean[target])

    # A row that was positive to begin with ends up at exactly 1 after clipping.
    df_soft[target_cols] = soft.clip(0, 1)

    return df_soft

In [8]:


# Load only the columns the notebook needs: building-block / scaffold ids, fold id and targets.
df_train = pd.read_parquet(paths.TRAIN_PATH, columns=bb_cols + TARGETS)


if DEBUG:
    # Small random subset for quick runs; capped so tables shorter than 100000 rows still work.
    df_train = df_train.sample(min(100000, len(df_train))).reset_index(drop=True)
else:
    # Keep every positive row and top up with sampled negatives until the
    # requested dataset size (len(df_train) * data_ratio) is reached.

    # Soft labeling
    df_train_soft = soft_labeling(df_train)

    # The positive / negative split is decided on the ORIGINAL hard labels,
    # while the rows themselves are taken from the soft-labeled frame.
    positive = df_train_soft[(df_train[TARGETS]>0).any(axis=1)]
    negative = df_train_soft[(df_train[TARGETS]==0).all(axis=1)]

    len_train = int(len(df_train)*data_ratio)
    # Clamp to [0, len(negative)]: sampling without replacement raises when the
    # request is negative (small data_ratio) or larger than the pool (data_ratio > 1).
    use_negative_sample = min(max(len_train - len(positive), 0), len(negative))

    df_train = pd.concat([negative.sample(use_negative_sample, random_state=config.SEED), positive],axis=0).reset_index(drop=True)

In [9]:
df_train.head()

Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,buildingblock1_smiles_scaffold,buildingblock2_smiles_scaffold,buildingblock3_smiles_scaffold,fold,binds_BRD4,binds_HSA,binds_sEH
0,130,905,842,62,770,892,3,0.002961,0.005599,0.013638
1,18,1361,943,91,343,626,0,0.025245,0.010702,0.016105
2,94,487,1623,43,507,678,1,0.00501,0.009277,0.013062
3,231,1383,469,43,365,779,1,0.007011,0.013403,0.011777
4,175,580,1579,70,406,602,2,0.006324,0.007886,0.465935


In [10]:
# Data used to build the submission
df_test = pd.read_parquet(paths.SUB_PATH)
df_test.head()

# For pseudo-labeling: the distinct building-block / scaffold id combinations
# that occur in the submission table.
cols = ['buildingblock1_smiles', 'buildingblock2_smiles',
       'buildingblock3_smiles', 'buildingblock1_smiles_scaffold',
       'buildingblock2_smiles_scaffold', 'buildingblock3_smiles_scaffold']
df_pseudo = df_test[cols].drop_duplicates().reset_index(drop=True)
df_pseudo.head()

Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,buildingblock1_smiles_scaffold,buildingblock2_smiles_scaffold,buildingblock3_smiles_scaffold
0,0,58,58,91,507,507
1,0,58,160,91,507,776
2,0,58,171,91,507,541
3,0,58,372,91,507,907
4,0,58,561,91,507,543


In [11]:
# Load the conversion dictionaries (presumably SMILES string -> integer id, going by their names).
def _read_pickle(file_name):
    """Unpickle one file from the shrunken-data directory and return its content."""
    with (paths.SHRUNKEN_DATA_DIR / file_name).open(mode='rb') as handle:
        # NOTE(review): pickle executes arbitrary code on load; only use trusted files here.
        return pickle.load(handle)


bb1_smiles2idx = _read_pickle('bb1_smiles2idx.pickle')
bb23_smiles2idx = _read_pickle('bb23_smiles2idx.pickle')
bb1_scaffold_smiles2idx = _read_pickle('bb1_scaffold_smiles2idx.pickle')
bb23_scaffold_smiles2idx = _read_pickle('bb23_scaffold_smiles2idx.pickle')

# Inverted lookups: value -> key of each dictionary above.
bb1_idx2smiles = {idx: smiles for smiles, idx in bb1_smiles2idx.items()}
bb23_idx2smiles = {idx: smiles for smiles, idx in bb23_smiles2idx.items()}
bb1_scaffold_idx2smiles = {idx: smiles for smiles, idx in bb1_scaffold_smiles2idx.items()}
bb23_scaffold_idx2smiles = {idx: smiles for smiles, idx in bb23_scaffold_smiles2idx.items()}

# **Make Features**

In [12]:
# 標準化
from sklearn.preprocessing import StandardScaler

def standardization(df_list):
    """Standardize several descriptor tables with one shared StandardScaler.

    The scaler is fitted on the de-duplicated row-wise concatenation of all
    tables, with +/-inf treated as missing. Columns whose standard deviation
    over that concatenation is exactly 0 are dropped from every table.

    Args:
        df_list: list of DataFrames sharing the same columns.

    Returns:
        A list of new DataFrames, in the same order as `df_list`, each keeping
        its own index and the retained columns. Infinite inputs come back as NaN.
    """
    infinities = [np.inf, -np.inf]

    # Fit on all tables at once so they share a single mean / scale per column.
    df_all = pd.concat(df_list,axis=0)
    df_all.drop_duplicates(inplace=True)
    df_all.replace(infinities, np.nan, inplace=True)

    # Drop columns whose standard deviation is 0
    df_all = df_all.loc[:, df_all.std() != 0]

    # standard scaling
    scaler = StandardScaler()
    scaler.fit(df_all)

    standardized_df_list = []
    for df_temp in df_list:
        # The infinities were only removed from the fitting copy above; the scaler
        # accepts NaN but rejects inf, so apply the same replacement before transform.
        df_temp = df_temp.loc[:, df_all.columns].replace(infinities, np.nan)
        df_temp_std = pd.DataFrame(scaler.transform(df_temp), 
                                index=df_temp.index, 
                                columns=df_temp.columns)
        standardized_df_list.append(df_temp_std)
        
    return standardized_df_list


def remove_std0(df_list):
    """Drop the columns that are constant across all given tables.

    The standard deviation is measured on the de-duplicated row-wise
    concatenation of the tables; columns where it equals 0 are removed.

    Args:
        df_list: list of DataFrames sharing the same columns.

    Returns:
        A list with one column-filtered DataFrame per input, in input order.
    """
    stacked = pd.concat(df_list, axis=0)
    stacked.drop_duplicates(inplace=True)
    kept_columns = stacked.loc[:, stacked.std() != 0].columns

    return [frame.loc[:, kept_columns] for frame in df_list]

In [13]:
# The four id -> SMILES lookups, always processed in this order:
# building block 1, building blocks 2/3, scaffold of 1, scaffolds of 2/3.
_idx2smiles_maps = (
    bb1_idx2smiles,
    bb23_idx2smiles,
    bb1_scaffold_idx2smiles,
    bb23_scaffold_idx2smiles,
)

# rdkit descriptors
df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit = map(
    calc_rdkit_descriptors, _idx2smiles_maps
)

# calc original descriptors
df_bb1_desc, df_bb23_desc, df_bb1_scf_desc, df_bb23_scf_desc = map(
    calc_original_descriptors, _idx2smiles_maps
)

# ecfp4 descriptors
df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4 = map(
    calc_ecfp4_descriptors, _idx2smiles_maps
)





In [14]:
# Append the "original" descriptor columns to the matching rdkit table
# (column-wise concat), so both are standardized together afterwards.
df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit = (
    pd.concat(table_pair, axis=1)
    for table_pair in (
        [df_bb1_rdkit, df_bb1_desc],
        [df_bb23_rdkit, df_bb23_desc],
        [df_bb1_scf_rdkit, df_bb1_scf_desc],
        [df_bb23_scf_rdkit, df_bb23_scf_desc],
    )
)

# df_bb1_ecfp4.columns = [i for i in range(len(df_bb1_ecfp4.columns))]
# df_bb23_ecfp4.columns = [i for i in range(len(df_bb23_ecfp4.columns))]
# df_bb1_scf_ecfp4.columns = [i for i in range(len(df_bb1_scf_ecfp4.columns))]
# df_bb23_scf_ecfp4.columns = [i for i in range(len(df_bb23_scf_ecfp4.columns))]
                             

In [15]:
# Standardize the rdkit (+ original) descriptor tables together, so all four
# tables share one scaler and one set of retained columns.
df_list_rdkit = [
            df_bb1_rdkit,
            df_bb23_rdkit, 
            df_bb1_scf_rdkit, 
            df_bb23_scf_rdkit,
            ]
df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit = standardization(df_list_rdkit)
        
# ECFP4 tables are handled together as well, but they are NOT rescaled:
# remove_std0 only drops the columns that are constant across all four tables.
df_list_ecfp4 = [
            df_bb1_ecfp4,
            df_bb23_ecfp4, 
            df_bb1_scf_ecfp4, 
            df_bb23_scf_ecfp4,
            ]
df_bb1_ecfp4,df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4 = remove_std0(df_list_ecfp4)


In [16]:
# Feature widths per building block; BioModel uses them to slice its input.
len_rdkit = len(df_bb1_rdkit.columns)
len_ecfp4 = len(df_bb1_ecfp4.columns)
print(len_rdkit, len_ecfp4)

198 1023


# **Dataset & DataModule**

In [17]:
class BioDataset(torch.utils.data.Dataset):
    """Map-style dataset that assembles one flat feature vector per row of `df`.

    Each row of `df` holds integer ids for three building blocks and their
    three scaffolds. The ids are used as ROW POSITIONS into the descriptor
    tables, and the looked-up rows are concatenated into ``X`` in this order:

        desc(bb1), desc(bb2), desc(bb3), desc(scf1), desc(scf2), desc(scf3),
        ecfp(bb1), ecfp(bb2), ecfp(bb3), ecfp(scf1), ecfp(scf2), ecfp(scf3)

    Building blocks 2 and 3 share the ``bb23`` tables.

    Args:
        df: frame with the six id columns, plus the ``TARGETS`` columns when
            `mode` is 'train' or 'valid'.
        df_bb1_desc, df_bb23_desc, df_bb1_scf_desc, df_bb23_scf_desc:
            first descriptor family, one table per id space.
        df_bb1_ecfp, df_bb23_ecfp, df_bb1_scf_ecfp, df_bb23_scf_ecfp:
            second descriptor family, one table per id space.
        mode: 'train' (labels + swap augmentation), 'valid' (labels only) or
            'test' (all-zero placeholder labels).
    """

    def __init__(
        self,
        df: pd.DataFrame,
        df_bb1_desc: pd.DataFrame,
        df_bb23_desc: pd.DataFrame,
        df_bb1_scf_desc: pd.DataFrame,
        df_bb23_scf_desc: pd.DataFrame,
        df_bb1_ecfp: pd.DataFrame,
        df_bb23_ecfp: pd.DataFrame,
        df_bb1_scf_ecfp: pd.DataFrame,
        df_bb23_scf_ecfp: pd.DataFrame,
        mode = 'train'
    ):
        super().__init__()

        assert mode in ['train', 'valid', 'test']
        self.mode = mode

        meta_cols = ["buildingblock1_smiles", # 0
                     "buildingblock2_smiles", # 1
                     "buildingblock3_smiles", # 2
                     "buildingblock1_smiles_scaffold", # 3
                     "buildingblock2_smiles_scaffold", # 4
                     "buildingblock3_smiles_scaffold", # 5
                     ]
        if self._has_labels():
            # The labels are appended last so __getitem__ can read them as row[-3:].
            meta_cols += TARGETS

        # Everything is kept as plain NumPy arrays for fast positional access.
        self.df = df[meta_cols].values
        self.bb1_desc = df_bb1_desc.values
        self.bb23_desc = df_bb23_desc.values
        self.bb1_scf_desc = df_bb1_scf_desc.values
        self.bb23_scf_desc = df_bb23_scf_desc.values

        self.bb1_ecfp = df_bb1_ecfp.values
        self.bb23_ecfp = df_bb23_ecfp.values
        self.bb1_scf_ecfp = df_bb1_scf_ecfp.values
        self.bb23_scf_ecfp = df_bb23_scf_ecfp.values

    def _has_labels(self):
        """Return True for the modes whose rows carry target columns."""
        return (self.mode == 'train') or (self.mode == 'valid')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'X': float32 feature vector, 'y': float32 vector of 3 labels}``."""
        row = self.df[index, :]
        bb1, bb2, bb3, bb1_scf, bb2_scf, bb3_scf = (int(value) for value in row[:6])

        bb1_desc = self.bb1_desc[bb1, :]
        bb2_desc = self.bb23_desc[bb2, :]
        bb3_desc = self.bb23_desc[bb3, :]
        bb1_scf_desc = self.bb1_scf_desc[bb1_scf, :]
        bb2_scf_desc = self.bb23_scf_desc[bb2_scf, :]
        bb3_scf_desc = self.bb23_scf_desc[bb3_scf, :]

        bb1_ecfp = self.bb1_ecfp[bb1, :]
        bb2_ecfp = self.bb23_ecfp[bb2, :]
        bb3_ecfp = self.bb23_ecfp[bb3, :]
        bb1_scf_ecfp = self.bb1_scf_ecfp[bb1_scf, :]
        bb2_scf_ecfp = self.bb23_scf_ecfp[bb2_scf, :]
        bb3_scf_ecfp = self.bb23_scf_ecfp[bb3_scf, :]

        if self.mode == 'train':
            bb2_desc, bb3_desc, bb2_scf_desc, bb3_scf_desc, bb2_ecfp, bb3_ecfp, bb2_scf_ecfp, bb3_scf_ecfp =\
                self.augment(bb2_desc, bb3_desc, bb2_scf_desc, bb3_scf_desc, bb2_ecfp, bb3_ecfp, bb2_scf_ecfp, bb3_scf_ecfp)

        X = np.concatenate([bb1_desc, bb2_desc, bb3_desc, bb1_scf_desc, bb2_scf_desc, bb3_scf_desc,
                             bb1_ecfp, bb2_ecfp, bb3_ecfp, bb1_scf_ecfp, bb2_scf_ecfp, bb3_scf_ecfp])

        if self._has_labels():
            y = row[-3:]
        else:
            y = np.zeros(3)

        output = {
            'X': torch.tensor(X, dtype=torch.float32),
            # float32 rather than float16: half precision rounds any soft label
            # >= ~0.99976 up to exactly 1.0, and calc_score treats exactly-1
            # labels as positives.
            'y': torch.tensor(y, dtype=torch.float32)
        }
        return output

    def augment(self,
                bb2_desc, bb3_desc, bb2_scf_desc, bb3_scf_desc,
                bb2_ecfp, bb3_ecfp, bb2_scf_ecfp, bb3_scf_ecfp):
        """Swap the building-block-2 and building-block-3 features with probability 0.5.

        All four feature pairs are swapped together or not at all; the values
        are returned in the same positional order as the parameters.
        """
        if np.random.rand() < 0.5:
            bb2_desc, bb3_desc = bb3_desc, bb2_desc
            bb2_scf_desc, bb3_scf_desc = bb3_scf_desc, bb2_scf_desc
            bb2_ecfp, bb3_ecfp = bb3_ecfp, bb2_ecfp
            bb2_scf_ecfp, bb3_scf_ecfp = bb3_scf_ecfp, bb2_scf_ecfp
        return bb2_desc, bb3_desc, bb2_scf_desc, bb3_scf_desc, bb2_ecfp, bb3_ecfp, bb2_scf_ecfp, bb3_scf_ecfp

In [18]:
df_bb1_ecfp4.isnull().sum().sum()

0

In [19]:
# Smoke test (DEBUG only): build a validation-mode dataset and show the tensor shapes of item 0.
if DEBUG:
    dataset = BioDataset(
        df_train,
        df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
        df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
        mode='valid',
    )
    X, y = dataset[0]['X'], dataset[0]['y']
    print(X.shape, y.shape, sep='\n')

In [20]:
# lightning data module
class BioDataModule(LightningDataModule):
    """Splits the training table by fold id and serves train / validation loaders.

    Rows whose ``fold`` equals `fold_id` form the validation set; all other
    rows form the training set. The descriptor tables are taken from the
    module-level ``df_*_rdkit`` / ``df_*_ecfp4`` variables.
    """

    def __init__(self, df_train, fold_id):
        super().__init__()

        in_valid_fold = df_train['fold'] == fold_id
        self.train_df = df_train[~in_valid_fold]
        self.valid_df = df_train[in_valid_fold]

    def _build_loader(self, frame, mode, shuffle, drop_last):
        """Wrap `frame` in a BioDataset and return a DataLoader over it."""
        dataset = BioDataset(
            frame,
            df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
            df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
            mode=mode,
        )
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=config.BATCH_SIZE,
            shuffle=shuffle,
            num_workers=config.NUM_WORKERS,
            pin_memory=True,
            persistent_workers=True,
            drop_last=drop_last,
        )

    def train_dataloader(self):
        # Shuffled, and the last incomplete batch is dropped.
        return self._build_loader(self.train_df, mode='train', shuffle=True, drop_last=True)

    def val_dataloader(self):
        # Fixed order, every row is kept.
        return self._build_loader(self.valid_df, mode='valid', shuffle=False, drop_last=False)

# **Model**

In [21]:
class BioModel(nn.Module):
    """MLP over per-building-block descriptor features.

    The input is a flat vector made of six segments of width `input_len1`
    (first descriptor family) followed by six segments of width `input_len2`
    (second descriptor family), ordered
    bb1, bb2, bb3, bb1-scaffold, bb2-scaffold, bb3-scaffold.
    Only the first four segments of each family are used; the bb2 / bb3
    scaffold segments are present in the input but ignored.

    Building blocks 2 and 3 share the same ``bb23`` extractors.

    Args:
        input_len1: width of one segment of the first descriptor family.
        input_len2: width of one segment of the second descriptor family.
        output_dim: number of logits produced by the head.
    """

    def __init__(self,
                 input_len1,
                 input_len2,
                 output_dim=3):
        super(BioModel, self).__init__()

        self.input_len1 = input_len1
        self.input_len2 = input_len2
        self.output_dim = output_dim

        # One FC block per id space for the first descriptor family (desc1)
        self.feature_extractor_bb1_desc1 = self._make_feature_extractor(input_len1, 128)
        self.feature_extractor_bb23_desc1 = self._make_feature_extractor(input_len1, 128)
        self.feature_extractor_bb1scf_desc1 = self._make_feature_extractor(input_len1, 128)

        # One FC block per id space for the second descriptor family (desc2)
        self.feature_extractor_bb1_desc2 = self._make_feature_extractor(input_len2, 128)
        self.feature_extractor_bb23_desc2 = self._make_feature_extractor(input_len2, 128)
        self.feature_extractor_bb1scf_desc2 = self._make_feature_extractor(input_len2, 128)

        # One FC block per id space on top of the concatenated desc1 + desc2 features
        self.feature_extractor_bb1 = self._make_feature_extractor(128*2, 324)
        self.feature_extractor_bb23 = self._make_feature_extractor(128*2, 324)
        self.feature_extractor_bb1scf = self._make_feature_extractor(128*2, 324)

        # head: consumes bb1, bb2, bb3 and bb1-scaffold features (4 x 324)
        self.head = nn.Sequential(
            nn.Linear(324*4, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(0.1),
            nn.ReLU(),
            # Honour `output_dim` (previously fixed to 3 regardless of the argument).
            nn.Linear(512, output_dim),
        )

    def _make_feature_extractor(self, input_len, output_len):
        """Return a Linear -> BatchNorm1d -> Dropout(0.1) -> ReLU block."""
        return nn.Sequential(
                            nn.Linear(input_len, output_len),
                            nn.BatchNorm1d(output_len),
                            nn.Dropout(0.1),
                            nn.ReLU(),
                        )

    def forward(self, x):
        """Map a ``(batch, 6*input_len1 + 6*input_len2)`` tensor to ``(batch, output_dim)`` logits."""
        width1, width2 = self.input_len1, self.input_len2

        desc1 = x[:, :width1*6]
        desc2 = x[:, width1*6:]

        # First four segments of each family: bb1, bb2, bb3, bb1-scaffold.
        bb1_desc1, bb2_desc1, bb3_desc1, bb1_scf_desc1 = (
            desc1[:, width1*k:width1*(k + 1)] for k in range(4)
        )
        bb1_desc2, bb2_desc2, bb3_desc2, bb1_scf_desc2 = (
            desc2[:, width2*k:width2*(k + 1)] for k in range(4)
        )

        # FC per building block and per descriptor family
        bb1_desc1 = self.feature_extractor_bb1_desc1(bb1_desc1)
        bb2_desc1 = self.feature_extractor_bb23_desc1(bb2_desc1)
        bb3_desc1 = self.feature_extractor_bb23_desc1(bb3_desc1)
        bb1_scf_desc1 = self.feature_extractor_bb1scf_desc1(bb1_scf_desc1)

        bb1_desc2 = self.feature_extractor_bb1_desc2(bb1_desc2)
        bb2_desc2 = self.feature_extractor_bb23_desc2(bb2_desc2)
        bb3_desc2 = self.feature_extractor_bb23_desc2(bb3_desc2)
        bb1_scf_desc2 = self.feature_extractor_bb1scf_desc2(bb1_scf_desc2)

        # Concatenate desc1 (rdkit) and desc2 (ecfp4) per building block
        bb1 = torch.cat([bb1_desc1, bb1_desc2], dim=1)
        bb2 = torch.cat([bb2_desc1, bb2_desc2], dim=1)
        bb3 = torch.cat([bb3_desc1, bb3_desc2], dim=1)
        bb1scf = torch.cat([bb1_scf_desc1, bb1_scf_desc2], dim=1)

        # FC per building block
        bb1 = self.feature_extractor_bb1(bb1)
        bb2 = self.feature_extractor_bb23(bb2)
        bb3 = self.feature_extractor_bb23(bb3)

        # Author's note: an earlier version may have had a mistake at this step.
        bb1scf = self.feature_extractor_bb1scf(bb1scf)

        X = torch.cat([bb1, bb2, bb3, bb1scf], dim=1)

        output = self.head(X)

        return output

In [22]:
# Smoke test (DEBUG only): count the parameters and push one random batch through the model.
if DEBUG:
    dummy_model = BioModel(input_len1=len_rdkit, input_len2=len_ecfp4)
    total_params = sum(map(torch.numel, dummy_model.parameters()))
    print(f"Total number of parameters: {total_params}")

    # 64 rows, six building-block segments per descriptor family.
    dummy_width = (len_rdkit+len_ecfp4)*6
    dummy_input = torch.rand(64, dummy_width, dtype=torch.float32)
    output = dummy_model(dummy_input)
    print(output.shape)
    # print(output)

# **Lightning Module**

In [23]:
def calc_score(y_preds, y_true):
    """Compute the average-precision score per target and their mean.

    Labels strictly below 1 are treated as negatives (set to 0) before
    scoring, so soft labels are reduced back to hard labels.

    Args:
        y_preds: array of predicted scores, one column per target
            (columns 0, 1, 2 are scored as BRD4, HSA, sEH).
        y_true: array of labels with the same column layout. Not modified.

    Returns:
        Tuple ``(score_BRD4, score_HSA, score_sEH, score)`` where `score` is
        the unweighted mean of the three.
    """
    # Binarise a private copy so the caller's array is left untouched.
    y_true = y_true.copy()
    y_true[y_true < 1] = 0

    score_BRD4 = APS(y_true[:,0], y_preds[:,0])
    score_HSA = APS(y_true[:,1], y_preds[:,1])
    score_sEH = APS(y_true[:,2], y_preds[:,2])
    score = (score_BRD4 + score_HSA + score_sEH) / 3

    return score_BRD4, score_HSA, score_sEH, score

In [24]:
class BioModule(LightningModule):
    """LightningModule wrapping BioModel with BCE-with-logits training.

    Validation predictions and targets are collected per batch and scored
    with `calc_score` at the end of every validation epoch; the result is
    logged as ``valid_score``.
    """

    def __init__(self):
        
        super(BioModule, self).__init__()
       
        # Feature widths come from the module-level len_rdkit / len_ecfp4.
        self.model = BioModel(input_len1=len_rdkit, input_len2=len_ecfp4)
        
        if config.USE_EMA:
            # NOTE(review): the EMA copy is updated during training, but forward()
            # below always uses self.model, never the EMA weights.
            self.ema = ModelEmaV2(self.model, decay=0.999)
        
        # Per-batch validation results, consumed and cleared in on_validation_epoch_end.
        self.validation_step_outputs = []
        self.loss_func = nn.BCEWithLogitsLoss()
        
    def forward(self, X):
        """Return the raw logits of the wrapped model."""
        pred = self.model(X)
        return pred
    
    def configure_optimizers(self):
        """Adam over the trainable parameters plus a per-epoch cosine warm-restart schedule."""
        
        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=config.LR,
            weight_decay=config.WEIGHT_DECAY
        )
        # == define learning rate scheduler ==
        # The restart period T_0 equals the configured number of epochs.
        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer,
            T_0=config.EPOCHS,
            T_mult=1,
            eta_min=1e-6,
            last_epoch=-1
        )
        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'valid_loss_epoch',
                'frequency': 1
            }
        }
        
    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        
        X, y = batch.pop('X'), batch.pop('y')
        logits = self(X)
        train_loss = self.loss_func(logits, y)
        
        self.log('train_loss', train_loss,  on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=X.size(0))
        
        # Update the EMA weights
        if config.USE_EMA:
            self.ema.update(self.model)
        
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch and stash predictions and targets."""
        
        X, y = batch.pop('X'), batch.pop('y')
        logits = self(X)
        preds = torch.sigmoid(logits)
        
        valid_loss = self.loss_func(logits, y)
        
        self.log('valid_loss', valid_loss, on_step=True, on_epoch=False, prog_bar=True, logger=True, batch_size=X.size(0))
        
        # Tensors are stored as-is (on whatever device they live on) until the epoch ends.
        self.validation_step_outputs.append({"valid_loss":valid_loss, "preds":preds, "targets":y})
        
        return valid_loss

    
    # NOTE(review): the two accessors below read self._train_dataloader and
    # self._validation_dataloader, which are never assigned in this class.
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader
    
    def calc_score(self, y_preds, y_true):
        """Delegate to the module-level calc_score."""
        return calc_score(y_preds, y_true)

    
    def on_validation_epoch_end(self):
        """Aggregate the stored batch results into epoch-level loss and score."""
        
        outputs = self.validation_step_outputs
        
        # Average the per-batch losses (unweighted by batch size)
        avg_loss = torch.stack([x['valid_loss'] for x in outputs]).mean()
        self.log("valid_loss_epoch", avg_loss, prog_bar=True, logger=True)
        
        # Compute the score
        y_preds = torch.cat([x['preds'] for x in outputs]).detach().cpu().numpy()
        y_true = torch.cat([x['targets'] for x in outputs]).detach().cpu().numpy()
        
        # calc_score returns (BRD4, HSA, sEH, mean); only the mean is logged.
        score = self.calc_score(y_preds, y_true)[-1]
        self.log("valid_score", score, prog_bar=True, logger=True)
        
        # Free the stored tensors before the next validation epoch.
        self.validation_step_outputs.clear()
        
        return {'valid_loss_epoch': avg_loss, "valid_score":score}

# Train & Inference

In [25]:
def predict_in_batches(model, df, 
                       df_bb1_1, df_bb2_1, df_bb3_1, df_bb1_scf_1, 
                       df_bb1_2, df_bb2_2, df_bb3_2, df_bb1_scf_2, 
                       mode):
    """Run `model` over every row of `df` and return the sigmoid probabilities.

    The eight table arguments are forwarded positionally to BioDataset, so
    despite their names they are received there as the (bb1, bb23,
    bb1-scaffold, bb23-scaffold) tables of the first and then of the second
    descriptor family.

    Args:
        model: module mapping a feature batch to logits; it is moved to the
            module-level `device` and switched to eval mode.
        df: frame of rows to predict.
        mode: BioDataset mode ('valid' or 'test' for un-augmented features).

    Returns:
        NumPy array with one row of probabilities per row of `df`, in order.
    """
    model.to(device)
    model.eval()

    loader = torch.utils.data.DataLoader(
        BioDataset(
            df,
            df_bb1_1, df_bb2_1, df_bb3_1, df_bb1_scf_1,
            df_bb1_2, df_bb2_2, df_bb3_2, df_bb1_scf_2,
            mode=mode,
        ),
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        pin_memory=True,
        persistent_workers=True,
        drop_last=False,
    )

    with torch.no_grad():
        batch_probs = [
            torch.sigmoid(model(batch['X'].to(device))).cpu().numpy()
            for batch in loader
        ]

    return np.concatenate(batch_probs, axis=0)

In [26]:
def run_training(fold_id, df):
    """Train one fold and leave its best checkpoint in ``paths.MODEL_WEIGHTS_DIR``.

    Args:
        fold_id: fold held out for validation.
        df: full training frame; BioDataModule splits it by its ``fold`` column.
    """
    print(f"======== Running training for fold {fold_id} =============")

    # == init data module and model ==
    model = BioModule()
    datamodule = BioDataModule(df, fold_id)

    # == init callback ==
    # Checkpointing and early stopping both track the validation score (higher is better).
    tracked_metric = dict(monitor='valid_score', mode='max')
    checkpoint_callback = ModelCheckpoint(
        dirpath=paths.MODEL_WEIGHTS_DIR,
        filename=f"fold_{fold_id}",
        save_top_k=1,
        save_last=False,
        save_weights_only=True,
        **tracked_metric,
    )
    early_stop_callback = EarlyStopping(
        patience=config.PATIENCE,
        verbose=True,
        **tracked_metric,
    )
    callbacks_to_use = [
        checkpoint_callback,
        early_stop_callback,
        RichModelSummary(),
        RichProgressBar(),
    ]

    # == init trainer ==
    if config.MIXED_PRECISION:
        precision = '16-mixed'
    else:
        precision = 32
    trainer = Trainer(
        max_epochs=config.EPOCHS,
        callbacks=callbacks_to_use,
        accelerator=device,
        devices=-1,  # use every available device
        deterministic=False,
        precision=precision,
        logger=TensorBoardLogger('lightning_logs', name=f'exp{exp_no}_fold{fold_id}'),
    )

    # == Training ==
    trainer.fit(model, datamodule=datamodule)

    # Release the fold's objects before the next fold starts.
    del model, datamodule, trainer
    gc.collect()


def run_inference(fold_id, df):
    """Score one fold's checkpoint on its validation rows and predict the test set.

    Args:
        fold_id: fold whose checkpoint and validation split are used.
        df: full training frame; BioDataModule splits it by its ``fold`` column.

    Returns:
        Tuple ``(preds_test, score_dict, valid_df)``:
        `preds_test` holds the test-set probabilities, `score_dict` the
        average-precision scores under the keys 'BRD4', 'HSA', 'sEH' and
        'all', and `valid_df` the validation rows with one added
        ``<target>_pred`` column per target.
    """
    print(f"======== Inference for fold {fold_id} =============")

    # == init data module and model ==
    model = BioModule()
    datamodule = BioDataModule(df, fold_id)

    # infer only
    ckpt_path = find_latest_ckpt_path(fold_id, paths.MODEL_WEIGHTS_DIR) 
    # Load onto the CPU first so a checkpoint written on a GPU machine can also be
    # restored on a host without that device; predict_in_batches moves the model
    # to `device` afterwards.
    weights = torch.load(ckpt_path, map_location='cpu')['state_dict']

    model.load_state_dict(weights)
    
    valid_df = datamodule.valid_df
    
    # Out-of-fold predictions ('valid' mode: no augmentation).
    preds_oof = predict_in_batches(model, valid_df, 
                                  df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
                                df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
                                mode='valid')
    y_oof = valid_df[TARGETS].values
    
    score_BRD4, score_HSA, score_sEH, score = calc_score(preds_oof, y_oof)
    
    valid_df[[f'{target}_pred' for target in TARGETS]] = preds_oof
    
    print(f'fold:{fold_id} | CV score = {score}')
    
    # Test-set predictions; the id column is not a model input.
    df_test_temp = df_test.drop(['id'], axis=1)
    preds_test = predict_in_batches(model, df_test_temp, 
                                df_bb1_rdkit, df_bb23_rdkit, df_bb1_scf_rdkit, df_bb23_scf_rdkit,
                                df_bb1_ecfp4, df_bb23_ecfp4, df_bb1_scf_ecfp4, df_bb23_scf_ecfp4,
                                mode='test')
    
    del model, datamodule, preds_oof, y_oof
    gc.collect()
    
    score_dict = {
        'BRD4':score_BRD4,
        "HSA":score_HSA,
        "sEH":score_sEH,
        "all":score
    }
    
    return preds_test, score_dict, valid_df

In [27]:
gc.collect()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Per-fold accumulators: test predictions, the mean CV score, and one score list per target.
all_preds, score_list = [], []
score_list_BRD4, score_list_HSA, score_list_sEH = [], [], []

def save_list_by_text(score_list, filename):
    """Write the scores to ``paths.OUTPUT_DIR/<filename>.txt`` as one ', '-separated line.

    Args:
        score_list: Iterable of values; each one is converted with ``str``.
        filename: Output file name without the ``.txt`` extension.

    Any existing file of the same name is overwritten.
    """
    serialized = ', '.join(map(str, score_list))
    target_path = paths.OUTPUT_DIR / f'{filename}.txt'
    with open(target_path, 'w') as handle:
        handle.write(serialized)

# training (only the folds listed in `fold_list` are retrained)
for fold_id in fold_list:
    run_training(fold_id, df_train)

# inference
# Iterate over config.NUM_FOLDS everywhere so that the inference loop, the
# OOF merge and the cleanup below always cover the same set of folds.
for fold_id in range(config.NUM_FOLDS):
    preds_test, score_dict, df_oof = run_inference(fold_id, df_train)

    # save score
    score_list_BRD4.append(score_dict['BRD4'])
    score_list_HSA.append(score_dict['HSA'])
    score_list_sEH.append(score_dict['sEH'])
    score_list.append(score_dict['all'])

    # Rewritten after every fold so partial results survive an interruption.
    save_list_by_text(score_list, 'cv_all')
    save_list_by_text(score_list_BRD4, 'cv_BRD4')
    save_list_by_text(score_list_HSA, 'cv_HSA')
    save_list_by_text(score_list_sEH, 'cv_sEH')

    # save preds (per fold)
    all_preds.append(preds_test)

    # Spill the OOF frame to disk instead of keeping every fold in memory.
    df_oof.to_parquet(paths.OUTPUT_DIR / f"oof_fold_{fold_id}.parquet")

    del df_oof
    gc.collect()


# Merge the per-fold OOF files with a single concat rather than growing a
# frame inside the loop (which re-copies all accumulated rows every time).
df_oof_all = pd.concat(
    [
        pd.read_parquet(paths.OUTPUT_DIR / f"oof_fold_{fold_id}.parquet")
        for fold_id in range(config.NUM_FOLDS)
    ],
    axis=0,
)

df_oof_all.to_parquet(paths.OUTPUT_DIR / "oof_all.parquet")

# Delete old checkpoint files and the per-fold OOF files that were just merged
for fold in range(config.NUM_FOLDS):
    del_old_ckpt_path(fold, paths.MODEL_WEIGHTS_DIR)
    oof_path = paths.OUTPUT_DIR / f'oof_fold_{fold}.parquet'
    oof_path.unlink()

Latest checkpoint file: /home/working/notebooks/../output/exp073/bio-models-exp073/fold_0.ckpt
fold:0 | CV score = 0.2705225053430353
Latest checkpoint file: /home/working/notebooks/../output/exp073/bio-models-exp073/fold_1.ckpt
fold:1 | CV score = 0.3196267274526359
Latest checkpoint file: /home/working/notebooks/../output/exp073/bio-models-exp073/fold_2.ckpt
fold:2 | CV score = 0.24038834607248052
Latest checkpoint file: /home/working/notebooks/../output/exp073/bio-models-exp073/fold_3.ckpt
fold:3 | CV score = 0.3002557371047689
Latest checkpoint file: /home/working/notebooks/../output/exp073/bio-models-exp073/fold_4.ckpt
fold:4 | CV score = 0.3371607940639312
find 1 ckpts
[PosixPath('/home/working/notebooks/../output/exp073/bio-models-exp073/fold_0.ckpt')]
Latest checkpoint file: /home/working/notebooks/../output/exp073/bio-models-exp073/fold_0.ckpt
find 1 ckpts
[PosixPath('/home/working/notebooks/../output/exp073/bio-models-exp073/fold_1.ckpt')]
Latest checkpoint file: /home/workin

In [28]:
# preds = np.mean(all_preds, 0)

# df_test['binds'] = 0
# df_test.loc[df_test['protein_name']=='BRD4', 'binds'] = preds[df_test['protein_name']=='BRD4', 0]
# df_test.loc[df_test['protein_name']=='HSA', 'binds'] = preds[df_test['protein_name']=='HSA', 1]
# df_test.loc[df_test['protein_name']=='sEH', 'binds'] = preds[df_test['protein_name']=='sEH', 2]
# df_test[['id', 'binds']].to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_1st.csv', index = False)

In [29]:
# # split sharedbb, nonsharedbb
# df_sub = pd.read_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_1st.csv')

# # load parquet dict data
# with open(paths.DATA_DIR / 'my-data/test_id_dict.p', 'rb') as file:
#     test_id_dict = pickle.load(file)
    
# df_shared = df_sub.copy()
# df_non_shared = df_sub.copy()

# df_shared.loc[~df_shared['id'].isin(test_id_dict['shared_bb']), 'binds'] = 0
# df_non_shared.loc[~df_shared['id'].isin(test_id_dict['non_shared_bb']), 'binds'] = 0

# df_shared.to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_shared_bb_1st.csv', index = False)
# df_non_shared.to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_non_shared_bb_1st.csv', index = False)

In [30]:
# Deliberate stop: raising here aborts execution at this cell.
# NOTE(review): presumably meant to keep a "Run All" from continuing into the
# pseudo-labeling section below -- confirm before removing.
raise Exception('end')

Exception: end

# **Pseudo labeling**

In [None]:
def pseudo_labeling(fold_id, df_pseudo):
    """Predict pseudo labels for ``df_pseudo`` with one fold's checkpoint and save them.

    The predictions are stored in the ``TARGETS`` columns of a copy of
    ``df_pseudo`` and written to
    ``paths.OUTPUT_DIR / test_pseudo_label_fold_{fold_id}.parquet``.

    Args:
        fold_id: Fold whose latest checkpoint is used for prediction.
        df_pseudo: Frame to label; the caller's object is left untouched.

    Returns:
        None. The result is only persisted to disk.
    """
    # The banner used to say "Running training", copied from the training
    # function; this function only runs inference.
    print(f"======== Running pseudo labeling for fold {fold_id} =============")

    df_pseudo = df_pseudo.copy()

    # load weight
    model = BioModule()
    ckpt_path = find_latest_ckpt_path(fold_id, paths.MODEL_WEIGHTS_DIR)
    # Deserialize onto the CPU: load_state_dict() copies the values into the
    # model's own parameters, so the checkpoint's original device is not needed.
    weights = torch.load(ckpt_path, map_location='cpu')['state_dict']
    model.load_state_dict(weights)

    # NOTE(review): run_inference passes bb1 / bb23 / bb1_scf / bb23_scf
    # descriptor frames, while this call passes bb1 / bb2 / bb3 / bb1_scf test
    # frames -- confirm these globals and predict_in_batches still agree.
    preds_oof = predict_in_batches(model, df_pseudo,
                                   df_test_bb1_rdkit, df_test_bb2_rdkit, df_test_bb3_rdkit, df_test_bb1_scf_rdkit,
                                   df_test_bb1_ecfp4, df_test_bb2_ecfp4, df_test_bb3_ecfp4, df_test_bb1_scf_ecfp4,
                                   mode='test')

    # The model outputs become the (pseudo) target columns.
    df_pseudo[TARGETS] = preds_oof

    df_pseudo.to_parquet(paths.OUTPUT_DIR / f"test_pseudo_label_fold_{fold_id}.parquet")

    del model, weights, df_pseudo, preds_oof
    gc.collect()

In [None]:
# One pseudo-label file is written per fold. Iterate over config.NUM_FOLDS
# because the training loop that reads these files uses the same range.
for fold_id in range(config.NUM_FOLDS):
    pseudo_labeling(fold_id, df_pseudo)

# **Train with Pseudo-label**

In [None]:
def _stack_train_test(train_part, test_part):
    """Concatenate train rows followed by test rows and renumber the index from 0."""
    return pd.concat([train_part, test_part], axis=0).reset_index(drop=True)


# Combine the train and test descriptor tables (train rows first).
# Note: these assignments rebind module-level names such as df_bb1_rdkit and
# df_bb1_ecfp4, which run_inference also reads as globals.
df_bb1_rdkit = _stack_train_test(df_train_bb1_rdkit, df_test_bb1_rdkit)
df_bb1_ecfp4 = _stack_train_test(df_train_bb1_ecfp4, df_test_bb1_ecfp4)
df_bb2_rdkit = _stack_train_test(df_train_bb2_rdkit, df_test_bb2_rdkit)
df_bb2_ecfp4 = _stack_train_test(df_train_bb2_ecfp4, df_test_bb2_ecfp4)
df_bb3_rdkit = _stack_train_test(df_train_bb3_rdkit, df_test_bb3_rdkit)
df_bb3_ecfp4 = _stack_train_test(df_train_bb3_ecfp4, df_test_bb3_ecfp4)
df_bb1_scf_rdkit = _stack_train_test(df_train_bb1_scf_rdkit, df_test_bb1_scf_rdkit)
df_bb1_scf_ecfp4 = _stack_train_test(df_train_bb1_scf_ecfp4, df_test_bb1_scf_ecfp4)

# Because test rows now follow the train rows, test indices must be shifted by
# the number of train rows in each table.
bb1_offset = len(df_train_bb1_rdkit)
bb2_offset = len(df_train_bb2_rdkit)
bb3_offset = len(df_train_bb3_rdkit)
bb1_scf_offset = len(df_train_bb1_scf_rdkit)

In [None]:
class BioPseudoLabelDataModule(LightningDataModule):
    """Data module that splits a frame by its ``fold`` column.

    Rows whose ``fold`` equals ``fold_id`` form the validation split; every
    other row (including rows tagged with a fold value that is not a real fold
    id) goes to the training split. Both loaders wrap ``BioDataset`` built on
    the module-level bb1 / bb2 / bb3 / bb1-scaffold descriptor tables.
    """

    def __init__(self, df_train, fold_id):
        """Store the train / validation splits of ``df_train`` for ``fold_id``."""
        super().__init__()

        self.train_df = df_train[df_train['fold'] != fold_id]
        self.valid_df = df_train[df_train['fold'] == fold_id]

    def _build_loader(self, frame, mode):
        """Create the DataLoader for ``frame``; ``mode`` is ``'train'`` or ``'valid'``."""
        is_train = mode == 'train'
        dataset = BioDataset(frame,
                             df_bb1_rdkit, df_bb2_rdkit, df_bb3_rdkit, df_bb1_scf_rdkit,
                             df_bb1_ecfp4, df_bb2_ecfp4, df_bb3_ecfp4, df_bb1_scf_ecfp4,
                             mode=mode)
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=config.BATCH_SIZE,
            # Training shuffles and drops the ragged last batch; validation
            # keeps order and every row.
            shuffle=is_train,
            num_workers=config.NUM_WORKERS,
            pin_memory=True,
            # DataLoader raises ValueError for persistent_workers=True when
            # num_workers == 0, so only enable it when workers exist.
            persistent_workers=config.NUM_WORKERS > 0,
            drop_last=is_train,
        )

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return self._build_loader(self.train_df, 'train')

    def val_dataloader(self):
        """Return the ordered validation loader."""
        return self._build_loader(self.valid_df, 'valid')

In [None]:
def add_offset_to_idx(df_pseudo):
    """Return a copy of ``df_pseudo`` with its index columns shifted by the train-table sizes.

    The module-level offsets ``bb1_offset``, ``bb2_offset``, ``bb3_offset`` and
    ``bb1_scf_offset`` are added to the ``buildingblock1_smiles``,
    ``buildingblock2_smiles``, ``buildingblock3_smiles`` and
    ``bb1_scaffold_idx`` columns respectively. The input frame is not modified.
    """
    # Shift the test-data indices by the amount of train data placed ahead of them.
    offset_by_column = {
        'buildingblock1_smiles': bb1_offset,
        'buildingblock2_smiles': bb2_offset,
        'buildingblock3_smiles': bb3_offset,
        'bb1_scaffold_idx': bb1_scf_offset,
    }

    shifted = df_pseudo.copy()
    for column, offset in offset_by_column.items():
        shifted[column] = shifted[column] + offset

    return shifted

In [None]:
def run_training_with_pseudolabel(fold_id, df_train):
    """Train one fold on the training data plus that fold's pseudo-labelled test data.

    The pseudo-labelled frame is read from
    ``paths.OUTPUT_DIR / test_pseudo_label_fold_{fold_id}.parquet``, its index
    columns are shifted with ``add_offset_to_idx`` and it is appended to
    ``df_train``. After fitting, the best checkpoint (by ``valid_score``) is
    reloaded to score the validation split and predict the test set.

    Args:
        fold_id: Fold used as the validation split.
        df_train: Training frame with a ``fold`` column.

    Returns:
        A tuple ``(preds_test, score_dict, valid_df)``:
        the test-set predictions, a dict with the scores under the keys
        ``'BRD4'``, ``'HSA'``, ``'sEH'`` and ``'all'``, and the validation
        rows with one ``<target>_pred`` column added per entry of ``TARGETS``.
    """
    print(f"======== Running training for fold {fold_id} =============")

    # Load the pseudo-labelled test data for this fold
    df_pseudo_fold = pd.read_parquet(paths.OUTPUT_DIR / f"test_pseudo_label_fold_{fold_id}.parquet")
    df_pseudo_fold = add_offset_to_idx(df_pseudo_fold)
    # fold == -1 never equals a non-negative fold id, so these rows always end
    # up in the training split and never in validation.
    df_pseudo_fold['fold'] = -1

    df = pd.concat([df_train, df_pseudo_fold], axis=0).reset_index(drop=True)

    # == init data module and model ==
    model = BioModule()
    datamodule = BioPseudoLabelDataModule(df, fold_id)

    # == init callback ==
    checkpoint_callback = ModelCheckpoint(
        monitor='valid_score',
        dirpath=paths.MODEL_WEIGHTS_DIR,
        save_top_k=1,
        save_last=False,
        save_weights_only=True,
        filename=f"fold_{fold_id}_2nd",
        mode='max',
    )
    # Built but not registered below: early stopping is currently disabled.
    early_stop_callback = EarlyStopping(
        monitor='valid_score',
        mode="max",
        patience=config.PATIENCE,
        verbose=True,
    )
    callbacks_to_use = [checkpoint_callback,
                        # early_stop_callback,
                        RichModelSummary(),
                        RichProgressBar(),
                        ]

    # == init trainer ==
    trainer = Trainer(
        max_epochs=config.EPOCHS,
        callbacks=callbacks_to_use,
        accelerator=device,
        devices=-1,  # use all available devices
        deterministic=False,
        precision='16-mixed' if config.MIXED_PRECISION else 32,
        logger=TensorBoardLogger('lightning_logs', name=f'exp{exp_no}_fold{fold_id}_2nd'),
    )

    # == Training ==
    trainer.fit(model, datamodule=datamodule)
    # Reload the best checkpoint rather than keeping the last-epoch weights.
    # Deserialize onto the CPU: load_state_dict() copies the values into the
    # model's own parameters, so the checkpoint's original device is not needed.
    weights = torch.load(checkpoint_callback.best_model_path, map_location='cpu')['state_dict']

    model.load_state_dict(weights)

    valid_df = datamodule.valid_df

    # Validation rows index into the merged train+test descriptor tables.
    preds_oof = predict_in_batches(model, valid_df,
                                   df_bb1_rdkit, df_bb2_rdkit, df_bb3_rdkit, df_bb1_scf_rdkit,
                                   df_bb1_ecfp4, df_bb2_ecfp4, df_bb3_ecfp4, df_bb1_scf_ecfp4,
                                   mode='valid')
    y_oof = valid_df[TARGETS].values

    score_BRD4, score_HSA, score_sEH, score = calc_score(preds_oof, y_oof)

    valid_df[[f'{target}_pred' for target in TARGETS]] = preds_oof

    print(f'fold:{fold_id} | CV score = {score}')

    # df_test is not offset here, so it is paired with the test-only
    # descriptor tables instead of the merged ones.
    df_test_temp = df_test.drop(['id'], axis=1)
    preds_test = predict_in_batches(model, df_test_temp,
                                    df_test_bb1_rdkit, df_test_bb2_rdkit, df_test_bb3_rdkit, df_test_bb1_scf_rdkit,
                                    df_test_bb1_ecfp4, df_test_bb2_ecfp4, df_test_bb3_ecfp4, df_test_bb1_scf_ecfp4,
                                    mode='test')

    # Drop the large intermediates before returning.
    del model, datamodule, trainer, preds_oof, y_oof
    gc.collect()

    score_dict = {
        'BRD4': score_BRD4,
        "HSA": score_HSA,
        "sEH": score_sEH,
        "all": score,
    }

    return preds_test, score_dict, valid_df

In [None]:
# Fresh per-fold accumulators for the second stage: test predictions, the
# overall score and the per-target scores (each name gets its own list).
all_preds, score_list = [], []
score_list_BRD4, score_list_HSA, score_list_sEH = [], [], []

def save_list_by_text(score_list, filename):
    """Write the scores to ``paths.OUTPUT_DIR/<filename>.txt`` as one ', '-separated line.

    Args:
        score_list: Iterable of values; each one is converted with ``str``.
        filename: Output file name without the ``.txt`` extension.

    Any existing file of the same name is overwritten.
    """
    serialized = ', '.join(map(str, score_list))
    target_path = paths.OUTPUT_DIR / f'{filename}.txt'
    with open(target_path, 'w') as handle:
        handle.write(serialized)
    

for fold_id in range(config.NUM_FOLDS):

    preds_test, score_dict, df_oof = run_training_with_pseudolabel(fold_id, df_train)

    # save score
    score_list_BRD4.append(score_dict['BRD4'])
    score_list_HSA.append(score_dict['HSA'])
    score_list_sEH.append(score_dict['sEH'])
    score_list.append(score_dict['all'])

    # Rewritten after every fold so partial results survive an interruption.
    save_list_by_text(score_list, 'cv_all_2nd')
    save_list_by_text(score_list_BRD4, 'cv_BRD4_2nd')
    save_list_by_text(score_list_HSA, 'cv_HSA_2nd')
    save_list_by_text(score_list_sEH, 'cv_sEH_2nd')

    # save preds (per fold)
    all_preds.append(preds_test)

    # Spill the OOF frame to disk instead of keeping every fold in memory.
    df_oof.to_parquet(paths.OUTPUT_DIR / f"oof_fold_{fold_id}_2nd.parquet")

    del df_oof
    gc.collect()


# Merge the per-fold OOF files with a single concat rather than growing a
# frame inside the loop (which re-copies all accumulated rows every time).
df_oof_all = pd.concat(
    [
        pd.read_parquet(paths.OUTPUT_DIR / f"oof_fold_{fold_id}_2nd.parquet")
        for fold_id in range(config.NUM_FOLDS)
    ],
    axis=0,
)

# Written with the same "_2nd" suffix as every other second-stage artefact;
# the unsuffixed name would overwrite the first-stage merged OOF file.
df_oof_all.to_parquet(paths.OUTPUT_DIR / "oof_all_2nd.parquet")

# **Submission**

In [None]:
# Average the per-fold test predictions (axis 0 runs over folds).
preds = np.mean(all_preds, axis=0)

# Initialise as float: the column receives float predictions below, and
# writing floats into an integer column is a deprecated implicit upcast in pandas.
df_test['binds'] = 0.0
# Each row takes the prediction column that belongs to its own protein:
# column 0 -> BRD4, column 1 -> HSA, column 2 -> sEH.
for target_col, protein in enumerate(('BRD4', 'HSA', 'sEH')):
    is_protein = (df_test['protein_name'] == protein).to_numpy()
    df_test.loc[is_protein, 'binds'] = preds[is_protein, target_col]
df_test[['id', 'binds']].to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_2nd.csv', index = False)


In [None]:

# Split the submission into a shared-building-block file and a
# non-shared-building-block file; ids outside each group get binds = 0.
df_sub = pd.read_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_2nd.csv')

# Load the pickled dict that holds the id collections under the keys
# 'shared_bb' and 'non_shared_bb'.
# NOTE: pickle.load can execute arbitrary code -- only use it on this
# project's own, trusted file.
with open(paths.DATA_DIR / 'my-data/test_id_dict.p', 'rb') as file:
    test_id_dict = pickle.load(file)

df_shared = df_sub.copy()
df_non_shared = df_sub.copy()

# Each frame is masked with its own 'id' column (the non-shared mask used to
# be built from df_shared, a copy-paste slip).
df_shared.loc[~df_shared['id'].isin(test_id_dict['shared_bb']), 'binds'] = 0
df_non_shared.loc[~df_non_shared['id'].isin(test_id_dict['non_shared_bb']), 'binds'] = 0

df_shared.to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_shared_bb_2nd.csv', index = False)
df_non_shared.to_csv(paths.OUTPUT_DIR / f'exp{exp_no}_submission_non_shared_bb_2nd.csv', index = False)

In [None]:
# Delete old checkpoint files and the per-fold OOF files that have been merged
for fold in range(config.NUM_FOLDS):
    del_old_ckpt_path(fold, paths.MODEL_WEIGHTS_DIR)

    # The second stage writes "oof_fold_{fold}_2nd.parquet". The unsuffixed
    # first-stage files are normally already removed by the first-stage cell,
    # so a missing file must not abort the cleanup.
    for suffix in ('', '_2nd'):
        oof_path = paths.OUTPUT_DIR / f'oof_fold_{fold}{suffix}.parquet'
        oof_path.unlink(missing_ok=True)