# Leash Bio

- positive data多めに使う。残りはrandom sampling
- bbごとにsoft labelingしてみる


## ref
- https://www.kaggle.com/code/yyyu54/pytorch-version-belka-1dcnn-starter-with-all-data
- https://www.kaggle.com/code/ahmedelfazouan/belka-1dcnn-starter-with-all-data/notebook

In [2]:
# Experiment identifier; it is interpolated into the local output directory
# name (output/exp{exp_no}) and the model-weights directory name.
exp_no = '000'
# NOTE(review): DEBUG is not read anywhere in the visible part of this notebook.
DEBUG = False
# data_ratio = 1/5

In [3]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
from sklearn.metrics import average_precision_score as APS
from sklearn.model_selection import StratifiedKFold

import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

from pytorch_lightning import LightningModule
from pytorch_lightning import LightningDataModule, Trainer
# seed_everything
from pytorch_lightning.callbacks import (
    ModelCheckpoint, 
    EarlyStopping,
    ModelCheckpoint,
    RichModelSummary,
    RichProgressBar,
)
from pytorch_lightning.loggers import TensorBoardLogger
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import timm
from timm.utils import ModelEmaV2

from funcs.utils import find_latest_ckpt_path, del_old_ckpt_path
from funcs.calc_descriptor import calc_rdkit_descriptors, calc_ecfp4_descriptors
from funcs.tokenize import tokenize_smiles

import warnings
warnings.simplefilter('ignore')

In [4]:
import os
from pathlib import Path

def is_kaggle_kernel():
    """Return True when the Kaggle working directory exists on this machine."""
    kaggle_working_dir = '/kaggle/working'
    return os.path.exists(kaggle_working_dir)

# Resolve the project directories for the environment the notebook runs in.
if not is_kaggle_kernel():
    # Local run: everything lives one level above the current working directory.
    BASE_DIR = Path(os.getcwd()) / './../'
    DATA_DIR = BASE_DIR / "data"
    OUTPUT_DIR = BASE_DIR / f"output/exp{exp_no}"
else:
    # Kaggle run: fixed /kaggle layout.
    BASE_DIR = Path("/kaggle")
    DATA_DIR, OUTPUT_DIR = BASE_DIR / "input", BASE_DIR / "working"
    print('on kaggle notebook')

# Select the compute device: MPS is preferred, then CUDA, otherwise the CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print('Using', torch.cuda.device_count(), 'GPU(s)')
print('pytorch:', torch.__version__)

Using 0 GPU(s)
pytorch: 2.3.1


In [5]:
class config:
    """Experiment settings, used as a plain namespace of class attributes."""
    # Seed handed to my_seed_everything() further down in the notebook.
    SEED = 2024

    # NOTE(review): none of the fields below are read in the visible part of
    # this notebook; their meaning is inferred from the names only — confirm
    # against the training code before relying on these comments.
    PREPROCESS = False
    EPOCHS = 20 #20
    PATIENCE = 5 #20
    BATCH_SIZE = 4096
    NUM_WORKERS = 16

    USE_EMA = False

    LR = 1e-3
    WEIGHT_DECAY = 1e-6
    MIXED_PRECISION = True

    NUM_FOLDS = 5
    USE_NUM_FOLD = 1
    
class paths:
    """Filesystem locations used by the notebook.

    Defining this class has a side effect: OUTPUT_DIR (including missing
    parents) is created on disk by the mkdir() call in the class body.
    """
    # Re-export the module-level directories under the `paths` namespace.
    DATA_DIR = DATA_DIR
    OUTPUT_DIR = OUTPUT_DIR
    # Per-experiment directory name for model weights (not created here).
    MODEL_WEIGHTS_DIR = OUTPUT_DIR / f"bio-models-exp{exp_no}"

    # Directory holding the parquet tables and the smiles2idx pickle files.
    SHRUNKEN_DATA_DIR = DATA_DIR / "shrunken-data"

    TRAIN_PATH = SHRUNKEN_DATA_DIR / "train.parquet"
    TEST_PATH = SHRUNKEN_DATA_DIR / "test.parquet"
    SUB_PATH = SHRUNKEN_DATA_DIR / "sub.parquet"

    # Executed once, when the class body is evaluated.
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [6]:
# Announce the seeding step in the cell output.
print('fix seed')

def my_seed_everything(seed: int):
    """Seed Python's, NumPy's and PyTorch's random generators with *seed*.

    Also stores the seed in the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three seeders are independent of each other, so one loop suffices.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    
# Seed every RNG with the configured seed. The Lightning seed_everything call
# is kept for reference but disabled in favour of the local helper.
# seed_everything(config.SEED, workers=True)
my_seed_everything(config.SEED)

fix seed


# **Load Data**

In [7]:
# Building-block columns (raw and scaffold variants) plus the fold column.
bb_cols = ['buildingblock1_smiles', 'buildingblock2_smiles','buildingblock3_smiles', 
           'buildingblock1_smiles_scaffold', "buildingblock2_smiles_scaffold", "buildingblock3_smiles_scaffold",
           'fold']

# One binary label column per protein target.
TARGETS = ['binds_BRD4', 'binds_HSA','binds_sEH']

# Load only the columns needed below from the training parquet file.
df_train = pd.read_parquet(paths.TRAIN_PATH, columns=bb_cols + TARGETS)

In [78]:
# Prepare the soft labels: the mean of every target per building-block value.
def _mean_targets_by(column):
    """Return the per-group mean of each TARGETS column, grouping df_train by *column*."""
    return df_train.groupby(column)[TARGETS].mean()


bb1_mean = _mean_targets_by('buildingblock1_smiles')
bb2_mean = _mean_targets_by('buildingblock2_smiles')
bb3_mean = _mean_targets_by('buildingblock3_smiles')

# Same statistic, grouped by the scaffold columns of building blocks 2 and 3.
bb2_scf_mean = _mean_targets_by('buildingblock2_smiles_scaffold')
bb3_scf_mean = _mean_targets_by('buildingblock3_smiles_scaffold')

In [79]:
# Load the conversion dictionaries (smiles -> idx) and build their inverses.
def _load_pickle(file_name):
    """Unpickle *file_name* from the shrunken-data directory.

    NOTE: pickle.load can execute arbitrary code; only load trusted files.
    """
    with open(paths.SHRUNKEN_DATA_DIR / file_name, mode='rb') as handle:
        return pickle.load(handle)


def _invert(mapping):
    """Return a new dict that maps each value of *mapping* back to its key."""
    return {idx: smiles for smiles, idx in mapping.items()}


bb1_smiles2idx = _load_pickle('bb1_smiles2idx.pickle')
bb23_smiles2idx = _load_pickle('bb23_smiles2idx.pickle')
bb1_scaffold_smiles2idx = _load_pickle('bb1_scaffold_smiles2idx.pickle')
bb23_scaffold_smiles2idx = _load_pickle('bb23_scaffold_smiles2idx.pickle')

bb1_idx2smiles = _invert(bb1_smiles2idx)
bb23_idx2smiles = _invert(bb23_smiles2idx)
bb1_scaffold_idx2smiles = _invert(bb1_scaffold_smiles2idx)
bb23_scaffold_idx2smiles = _invert(bb23_scaffold_smiles2idx)

In [10]:
# Check whether there are any bbs (building blocks) whose labels are all 0

In [86]:
# Placeholders for the {target: [values]} results; each name is reassigned
# further down from the return value of check_zero_index_list().
bb2_index_list = {}
bb3_index_list = {}
bb2_scf_index_list = {}
bb3_scf_index_list = {}


def check_zero_index_list(bb_mean, col_name, df=None, targets=None):
    """Find the group values whose mean is exactly 0 for each target.

    For every target, prints one line ``target : value : row_count`` per
    zero-mean value, where ``row_count`` is the number of rows of *df* whose
    *col_name* equals that value.

    Args:
        bb_mean: DataFrame indexed by the values of *col_name*, with one
            column of per-group means for each target.
        col_name: Column of *df* that *bb_mean* was grouped by.
        df: Table used for the row counts. Defaults to the module-level
            ``df_train``.
        targets: Target column names to inspect. Defaults to the module-level
            ``TARGETS``.

    Returns:
        dict mapping each target to the list of index values of *bb_mean*
        whose mean for that target equals 0.
    """
    if df is None:
        df = df_train
    if targets is None:
        targets = TARGETS

    bb_index_list = {
        target: bb_mean[bb_mean[target] == 0].index.tolist()
        for target in targets
    }

    # Count rows per value in ONE pass over the column instead of running a
    # full-column comparison for every zero-mean value; skipped entirely when
    # there is nothing to report.
    if any(bb_index_list.values()):
        rows_per_value = df[col_name].value_counts()
        for target in targets:
            for value in bb_index_list[target]:
                print(target, ":", value, ":", rows_per_value.get(value, 0))

    return bb_index_list
            
# For each grouping column, print the values whose mean is exactly 0 for some
# target and keep the returned {target: [values]} dicts; they are consumed by
# the submission post-processing further down.
print('##### bb2 #####')
bb2_index_list = check_zero_index_list(bb2_mean, 'buildingblock2_smiles')
print('##### bb3 #####')
bb3_index_list = check_zero_index_list(bb3_mean, 'buildingblock3_smiles')
print('##### bb2 scf #####')
bb2_scf_index_list = check_zero_index_list(bb2_scf_mean, 'buildingblock2_smiles_scaffold')
print('##### bb3 scf #####')
bb3_scf_index_list = check_zero_index_list(bb3_scf_mean, 'buildingblock3_smiles_scaffold')

##### bb2 #####
binds_HSA : 1469 : 76917
##### bb3 #####
binds_BRD4 : 302 : 5674
binds_BRD4 : 387 : 10265
binds_BRD4 : 699 : 11345
binds_BRD4 : 799 : 23962
binds_BRD4 : 883 : 2162
binds_BRD4 : 928 : 1895
binds_BRD4 : 1028 : 792
binds_BRD4 : 1160 : 7295
binds_BRD4 : 1207 : 7566
binds_BRD4 : 1226 : 266
binds_BRD4 : 1301 : 6438
binds_BRD4 : 1311 : 14006
binds_BRD4 : 1331 : 30768
binds_BRD4 : 1377 : 1625
binds_BRD4 : 1379 : 8087
binds_BRD4 : 1411 : 995
binds_BRD4 : 1414 : 16501
binds_BRD4 : 1458 : 5134
binds_HSA : 70 : 43798
binds_HSA : 228 : 542
binds_HSA : 1226 : 266
binds_sEH : 1226 : 266
##### bb2 scf #####
binds_HSA : 458 : 76917
##### bb3 scf #####
binds_BRD4 : 48 : 7566
binds_BRD4 : 71 : 16501
binds_BRD4 : 116 : 2162
binds_BRD4 : 170 : 792
binds_BRD4 : 197 : 14006
binds_HSA : 305 : 542


# SUBをいじる

In [38]:
# Table whose `id`, `protein_name` and building-block columns are used to pick
# the rows to zero out.
# NOTE(review): the variable is named df_test but is read from paths.SUB_PATH
# (sub.parquet), not paths.TEST_PATH — confirm this is the intended file.
df_test = pd.read_parquet(paths.SUB_PATH)

In [103]:
# Get the ids of the records that use a bb whose labels are all 0.
def get_zero_pred_ids(bb_index_list, col_name, df=None, targets=None):
    """Collect the ids of rows that use a zero-rate value for their own protein.

    A row is selected when its *col_name* value is listed in
    ``bb_index_list[target]`` and its ``protein_name`` equals the target name
    with the ``binds_`` prefix removed.

    Args:
        bb_index_list: dict mapping each target to a list of *col_name* values.
        col_name: Column of *df* to compare against the listed values.
        df: Table with ``id``, ``protein_name`` and *col_name* columns.
            Defaults to the module-level ``df_test``.
        targets: Target names to process. Defaults to the module-level
            ``TARGETS``.

    Returns:
        list of ids, ordered by target, then by listed value, then by row
        order in *df*.
    """
    if df is None:
        df = df_test
    if targets is None:
        targets = TARGETS

    zero_pred_ids = []
    for target in targets:
        index_list = bb_index_list[target]
        if not index_list:
            # Nothing listed for this target: skip the table scan entirely.
            continue
        # The protein filter does not depend on the listed value, so apply it
        # once per target; the inner loop then scans only that protein's rows.
        protein_rows = df.loc[df['protein_name'] == target.replace('binds_', '')]
        for index in index_list:
            zero_pred_ids += protein_rows.loc[protein_rows[col_name] == index, 'id'].values.tolist()
    return zero_pred_ids

# Gather the ids from all four zero-rate lists and drop duplicates.
zero_pred_ids = list(set(
    record_id
    for index_lists, column in (
        (bb2_index_list, 'buildingblock2_smiles'),
        (bb3_index_list, 'buildingblock3_smiles'),
        (bb2_scf_index_list, 'buildingblock2_smiles_scaffold'),
        (bb3_scf_index_list, 'buildingblock3_smiles_scaffold'),
    )
    for record_id in get_zero_pred_ids(index_lists, column)
))

In [73]:
# Ensemble submission to post-process (file name without the .csv extension).
filename = "submission_015_016_031_034_037_043_044_054_056"
path = BASE_DIR / f'output/ensemble/{filename}.csv'
df_sub = pd.read_csv(path)

In [74]:
# Overwrite the prediction with 0 for every id collected in zero_pred_ids.
df_sub.loc[df_sub['id'].isin(zero_pred_ids), 'binds'] = 0

In [75]:
# Write the post-processed submission alongside the input, with a _pp suffix.
df_sub.to_csv(BASE_DIR / f'output/ensemble/{filename}_pp.csv', index=False)