In [70]:
import gc
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import time
from pathlib import Path
from glob import glob
from tqdm import tqdm
import warnings
from sklearn.metrics import average_precision_score as APS
warnings.filterwarnings('ignore')
from functools import partial
import optuna

In [71]:
import os
from pathlib import Path

BASE_DIR = Path(os.getcwd()) / './../'
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / f"output"

ENS_OUTPUT_DIR  = OUTPUT_DIR / "ensemble"
ENS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [72]:
true_cols = ["binds_BRD4", "binds_HSA", "binds_sEH"]
path = OUTPUT_DIR / f"exp066" / f"oof_all.parquet"
df_gt = pd.read_parquet(path, columns=true_cols).sort_index()

In [73]:
def ensemble(use_exp_list, index_list):
    
    preds = []
    pred_col = ["binds_BRD4_pred","binds_HSA_pred","binds_sEH_pred"]
    for exp in use_exp_list:
        
        path = OUTPUT_DIR / f"exp{exp}" / f"oof_all.parquet"
        df_temp = pd.read_parquet(path, columns=pred_col)        
        preds.append(df_temp.loc[index_list,:].values)
        
    y_preds = np.mean(preds, axis=0)

    # df_temp[pred_col] = preds
    
    del df_temp
    gc.collect()
    
    return y_preds

def calc_score(y_preds, y_true):
    score_BRD4 = APS(y_true[:,0], y_preds[:,0])
    score_HSA = APS(y_true[:,1], y_preds[:,1])
    score_sEH = APS(y_true[:,2], y_preds[:,2])
    score = (score_BRD4 + score_HSA + score_sEH) / 3
    
    return score

In [74]:
exp_list = [
    "062",
    "063",
    "064",
    "064_42",
    "065",
    "066",
    "067",    
    "068",    
    "069",
    "070",
    # "070_717",
    "071",
    "072",
    "072_524",
    "073",
    "074",
    # "076",
    "077_1024",
    "078_1123",
    "079_123",
    "080_125",
]

# pred_col = ["binds_BRD4_pred","binds_HSA_pred",
#             # "binds_sEH_pred"
#             ]
# for exp in exp_list:
#     path = OUTPUT_DIR / f"exp{exp}" / f"oof_all.parquet"
#     df_temp = pd.read_parquet(path, columns=pred_col)
#     print(exp, len(df_temp))
    

In [79]:
study = optuna.create_study(direction='maximize')

result = []
for random_state in [42, 524, 717, 1024, 1123]:
    
    index_list = df_gt.sample(int(len(df_gt)/10), random_state=random_state).index
    print(index_list[:10])
    
    def objective(trial):
        # 20個の0または1を取る変数を生成
        variables = [trial.suggest_int(str(exp), 0, 1) for exp in exp_list]
        
        exp_array = np.array(exp_list)
        var_array = np.array(variables)
        use_exp_list = exp_array[np.where(var_array == 1)]
        # print(use_exp_list)
        
        # ensemble
        y_preds = ensemble(use_exp_list, index_list)
        
        y_true = df_gt.loc[index_list, true_cols].values
        score = calc_score(y_preds, y_true)
        
        del y_preds
        gc.collect()
        
        return score
    
    def ensemble(use_exp_list, index_list):
    
        preds = []
        pred_col = ["binds_BRD4_pred","binds_HSA_pred","binds_sEH_pred"]
        for exp in use_exp_list:
            
            path = OUTPUT_DIR / f"exp{exp}" / f"oof_all.parquet"
            df_temp = pd.read_parquet(path, columns=pred_col)        
            preds.append(df_temp.loc[index_list,:].values)
            
        y_preds = np.mean(preds, axis=0)

        # df_temp[pred_col] = preds
        
        del df_temp
        gc.collect()
        
        return y_preds
    
    # new_objective = partial(objective, index_list=index_list, exp_list=exp_list)
    # new_objective = partial(objective, index_list=index_list, exp_list=exp_list)
    
    study.optimize(objective, n_trials=5)

    # 最適な結果を表示します
    print('Best trial:')
    trial = study.best_trial
    result.append(trial.params)

    print(f'Value: {trial.value}')

    print('Best parameters:')
    for key, value in trial.params.items():
        print(f'{key}: {value}')

[I 2024-07-08 02:08:51,989] A new study created in memory with name: no-name-80805a8f-3cd3-4e16-a7e8-5faed9d119eb


Index([38734477,  6113446, 16141361, 65251159,  5579301, 20407207,  4456174,
       57202049, 33913505,  2939869],
      dtype='int64')


[I 2024-07-08 02:10:15,141] Trial 0 finished with value: 0.36572867217816424 and parameters: {'062': 1, '063': 0, '064': 1, '064_42': 0, '065': 0, '066': 0, '067': 0, '068': 0, '069': 1, '070': 0, '071': 1, '072': 1, '072_524': 0, '073': 1, '074': 0, '077_1024': 1, '078_1123': 1, '079_123': 1, '080_125': 0}. Best is trial 0 with value: 0.36572867217816424.
[I 2024-07-08 02:11:41,143] Trial 1 finished with value: 0.3426386468472607 and parameters: {'062': 1, '063': 1, '064': 1, '064_42': 1, '065': 0, '066': 1, '067': 0, '068': 1, '069': 1, '070': 1, '071': 0, '072': 1, '072_524': 0, '073': 0, '074': 0, '077_1024': 0, '078_1123': 0, '079_123': 1, '080_125': 1}. Best is trial 0 with value: 0.36572867217816424.
[I 2024-07-08 02:13:00,732] Trial 2 finished with value: 0.3547671984836609 and parameters: {'062': 0, '063': 0, '064': 0, '064_42': 1, '065': 1, '066': 0, '067': 1, '068': 1, '069': 1, '070': 1, '071': 0, '072': 1, '072_524': 0, '073': 0, '074': 1, '077_1024': 1, '078_1123': 1, '07

Best trial:
Value: 0.36572867217816424
Best parameters:
062: 1
063: 0
064: 1
064_42: 0
065: 0
066: 0
067: 0
068: 0
069: 1
070: 0
071: 1
072: 1
072_524: 0
073: 1
074: 0
077_1024: 1
078_1123: 1
079_123: 1
080_125: 0
Index([47635583, 46943820, 74124686, 40400210,  4525261, 57349069, 74115063,
       25482895, 64196399, 62277734],
      dtype='int64')


[I 2024-07-08 02:16:59,702] Trial 5 finished with value: 0.33693805052460135 and parameters: {'062': 1, '063': 0, '064': 0, '064_42': 0, '065': 0, '066': 1, '067': 1, '068': 0, '069': 1, '070': 1, '071': 1, '072': 0, '072_524': 1, '073': 0, '074': 0, '077_1024': 0, '078_1123': 0, '079_123': 0, '080_125': 1}. Best is trial 0 with value: 0.36572867217816424.
[I 2024-07-08 02:18:17,495] Trial 6 finished with value: 0.4152117005913307 and parameters: {'062': 0, '063': 0, '064': 1, '064_42': 1, '065': 0, '066': 0, '067': 1, '068': 1, '069': 1, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 0, '074': 1, '077_1024': 1, '078_1123': 1, '079_123': 0, '080_125': 1}. Best is trial 6 with value: 0.4152117005913307.
[I 2024-07-08 02:19:30,899] Trial 7 finished with value: 0.29434238947739694 and parameters: {'062': 0, '063': 0, '064': 1, '064_42': 0, '065': 0, '066': 1, '067': 1, '068': 1, '069': 1, '070': 0, '071': 0, '072': 1, '072_524': 0, '073': 1, '074': 1, '077_1024': 1, '078_1123': 0, '07

Best trial:
Value: 0.4152117005913307
Best parameters:
062: 0
063: 0
064: 1
064_42: 1
065: 0
066: 0
067: 1
068: 1
069: 1
070: 0
071: 0
072: 0
072_524: 1
073: 0
074: 1
077_1024: 1
078_1123: 1
079_123: 0
080_125: 1
Index([93857239, 12893077, 18930551,  1570349, 22157193, 41997880, 34843164,
       72454862, 61052776, 92009712],
      dtype='int64')


[I 2024-07-08 02:23:33,689] Trial 10 finished with value: 0.42267015212135733 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 0, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 1, '078_1123': 1, '079_123': 0, '080_125': 1}. Best is trial 10 with value: 0.42267015212135733.
[I 2024-07-08 02:24:48,067] Trial 11 finished with value: 0.42267015212135733 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 0, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 1, '078_1123': 1, '079_123': 0, '080_125': 1}. Best is trial 10 with value: 0.42267015212135733.
[I 2024-07-08 02:26:00,689] Trial 12 finished with value: 0.42267015212135733 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 0, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 1, '078_1123':

Best trial:
Value: 0.42267015212135733
Best parameters:
062: 0
063: 1
064: 1
064_42: 1
065: 1
066: 0
067: 0
068: 0
069: 0
070: 0
071: 0
072: 0
072_524: 1
073: 1
074: 0
077_1024: 1
078_1123: 1
079_123: 0
080_125: 1
Index([ 3200488, 78156342, 96179489, 31764937, 24915474,    76894, 16047896,
        5451622, 86251237, 78652214],
      dtype='int64')


[I 2024-07-08 02:29:41,844] Trial 15 finished with value: 0.42520516808126657 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 0, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 1, '078_1123': 1, '079_123': 0, '080_125': 1}. Best is trial 15 with value: 0.42520516808126657.
[I 2024-07-08 02:30:55,169] Trial 16 finished with value: 0.42520516808126657 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 0, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 1, '078_1123': 1, '079_123': 0, '080_125': 1}. Best is trial 15 with value: 0.42520516808126657.
[I 2024-07-08 02:31:53,782] Trial 17 finished with value: 0.38693015114726864 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 0, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 0, '078_1123':

Best trial:
Value: 0.42520516808126657
Best parameters:
062: 0
063: 1
064: 1
064_42: 1
065: 1
066: 0
067: 0
068: 0
069: 0
070: 0
071: 0
072: 0
072_524: 1
073: 1
074: 0
077_1024: 1
078_1123: 1
079_123: 0
080_125: 1
Index([ 5962652, 95773954, 84715492, 84836951, 32495753, 55026869, 68969673,
       88362740, 50374796, 96715473],
      dtype='int64')


[I 2024-07-08 02:35:54,204] Trial 20 finished with value: 0.3689833912099681 and parameters: {'062': 1, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 1, '067': 0, '068': 0, '069': 0, '070': 0, '071': 1, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 0, '078_1123': 0, '079_123': 1, '080_125': 1}. Best is trial 15 with value: 0.42520516808126657.
[I 2024-07-08 02:37:14,087] Trial 21 finished with value: 0.42458019628941085 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 0, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 1, '078_1123': 1, '079_123': 0, '080_125': 1}. Best is trial 15 with value: 0.42520516808126657.
[I 2024-07-08 02:38:32,106] Trial 22 finished with value: 0.42458019628941085 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 0, '070': 0, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '077_1024': 1, '078_1123': 

Best trial:
Value: 0.42520516808126657
Best parameters:
062: 0
063: 1
064: 1
064_42: 1
065: 1
066: 0
067: 0
068: 0
069: 0
070: 0
071: 0
072: 0
072_524: 1
073: 1
074: 0
077_1024: 1
078_1123: 1
079_123: 0
080_125: 1


In [82]:
result_df = pd.DataFrame(result)
result_df.to_csv(ENS_OUTPUT_DIR / "optuna.csv", index=False)