In [7]:
import gc
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import time
from pathlib import Path
from glob import glob
from tqdm import tqdm
import warnings
from sklearn.metrics import average_precision_score as APS
warnings.filterwarnings('ignore')
from functools import partial
import optuna

In [3]:
import os
from pathlib import Path

# All paths are resolved relative to the parent of the current working directory.
BASE_DIR = Path.cwd() / './../'
DATA_DIR = BASE_DIR.joinpath("data")
OUTPUT_DIR = BASE_DIR.joinpath("output")

# Directory for the ensemble artefacts; created up front (with parents) if missing.
ENS_OUTPUT_DIR = OUTPUT_DIR.joinpath("ensemble")
ENS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [9]:
# Ground-truth label columns, read from the OOF file of experiment 066
# and ordered by index.
true_cols = ["binds_BRD4", "binds_HSA", "binds_sEH"]
path = OUTPUT_DIR / "exp066" / "oof_all.parquet"
df_gt = pd.read_parquet(path, columns=true_cols)
df_gt = df_gt.sort_index()

In [10]:
def ensemble(use_exp_list, index_list):
    """Average the out-of-fold predictions of the selected experiments.

    For every experiment id in ``use_exp_list`` the prediction columns of
    ``OUTPUT_DIR / f"exp{exp}" / "oof_all.parquet"`` are read, the rows
    labelled by ``index_list`` are selected with ``DataFrame.loc`` and the
    resulting arrays are averaged element-wise.

    Args:
        use_exp_list: Sequence (list or NumPy array) of experiment ids.
        index_list: Index labels of the rows to include.

    Returns:
        A 2-D ``numpy.ndarray`` holding the mean prediction, one column per
        entry of ``pred_col`` (BRD4, HSA, sEH, in that order).

    Raises:
        ValueError: If ``use_exp_list`` is empty. Previously this case fell
            through to ``del df_temp`` and failed with an unrelated
            ``UnboundLocalError`` after ``np.mean`` had already produced NaN.
    """
    # ``len`` rather than truthiness: callers pass NumPy arrays, whose
    # truth value is ambiguous for more than one element.
    if len(use_exp_list) == 0:
        raise ValueError("use_exp_list must contain at least one experiment")

    pred_col = ["binds_BRD4_pred", "binds_HSA_pred", "binds_sEH_pred"]
    preds = []
    for exp in use_exp_list:
        path = OUTPUT_DIR / f"exp{exp}" / "oof_all.parquet"
        df_temp = pd.read_parquet(path, columns=pred_col)
        preds.append(df_temp.loc[index_list, :].values)
        # Drop the full frame right away so that only one is alive at a time.
        del df_temp

    y_preds = np.mean(preds, axis=0)
    gc.collect()

    return y_preds

def calc_score(y_preds, y_true):
    """Return the unweighted mean of the per-target average precision.

    Column ``i`` of ``y_preds`` is scored against column ``i`` of ``y_true``
    with ``APS`` for the first three columns (BRD4, HSA, sEH in the callers'
    column order), and the three scores are averaged.
    """
    per_target = [APS(y_true[:, col], y_preds[:, col]) for col in range(3)]
    return sum(per_target) / 3

In [11]:
# Experiment ids whose OOF predictions are candidates for the ensemble.
# Deliberately left out:
#   "072", "076"     -- discarded
#   "078_5", "079_3" -- still waiting for computation to finish
exp_list = [
    "062", "063", "064", "064_42",
    "065", "066", "067", "068",
    "069", "070", "070_717", "071",
    "072_524", "073", "073_2", "074",
    "077_1024", "078_1123", "079_123", "080_125",
]

# pred_col = ["binds_BRD4_pred","binds_HSA_pred",
#             # "binds_sEH_pred"
#             ]
# for exp in exp_list:
#     path = OUTPUT_DIR / f"exp{exp}" / f"oof_all.parquet"
#     df_temp = pd.read_parquet(path, columns=pred_col)
#     print(exp, len(df_temp))
    

In [14]:


# Best 0/1 selection found for each seed (one dict of trial params per study).
result = []
for random_state in [42, 524, 717, 1024, 1123]:

    # Seed the sampler as well as the row subsample; otherwise the search
    # itself is not reproducible from run to run.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )

    # Every trial of this study is scored on the same random 10% of the rows,
    # so the labels are looked up once here instead of once per trial.
    index_list = df_gt.sample(len(df_gt) // 10, random_state=random_state).index
    y_true = df_gt.loc[index_list, true_cols].values

    def objective(trial):
        """Score the ensemble of the experiments switched on by this trial.

        One binary (0/1) parameter is suggested per entry of ``exp_list``;
        the experiments whose parameter is 1 are averaged by ``ensemble`` and
        evaluated with ``calc_score`` on the rows in ``index_list``.
        """
        variables = [trial.suggest_int(str(exp), 0, 1) for exp in exp_list]
        use_exp_list = np.array(exp_list)[np.array(variables) == 1]

        # An all-zero selection has nothing to average; give it the worst
        # possible score instead of letting the whole study crash.
        if len(use_exp_list) == 0:
            return 0.0

        y_preds = ensemble(use_exp_list, index_list)
        score = calc_score(y_preds, y_true)

        del y_preds
        gc.collect()

        return score

    study.optimize(objective, n_trials=10)

    # Show the best result of this study and keep its parameters.
    print('Best trial:')
    trial = study.best_trial
    result.append(trial.params)

    print(f'Value: {trial.value}')

    print('Best parameters:')
    for key, value in trial.params.items():
        print(f'{key}: {value}')

[I 2024-07-08 04:34:34,485] A new study created in memory with name: no-name-eec474c6-b73b-4fc1-bbff-eac47ddb55df
[I 2024-07-08 04:36:43,601] Trial 0 finished with value: 0.3856460438036468 and parameters: {'062': 1, '063': 0, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 1, '068': 1, '069': 0, '070': 1, '070_717': 1, '071': 0, '072': 1, '072_524': 1, '073': 1, '074': 1, '076': 0, '077_1024': 0, '078_1123': 1, '079_123': 1, '080_125': 1}. Best is trial 0 with value: 0.3856460438036468.
[I 2024-07-08 04:38:40,156] Trial 1 finished with value: 0.37947215916710847 and parameters: {'062': 1, '063': 1, '064': 0, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 0, '069': 1, '070': 0, '070_717': 1, '071': 1, '072': 1, '072_524': 1, '073': 1, '074': 0, '076': 1, '077_1024': 0, '078_1123': 1, '079_123': 0, '080_125': 1}. Best is trial 0 with value: 0.3856460438036468.
[I 2024-07-08 04:41:04,793] Trial 2 finished with value: 0.37260948418419165 and parameters: {'062': 1, '063': 1, '064': 1,

Best trial:
Value: 0.40782728070441854
Best parameters:
062: 1
063: 0
064: 0
064_42: 1
065: 1
066: 0
067: 0
068: 0
069: 1
070: 1
070_717: 1
071: 0
072: 0
072_524: 1
073: 0
074: 1
076: 0
077_1024: 0
078_1123: 0
079_123: 1
080_125: 1


[I 2024-07-08 04:54:34,883] Trial 0 finished with value: 0.37320651144462896 and parameters: {'062': 1, '063': 1, '064': 1, '064_42': 0, '065': 1, '066': 0, '067': 1, '068': 1, '069': 0, '070': 0, '070_717': 1, '071': 1, '072': 1, '072_524': 1, '073': 1, '074': 1, '076': 1, '077_1024': 1, '078_1123': 1, '079_123': 1, '080_125': 1}. Best is trial 0 with value: 0.37320651144462896.
[I 2024-07-08 04:56:20,629] Trial 1 finished with value: 0.3163324026330759 and parameters: {'062': 1, '063': 0, '064': 1, '064_42': 0, '065': 1, '066': 1, '067': 0, '068': 1, '069': 0, '070': 1, '070_717': 0, '071': 1, '072': 1, '072_524': 1, '073': 0, '074': 1, '076': 1, '077_1024': 0, '078_1123': 1, '079_123': 0, '080_125': 0}. Best is trial 0 with value: 0.37320651144462896.
[I 2024-07-08 04:57:37,783] Trial 2 finished with value: 0.34000119907786397 and parameters: {'062': 0, '063': 0, '064': 0, '064_42': 0, '065': 0, '066': 1, '067': 1, '068': 1, '069': 0, '070': 0, '070_717': 0, '071': 1, '072': 1, '072

Best trial:
Value: 0.3953176316731435
Best parameters:
062: 0
063: 1
064: 1
064_42: 1
065: 1
066: 0
067: 0
068: 0
069: 0
070: 1
070_717: 0
071: 0
072: 0
072_524: 0
073: 1
074: 0
076: 0
077_1024: 1
078_1123: 1
079_123: 0
080_125: 1


[I 2024-07-08 05:10:36,047] Trial 0 finished with value: 0.3631497060889766 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 1, '066': 0, '067': 0, '068': 1, '069': 0, '070': 1, '070_717': 1, '071': 1, '072': 1, '072_524': 0, '073': 1, '074': 1, '076': 1, '077_1024': 0, '078_1123': 1, '079_123': 1, '080_125': 1}. Best is trial 0 with value: 0.3631497060889766.
[I 2024-07-08 05:11:40,066] Trial 1 finished with value: 0.4237575857231463 and parameters: {'062': 0, '063': 0, '064': 0, '064_42': 1, '065': 0, '066': 1, '067': 0, '068': 1, '069': 0, '070': 0, '070_717': 1, '071': 0, '072': 0, '072_524': 1, '073': 1, '074': 0, '076': 0, '077_1024': 0, '078_1123': 0, '079_123': 1, '080_125': 0}. Best is trial 1 with value: 0.4237575857231463.
[I 2024-07-08 05:13:08,011] Trial 2 finished with value: 0.31361605328488373 and parameters: {'062': 1, '063': 1, '064': 0, '064_42': 0, '065': 1, '066': 1, '067': 1, '068': 0, '069': 0, '070': 1, '070_717': 0, '071': 1, '072': 0, '072_52

Best trial:
Value: 0.4237575857231463
Best parameters:
062: 0
063: 0
064: 0
064_42: 1
065: 0
066: 1
067: 0
068: 1
069: 0
070: 0
070_717: 1
071: 0
072: 0
072_524: 1
073: 1
074: 0
076: 0
077_1024: 0
078_1123: 0
079_123: 1
080_125: 0


[I 2024-07-08 05:25:09,154] Trial 0 finished with value: 0.37803763625362014 and parameters: {'062': 1, '063': 0, '064': 1, '064_42': 0, '065': 1, '066': 1, '067': 0, '068': 0, '069': 1, '070': 1, '070_717': 1, '071': 0, '072': 0, '072_524': 1, '073': 0, '074': 1, '076': 1, '077_1024': 1, '078_1123': 0, '079_123': 1, '080_125': 1}. Best is trial 0 with value: 0.37803763625362014.
[I 2024-07-08 05:27:09,324] Trial 1 finished with value: 0.358257249302316 and parameters: {'062': 0, '063': 0, '064': 1, '064_42': 1, '065': 1, '066': 1, '067': 1, '068': 0, '069': 1, '070': 1, '070_717': 0, '071': 1, '072': 0, '072_524': 0, '073': 1, '074': 0, '076': 1, '077_1024': 1, '078_1123': 1, '079_123': 1, '080_125': 0}. Best is trial 0 with value: 0.37803763625362014.
[I 2024-07-08 05:28:40,564] Trial 2 finished with value: 0.35461179975370066 and parameters: {'062': 0, '063': 0, '064': 0, '064_42': 1, '065': 0, '066': 0, '067': 1, '068': 0, '069': 1, '070': 0, '070_717': 0, '071': 1, '072': 1, '072_

Best trial:
Value: 0.412510259156591
Best parameters:
062: 1
063: 0
064: 1
064_42: 1
065: 1
066: 0
067: 0
068: 1
069: 1
070: 0
070_717: 1
071: 0
072: 0
072_524: 1
073: 1
074: 0
076: 0
077_1024: 1
078_1123: 1
079_123: 1
080_125: 0


[I 2024-07-08 05:43:52,143] Trial 0 finished with value: 0.30804880853013855 and parameters: {'062': 1, '063': 0, '064': 1, '064_42': 1, '065': 1, '066': 1, '067': 1, '068': 0, '069': 0, '070': 1, '070_717': 0, '071': 1, '072': 1, '072_524': 0, '073': 1, '074': 1, '076': 1, '077_1024': 0, '078_1123': 1, '079_123': 0, '080_125': 0}. Best is trial 0 with value: 0.30804880853013855.
[I 2024-07-08 05:44:59,529] Trial 1 finished with value: 0.3661438848381681 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 0, '065': 0, '066': 0, '067': 0, '068': 0, '069': 0, '070': 1, '070_717': 0, '071': 0, '072': 1, '072_524': 0, '073': 0, '074': 0, '076': 0, '077_1024': 1, '078_1123': 1, '079_123': 0, '080_125': 0}. Best is trial 1 with value: 0.3661438848381681.
[I 2024-07-08 05:47:06,536] Trial 2 finished with value: 0.3889567861093011 and parameters: {'062': 0, '063': 1, '064': 1, '064_42': 1, '065': 0, '066': 1, '067': 1, '068': 0, '069': 1, '070': 0, '070_717': 1, '071': 0, '072': 1, '072_5

Best trial:
Value: 0.3954999534043389
Best parameters:
062: 1
063: 0
064: 1
064_42: 0
065: 0
066: 0
067: 1
068: 0
069: 0
070: 1
070_717: 1
071: 1
072: 0
072_524: 0
073: 0
074: 0
076: 0
077_1024: 1
078_1123: 1
079_123: 1
080_125: 0


In [15]:
# One row per seed (the best trial's params), one 0/1 column per experiment id.
result_df = pd.DataFrame(result)
result_df.to_csv(
    path_or_buf=ENS_OUTPUT_DIR / "optuna_240708_1.csv",
    index=False,
)

In [4]:
import pandas as pd
# Load the saved per-seed selections back from the ensemble output directory.
result_df = pd.read_csv(filepath_or_buffer=ENS_OUTPUT_DIR / "optuna_240708_1.csv")