In [1]:
import gc
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import time
from pathlib import Path
from glob import glob
from tqdm import tqdm
import warnings
from sklearn.metrics import average_precision_score as APS
warnings.filterwarnings('ignore')
from functools import partial
import optuna

In [2]:
import os
from pathlib import Path

BASE_DIR = Path(os.getcwd()) / './../'
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / f"output"

ENS_OUTPUT_DIR  = OUTPUT_DIR / "ensemble"
ENS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
true_cols = ["binds_BRD4", "binds_HSA", "binds_sEH"]
path = OUTPUT_DIR / f"exp066" / f"oof_all.parquet"
df_gt = pd.read_parquet(path, columns=true_cols).sort_index()

In [4]:
def ensemble(use_exp_list, index_list):
    
    preds = []
    pred_col = ["binds_BRD4_pred","binds_HSA_pred","binds_sEH_pred"]
    for exp in use_exp_list:
        
        path = OUTPUT_DIR / f"exp{exp}" / f"oof_all.parquet"
        df_temp = pd.read_parquet(path, columns=pred_col)        
        preds.append(df_temp.loc[index_list,:].values)
        
    y_preds = np.mean(preds, axis=0)

    # df_temp[pred_col] = preds
    
    del df_temp
    gc.collect()
    
    return y_preds

def calc_score(y_preds, y_true):
    score_BRD4 = APS(y_true[:,0], y_preds[:,0])
    score_HSA = APS(y_true[:,1], y_preds[:,1])
    score_sEH = APS(y_true[:,2], y_preds[:,2])
    score = (score_BRD4 + score_HSA + score_sEH) / 3
    
    return score

In [5]:
exp_list = [
    "062",
    "063",
    "064",
    "064_42",
    "065",
    "066",
    "067",    
    "068",    
    "069",
    "070",
    "070_717",
    "071",
    # "072",#ボツ
    "072_524",
    "073",
    "073_2",
    "074",
    # "076",#ボツ
    "077_1024",
    "078_1123",
    "078_5", # 計算待ち
    "079_123",
    "079_3", # 計算待ち
    "080_125",
]

pred_col = ["binds_BRD4_pred","binds_HSA_pred",
            # "binds_sEH_pred"
            ]
for exp in exp_list:
    path = OUTPUT_DIR / f"exp{exp}" / f"oof_all.parquet"
    df_temp = pd.read_parquet(path, columns=pred_col)
    print(exp, len(df_temp))
    

062 98415610
063 98415610
064 98415610
064_42 98415610
065 98415610
066 98415610
067 98415610
068 98415610
069 98415610
070 98415610
070_717 98415610
071 98415610
072_524 98415610
073 98415610
073_2 98415610
074 98415610
077_1024 98415610
078_1123 98415610
078_5 98415610
079_123 98415610
079_3 19683122
080_125 98415610


In [6]:
result = []
values = []
for random_state in [42, 524, 717, 1024, 1123]:
    
    study = optuna.create_study(direction='maximize')
    
    index_list = df_gt.sample(int(len(df_gt)/5), random_state=random_state).index
    
    def objective(trial):
        # 20個の0または1を取る変数を生成
        variables = [trial.suggest_int(str(exp), 0, 1) for exp in exp_list]
        
        exp_array = np.array(exp_list)
        var_array = np.array(variables)
        use_exp_list = exp_array[np.where(var_array == 1)]
        # print(use_exp_list)
        
        # ensemble
        y_preds = ensemble(use_exp_list, index_list)
        
        y_true = df_gt.loc[index_list, true_cols].values
        score = calc_score(y_preds, y_true)
        
        del y_preds
        gc.collect()
        
        return score
    
    def ensemble(use_exp_list, index_list):
    
        preds = []
        pred_col = ["binds_BRD4_pred","binds_HSA_pred","binds_sEH_pred"]
        for exp in use_exp_list:
            
            path = OUTPUT_DIR / f"exp{exp}" / f"oof_all.parquet"
            df_temp = pd.read_parquet(path, columns=pred_col)        
            preds.append(df_temp.loc[index_list,:].values)
            
        y_preds = np.mean(preds, axis=0)

        # df_temp[pred_col] = preds
        
        del df_temp
        gc.collect()
        
        return y_preds
    
    # new_objective = partial(objective, index_list=index_list, exp_list=exp_list)
    # new_objective = partial(objective, index_list=index_list, exp_list=exp_list)
    
    study.optimize(objective, n_trials=100)

    # 最適な結果を表示します
    print('Best trial:')
    trial = study.best_trial
    result.append(trial.params)
    values.append(trial.value)

    print(f'Value: {trial.value}')

    print('Best parameters:')
    for key, value in trial.params.items():
        print(f'{key}: {value}')
        
    result_df = pd.DataFrame(result)
    result_df['score'] = values
    i = 5
    result_df.to_csv(ENS_OUTPUT_DIR / f"optuna_240708_{i}.csv", index=False)

[I 2024-07-08 15:55:47,088] A new study created in memory with name: no-name-24152f72-8b98-4a78-8c23-99d8d8cdebd7
[W 2024-07-08 15:56:22,051] Trial 0 failed with parameters: {'062': 0, '063': 0, '064': 1, '064_42': 1, '065': 0, '066': 0, '067': 1, '068': 0, '069': 0, '070': 0, '070_717': 1, '071': 0, '072_524': 0, '073': 0, '073_2': 1, '074': 1, '077_1024': 1, '078_1123': 1, '078_5': 0, '079_123': 0, '079_3': 0, '080_125': 1} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_449/254986739.py", line 19, in objective
    y_preds = ensemble(use_exp_list, index_list)
  File "/tmp/ipykernel_449/254986739.py", line 37, in ensemble
    preds.append(df_temp.loc[index_list,:].values)
  File "/usr/local/lib/python3.10/site-packages/pandas/core/indexing.py", line 1147, in __getitem__
    return self._geti

KeyboardInterrupt: 

In [None]:
result_df = pd.DataFrame(result)
result_df.to_csv(ENS_OUTPUT_DIR / "optuna_240708_1024.csv", index=False)

In [None]:
import pandas as pd
result_df = pd.read_csv(ENS_OUTPUT_DIR / "optuna_240708_1024.csv")