In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Resolve the project layout from the notebook's current working directory.
CODE_PATH = os.getcwd()
# The project root is the parent of the working directory. The trailing slash is
# kept so that file names can be appended by plain string concatenation.
BASE_PATH = f"{os.path.dirname(CODE_PATH)}/"
DATA_PATH = f"{BASE_PATH}data/"
print(BASE_PATH)

In [None]:
# Read the training and test tables from DATA_PATH.
train_df, test_df = (
    pd.read_csv(f"{DATA_PATH}{csv_name}")
    for csv_name in ('train_df.csv', 'test_df.csv')
)
train_df.head(1)  # preview the first training row

In [None]:
# Split the training table by column position: the first 11 columns become the
# feature frame, and column index 11 is sliced (11:12) so the target stays a
# single-column DataFrame rather than a Series.
X, y = train_df.iloc[:, :11], train_df.iloc[:, 11:12]
X.head(1)
# y.head(1)

In [None]:
# LightGBMで学習
from   sklearn.metrics         import accuracy_score, roc_auc_score
from   sklearn.model_selection import KFold
import wandb
from wandb.lightgbm import wandb_callback, log_summary
import lightgbm as lgb
import numpy    as np
import optuna
# Hide all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
import warnings
warnings.simplefilter('ignore')
#wandb.login()

In [None]:
# Run configuration
FOLD = 5             # number of cross-validation splits
NUM_ROUND = 30000    # number of boosting rounds passed to lgb.train
VERBOSE_EVAL = 5000  # interval (in rounds) between evaluation log lines
SEED = 42            # random seed, for reproducibility
# categorical_list = ['Gender_enc']  # categorical feature columns



In [None]:
class EarlyStoppingVerboseCallback:
    """LightGBM training callback that logs and early-stops on the last metric.

    After every boosting round the callback looks at the *last* entry of
    ``env.evaluation_result_list`` (the last metric of the last validation
    set), optionally prints it, and stops training once that metric has not
    improved for ``stopping_rounds`` consecutive rounds.

    Each evaluation entry is read as
    ``(data_name, eval_name, score, is_higher_better)``. The direction of
    improvement is taken from ``is_higher_better``; entries without that field
    are treated as lower-is-better.

    Args:
        stopping_rounds: Number of consecutive non-improving rounds tolerated
            before training is stopped.
        verbose: When true, print the monitored metric after every round.

    Attributes:
        best_score: Best monitored score seen so far (``inf`` before the first call).
        best_iteration: Iteration index (as reported by ``env.iteration``) of the
            best score, or ``None`` before the first call.
        best_eval_results: Copy of the full evaluation list at the best iteration.
        counter: Number of consecutive rounds without improvement.
    """

    def __init__(self, stopping_rounds, verbose=True):
        self.stopping_rounds = stopping_rounds
        self.verbose = verbose
        self.best_score = float('inf')
        self.best_iteration = None
        self.best_eval_results = None
        self.counter = 0
        # LightGBM's built-in callbacks carry `order` / `before_iteration`, and
        # lgb.train uses them to schedule callbacks; expose the same attributes
        # so this object can be mixed with built-ins such as lgb.log_evaluation.
        self.order = 30
        self.before_iteration = False

    def __call__(self, env):
        """Record the latest score and stop training when patience runs out.

        Args:
            env: LightGBM callback environment; ``iteration`` and
                ``evaluation_result_list`` are read from it.

        Raises:
            lgb.callback.EarlyStopException: When the monitored metric has not
                improved for ``stopping_rounds`` rounds. ``lgb.train`` catches
                this exception to end training.
        """
        # Monitor the last metric of the last validation set.
        last_result = env.evaluation_result_list[-1]
        data_name, eval_name, current_score = last_result[0], last_result[1], last_result[2]
        is_higher_better = bool(last_result[3]) if len(last_result) > 3 else False

        # The first observation is always the best so far; afterwards compare
        # in the direction the metric itself declares.
        if self.best_iteration is None:
            improved = True
        elif is_higher_better:
            improved = current_score > self.best_score
        else:
            improved = current_score < self.best_score

        if improved:
            self.best_score = current_score
            self.best_iteration = env.iteration
            self.best_eval_results = list(env.evaluation_result_list)
            self.counter = 0
        else:
            self.counter += 1

        # Print the evaluation result (mimic verbose_eval)
        if self.verbose:
            print(f'[{env.iteration}] {data_name}: {eval_name}: {current_score}')

        # Setting an attribute on the booster does not stop LightGBM; training
        # is interrupted by raising EarlyStopException with the best iteration
        # and the evaluation results recorded at that iteration.
        if self.counter >= self.stopping_rounds:
            print(f'Early stopping, best iteration is: {self.best_iteration}')
            raise lgb.callback.EarlyStopException(self.best_iteration, self.best_eval_results)

In [None]:
def binary_accuracy_for_lgbm(
    preds: np.ndarray, data: lgb.Dataset, threshold: float=0.5,
    ):
    """Custom LightGBM evaluation function computing binary accuracy.

    Predictions strictly greater than ``threshold`` are labelled 1, all others
    0. The result is the average of the per-sample hits, weighted by the
    dataset's sample weights when ``data.get_weight()`` provides them.

    Args:
        preds: Predicted scores for the samples in ``data``.
        data: Dataset providing the true labels and optional weights.
        threshold: Decision threshold applied to ``preds``.

    Returns:
        Tuple ``(eval_name, eval_result, is_higher_better)``.
    """
    predicted_class = (preds > threshold).astype(int)
    is_correct = data.get_label() == predicted_class
    accuracy = np.average(is_correct, weights=data.get_weight())
    return 'my_bin_acc', accuracy, True

In [None]:
def objective(trial):
    """Optuna objective: mean K-fold validation accuracy of a LightGBM classifier.

    Samples ``boosting_type``, ``num_leaves``, ``learning_rate`` and the
    training metric (stored under the Optuna name ``'metrics'``) from
    ``trial``, trains one model per fold of a shuffled ``FOLD``-split KFold
    over the module-level ``X`` / ``y``, and scores each fold on its held-out
    part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean validation accuracy over the folds (the value the study maximizes).
        The mean AUC is printed alongside it but not returned.
    """
    # LightGBM parameters; the entries built from `trial` are searched by Optuna.
    params = {
        'objective'       : 'binary',
        'boosting_type'   : trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),  # default = gbdt
        'num_leaves'      : trial.suggest_int('num_leaves', 10, 100),                             # default = 31
        'max_depth'       : -1,      # default = -1 (no limit)
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform range.
        'learning_rate'   : trial.suggest_float('learning_rate', 1e-8, 1.0, log=True),            # default = 0.1
        'feature_fraction': 0.8,     # default = 1.0
        'bagging_freq'    : 0,       # default = 0
        'random_state'    : SEED,    # default = None
        'metric'          : trial.suggest_categorical('metrics', ['binary_logloss', 'rmse', 'auc']),  # default = 'binary_logloss'
        #'device_type': 'cuda',  # Use GPU
        # 'gpu_platform_id': 0,  # Platform ID, change if necessary
        # 'gpu_device_id': 0,  # Device ID, change if necessary
    }

    valid_auc = []
    valid_acc = []

    # Shuffled K-fold cross-validation, seeded with the shared SEED constant.
    kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
    for train_indices, valid_indices in kf.split(X):
        # Split features and target by the fold's positional indices.
        X_train, X_valid = X.iloc[train_indices], X.iloc[valid_indices]
        y_train, y_valid = y.iloc[train_indices], y.iloc[valid_indices]

        train_data = lgb.Dataset(X_train, y_train)
        valid_data = lgb.Dataset(X_valid, y_valid)

        # A fresh callback per fold so the patience counter starts from zero.
        early_stopping_verbose_callback = EarlyStoppingVerboseCallback(stopping_rounds=10)
        model = lgb.train(
            params          = params,
            train_set       = train_data,
            valid_sets      = [train_data, valid_data],
            # categorical_feature = categorical_list,  # categorical columns (optional)
            num_boost_round = NUM_ROUND,
            callbacks       = [early_stopping_verbose_callback, lgb.log_evaluation(1)],
            feval           = binary_accuracy_for_lgbm,
        )

        # Predict the held-out fold with the trained model.
        y_valid_pred = model.predict(X_valid)
        # y is a one-column DataFrame; flatten it once for the sklearn metrics.
        y_valid_true = y_valid.to_numpy().squeeze()

        # AUC (noted by the original author as the competition's evaluation metric).
        valid_auc.append(roc_auc_score(y_valid_true, y_valid_pred))

        # Accuracy on rounded predictions (np.round rounds halves to even).
        valid_acc.append(accuracy_score(y_valid_true, np.round(y_valid_pred)))

    # Cross-validated means; the study maximizes the accuracy.
    cv_acc = np.mean(valid_acc)
    cv_auc = np.mean(valid_auc)
    print('Accuracy: {}, auc: {}'.format(cv_acc, cv_auc))
    return cv_acc

# Hyperparameter search with Optuna.
# The objective returns accuracy, so the study maximizes it (the original author
# notes that maximizing AUC would be preferable).
# The sampler is seeded with SEED so repeated runs propose the same trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=10)  # 10 trials

In [None]:
# Retrain LightGBM with the best hyperparameters found by Optuna.
# The fixed entries mirror the configuration used inside `objective`, so the
# retrained models correspond to what the study actually evaluated.
params = {
    'objective'       : 'binary',
    'boosting_type'   : study.best_params['boosting_type'],  # tuned by Optuna
    'num_leaves'      : study.best_params['num_leaves'],     # tuned by Optuna
    'max_depth'       : -1,                                  # default = -1 (no limit)
    # The key must be exactly 'learning_rate'; the previous key had trailing
    # spaces, so the tuned value was not passed under the real parameter name.
    'learning_rate'   : study.best_params['learning_rate'],  # tuned by Optuna
    'feature_fraction': 0.8,                                 # default = 1.0
    'bagging_freq'    : 0,                                   # same as during tuning
    # Single seed entry: 'seed' and 'random_state' are aliases, so giving both
    # with different values is contradictory.
    'random_state'    : SEED,
    'metric'          : study.best_params['metrics'],        # tuned by Optuna
}
valid_auc = []   # per-fold validation AUC
valid_acc = []   # per-fold validation accuracy
models    = []   # one trained booster per fold

# Shuffled K-fold cross-validation, seeded with the shared SEED constant.
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
for fold, (train_indices, valid_indices) in enumerate(kf.split(X)):
    # Split features and target by the fold's positional indices.
    X_train, X_valid = X.iloc[train_indices], X.iloc[valid_indices]
    y_train, y_valid = y.iloc[train_indices], y.iloc[valid_indices]

    train_data = lgb.Dataset(X_train, y_train)
    valid_data = lgb.Dataset(X_valid, y_valid)

    model = lgb.train(
        params          = params,
        train_set       = train_data,
        valid_sets      = [train_data, valid_data],
        #categorical_feature = categorical_list,  # categorical columns (optional)
        num_boost_round = NUM_ROUND,
        callbacks       = [
            lgb.early_stopping(stopping_rounds=10, verbose=True),  # early stopping
            lgb.log_evaluation(VERBOSE_EVAL),                      # periodic console logging
            #wandb_callback()
        ],
        feval           = binary_accuracy_for_lgbm,  # custom evaluation function
    )

    # Predict the held-out fold with the trained model.
    y_valid_pred = model.predict(X_valid)
    # y is a one-column DataFrame; flatten it once for the sklearn metrics.
    y_valid_true = y_valid.to_numpy().squeeze()

    # AUC (noted by the original author as the competition's evaluation metric).
    auc = roc_auc_score(y_valid_true, y_valid_pred)
    valid_auc.append(auc)

    # Accuracy on rounded predictions (np.round rounds halves to even).
    acc = accuracy_score(y_valid_true, np.round(y_valid_pred))
    valid_acc.append(acc)
    print('fold {} Accuracy:{}, auc:{}'.format(fold, acc, auc))

    # Keep the fold model for test-time averaging.
    models.append(model)

# Cross-validated means over the folds.
cv_acc = np.mean(valid_acc)
cv_auc = np.mean(valid_auc)
print('Accuracy: {}, auc: {}'.format(cv_acc, cv_auc))


In [None]:
# Predict the test table with every fold model, then average across models.
test_y_preds = [fold_model.predict(test_df) for fold_model in models]
test_prediction = np.mean(test_y_preds, axis=0)

# Label a row 1 when the averaged prediction is at least 0.5, and write the
# labels (with the DataFrame index, without a header row) to the submission file.
binary_prediction = (test_prediction >= 0.5).astype(int)
binary_pred_df = pd.DataFrame(binary_prediction)
binary_pred_df.to_csv('./binary_submit.csv', header=False)

In [None]:
# def train():
#     with wandb.init(job_type="sweep") as run:
#         params = {
#             'objective'              : 'binary',
#             'boosting_type'          : wandb.config.boosting_type,
#             'num_leaves'             : wandb.config.num_leaves,
#             'max_depth'              : wandb.config.max_depth,
#             'learning_rate'          : wandb.config.learning_rate,
#             'feature_fraction'       : wandb.config.feature_fraction,
#             # 'bagging_fraction'       : wandb.config.bagging_fraction,
#             # 'bagging_freq'           : wandb.config.bagging_freq,
#             # 'lambda_l1'              : wandb.config.lambda_l1,
#             # 'lambda_l2'              : wandb.config.lambda_l2,
#             # 'min_data_in_leaf'       : wandb.config.min_data_in_leaf,
#             # 'min_sum_hessian_in_leaf': wandb.config.min_sum_hessian_in_leaf,
#             # 'cat_smooth'             : wandb.config.cat_smooth,
#             'random_state'           : SEED,
#             'metric'                 : wandb.config.metric,
#             'verbose'                : -1  # Avoid warnings of `No further splits with positive gain, best gain: -inf`
#         }
#         # kFold交差検定で決定係数を算出し、各セットの平均値を返す
#         kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)
#         for fold, (train_indices, valid_indices) in enumerate(kf.split(X)):
#             # 指定したindexで学習・評価データを分ける
#             X_train, X_valid = X.iloc[train_indices], X.iloc[valid_indices] 
#             y_train, y_valid = y.iloc[train_indices], y.iloc[valid_indices] 

#             train_data = lgb.Dataset(X_train, y_train) 
#             valid_data = lgb.Dataset(X_valid, y_valid)

#             model = lgb.train(
#                 params = params,
#                 train_set             = train_data,
#                 valid_sets            = [train_data, valid_data],
#                 #categorical_feature   = categorical_list,         # カテゴリ値のカラムを指定(やらんでも動く)
#                 num_boost_round       = NUM_ROUND,
#                 callbacks             =[lgb.early_stopping( stopping_rounds=10, 
#                                                             verbose=True), # early_stopping用コールバック関数
#                                         lgb.log_evaluation(VERBOSE_EVAL),
#                                         wandb_callback()
#                                         ], # コマンドライン出力用コールバック関数
#                 feval                 = binary_accuracy_for_lgbm, # 評価用関数
#             )

#             # 学習したモデルでバリデーションデータを予測
#             y_valid_pred = model.predict(X_valid)

#             # aucを計算（本問題の運営側 評価方法）
#             auc = roc_auc_score(y_valid.to_numpy().squeeze(), y_valid_pred) # 引数：正解データ & 予測データ
#             valid_auc.append(auc)

#             # 正解率を計算
#             acc = accuracy_score(y_valid.to_numpy().squeeze(),np.round(y_valid_pred)) # 引数：正解データ & 予測データ(四捨五入（銀行丸めになっている点は注意)）
#             valid_acc.append(acc)

#             ## 交差検証の正解率の平均
#             run.summary["cv_valid_acc"] = np.mean(valid_acc)
#             run.summary["cv_valid_auc"] = np.mean(valid_auc)


In [None]:
import yaml

# Load the sweep configuration from the YAML file in CODE_PATH.
# The path is anchored to CODE_PATH so the lookup does not depend on a later
# change of the working directory. yaml.safe_load is used to parse the file.
with open(os.path.join(CODE_PATH, 'lightGBMparams.yaml'), 'r') as config_file:
    sweep_config = yaml.safe_load(config_file)

In [None]:
#sweep_id = wandb.sweep(sweep_config, project="signate-LiverDisease")

#### Sweep

In [None]:
# Launch the W&B sweep agent only when its prerequisites exist. In this notebook
# the `sweep_id = wandb.sweep(...)` line and the `train` function are commented
# out, so an unconditional call raises NameError on a fresh "Run All".
if 'sweep_id' in globals() and callable(globals().get('train')):
    wandb.agent(sweep_id, function=train
                #, count=10
                )
else:
    print('Skipping wandb sweep: define `sweep_id` and `train` first.')

#### finish

In [None]:
# Tell wandb that the current run is finished.
wandb.finish()