In [None]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from tqdm.auto import tqdm

In [None]:
# Directory layout used by the notebook; all paths are relative to the current working directory.
DATA_ROOT = Path('./data')                    # location of the input parquet file(s)
WORKING_DIR = Path('./')                      # base directory that STUDY_PATH is derived from
STUDY_PATH = WORKING_DIR.joinpath('studies')  # where pickled studies are read from / written to

In [None]:
# Read the working dataset parquet file from DATA_ROOT into df_train.
df_train = pd.read_parquet(DATA_ROOT.joinpath('working_dataset.parquet'))

In [None]:
# Number of cross-validation folds; train() also passes it to tqdm as the progress-bar total.
n_splits = 5
# Stratified, group-aware K-fold splitter with shuffling disabled.
cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=False)

# NOTE(review): X and y are literal Ellipsis placeholders here. train() indexes both
# with .iloc, so they must be replaced with real pandas objects before it is called.
# train() also reads a global `weeks` that is not defined in the visible cells —
# TODO confirm where it is set (presumably alongside X and y).
X = ...
y = ...

In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Ensemble wrapper that averages the outputs of the estimators it is given.

    The wrapper never trains anything itself: ``fit`` is a no-op, and both
    prediction methods simply take the mean (over estimators) of the
    individual outputs.
    """

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the wrapped estimators are used as given."""
        return self

    def predict(self, X):
        """Return the mean over estimators of each ``estimator.predict(X)``."""
        outputs = []
        for member in self.estimators:
            outputs.append(member.predict(X))
        return np.mean(outputs, axis=0)

    def predict_proba(self, X):
        """Return the mean over estimators of each ``estimator.predict_proba(X)``.

        If an ``AttributeError`` is raised while collecting the probabilities,
        the outputs of ``predict`` are averaged for *all* estimators instead.
        """
        outputs = []
        try:
            for member in self.estimators:
                outputs.append(member.predict_proba(X))
        except AttributeError:
            # Discard any partial results and fall back to predict() across the board.
            outputs = [member.predict(X) for member in self.estimators]
        return np.mean(outputs, axis=0)

In [None]:
def train(params):
    """Fit one LightGBM classifier per CV fold and collect out-of-fold predictions.

    Relies on the notebook-level globals ``X``, ``y``, ``weeks``, ``cv`` and
    ``n_splits``; ``X``, ``y`` and ``weeks`` are indexed positionally with the
    fold indices, so they must be row-aligned.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.

    Returns
    -------
    model : VotingModel
        Wrapper around the per-fold fitted classifiers.
    cv_scores : list of float
        ROC AUC of each fold, computed on that fold's validation rows.
    oof_preds : pandas.DataFrame
        One row per validation sample with columns ``score`` (column 1 of
        ``predict_proba``), ``week`` and ``target`` (cast to float). The same
        frame is written to ``./oofs/lightgbm.parquet``.
    """
    fitted_models = []
    cv_scores = []
    fold_frames = []

    # StratifiedGroupKFold.split takes the group labels as its third argument;
    # without them the folds are not group-aware.
    # NOTE(review): `weeks` is used as the grouping because it is the only
    # per-row series this function already aligns with X — confirm.
    splits = cv.split(X, y, groups=weeks)

    for idx_train, idx_valid in tqdm(splits, total=n_splits):
        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        fold_model = lgb.LGBMClassifier(**params)
        fold_model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            # Log the eval metric every 200 rounds; stop after 60 rounds without improvement.
            callbacks=[lgb.log_evaluation(200), lgb.early_stopping(60)],
        )
        fitted_models.append(fold_model)

        y_pred_valid = fold_model.predict_proba(X_valid)[:, 1]

        # Collect per-fold frames and concatenate once after the loop instead of
        # growing a DataFrame with pd.concat on every iteration.
        fold_frames.append(
            pd.DataFrame({
                'score': y_pred_valid,
                'week': weeks.iloc[idx_valid],
                'target': y_valid.astype(float),
            })
        )

        cv_scores.append(roc_auc_score(y_valid, y_pred_valid))

    oof_preds = pd.concat(fold_frames)

    # Make sure the output directory exists so the write does not fail on a fresh checkout.
    oof_path = Path('./oofs/lightgbm.parquet')
    oof_path.parent.mkdir(parents=True, exist_ok=True)
    oof_preds.to_parquet(oof_path)

    model = VotingModel(fitted_models)
    return model, cv_scores, oof_preds

In [None]:
import joblib

# Create or load the study
def create_or_load_study(study_name, storage=None):
    """Return the study pickled under ``STUDY_PATH``, or create a new one.

    Parameters
    ----------
    study_name : str
        Name of the study; ``<study_name>.pkl`` is looked up in ``STUDY_PATH``.
    storage : optional
        Forwarded to ``optuna.create_study`` only when a new study is created.
        It is not consulted when a pickled study is found on disk.

    Returns
    -------
    The study loaded from the pickle file, or a newly created study with
    ``direction='maximize'`` if the file does not exist.
    """
    study_file = f"{STUDY_PATH}/{study_name}.pkl"
    try:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code — only load study files that you produced yourself.
        study = joblib.load(study_file)
    except FileNotFoundError:
        study = optuna.create_study(study_name=study_name, storage=storage, direction='maximize')
        print(f"Created new study '{study_name}'.")
    else:
        print(f"Loaded study '{study_name}' from file.")
    return study

# Function to save the study
def save_study(study, study_name):
    """Pickle ``study`` to ``<STUDY_PATH>/<study_name>.pkl`` using joblib.

    ``STUDY_PATH`` is created first if it does not exist, so the first save
    after creating a brand-new study does not fail on a missing directory.

    Parameters
    ----------
    study
        Object to persist (passed unchanged to ``joblib.dump``).
    study_name : str
        Stem of the target ``.pkl`` file; also used in the confirmation message.
    """
    STUDY_PATH.mkdir(parents=True, exist_ok=True)
    joblib.dump(study, f"{STUDY_PATH}/{study_name}.pkl")
    print(f"Study '{study_name}' saved to file.")