In [None]:
import logging
import os
from pathlib import Path
import json
import joblib
import wandb
import pandas as pd
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig
from hydra.utils import instantiate

from kaggle import KaggleApi
from lightning import seed_everything
from sklearn.model_selection import BaseCrossValidator

from src.utils.metrics import binary_classification_metrics

In [None]:
def parse_overrides(raw: str) -> list[str]:
    """Split a comma-separated Hydra override string into a clean list.

    Whitespace around each entry is stripped and empty entries (from a blank
    string, a trailing comma, or doubled commas) are dropped, so the result
    never contains "" or " key=value".

    Args:
        raw: Comma-separated overrides, e.g. "experiment=002e, seed=1".

    Returns:
        The non-empty, stripped override strings in their original order.
    """
    return [item.strip() for item in raw.split(",") if item.strip()]


DEFAULT_OVERRIDES = "experiment=002e"

# Hydra overrides come from the OVERRIDES environment variable (comma-separated).
# An unset, blank, or separator-only value falls back to DEFAULT_OVERRIDES.
OVERRIDES: list[str] = parse_overrides(os.getenv("OVERRIDES", "")) or parse_overrides(DEFAULT_OVERRIDES)
WANDB_KEY = os.getenv("WANDB_KEY", None)  # input your wandb key as environment variable

In [None]:
# OVERRIDES is produced by splitting a string, so it is never None; what can
# actually go wrong is a list with no usable entry (empty list or only blanks).
if not any(override.strip() for override in OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config programmatically (there is no @hydra.main entry point
# in a notebook) and register it so code relying on HydraConfig keeps working.
with initialize(version_base=None, config_path="../../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG; attach a stream handler only once so re-running the
# cell does not duplicate every log line.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)

KAGGLE_CLIENT = KaggleApi()
KAGGLE_CLIENT.authenticate()

INPUT_DIR = Path(CFG.paths.input_dir)
RESOURCES_DIR = Path(CFG.paths.resource_dir)


logger.info(f"start {OVERRIDES} 🚀")
seed_everything(CFG.seed)
# WANDB_KEY is None when the environment variable is not set.
wandb.login(key=WANDB_KEY)

In [None]:
inputs = CFG.stacking.level_0.inputs

# Read each level-0 model's validation predictions and join them column-wise.
# Frames are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop.
level0_frames = []

for i, i_input in enumerate(inputs):
    # "text" and "label" are taken from the first input only; every other
    # input contributes just its prediction column.
    usecols = ["text", "label", "pred"] if i == 0 else ["pred"]

    df = pd.read_csv(RESOURCES_DIR / "outputs" / i_input / "valid_results.csv", usecols=usecols).rename(
        columns={"pred": f"pred_{i}"}
    )

    # A column-wise concat of frames with different lengths would silently pad
    # with NaN and misalign predictions against the labels of the first input.
    if level0_frames and len(df) != len(level0_frames[0]):
        raise ValueError(
            f"row count mismatch for input '{i_input}': "
            f"expected {len(level0_frames[0])} rows, got {len(df)}"
        )
    level0_frames.append(df)

# pd.concat rejects an empty list, so keep the empty-frame result for no inputs.
train_df = pd.concat(level0_frames, axis=1) if level0_frames else pd.DataFrame()

### CV Split


In [None]:
def assign_fold_index(train_df: pd.DataFrame, kfold: BaseCrossValidator) -> pd.DataFrame:
    """Label every row of ``train_df`` with the fold in which it is validated.

    A ``fold`` column is added to ``train_df`` in place (initialised to -1) and
    the same DataFrame object is returned.

    Args:
        train_df: Frame containing a ``label`` column, which is passed as ``y``
            to ``kfold.split``.
        kfold: Splitter whose ``split(X, y)`` yields ``(train, valid)`` index
            pairs; the valid indices are treated as row positions.

    Returns:
        ``train_df`` itself, with the ``fold`` column filled in. Rows that never
        appear in a validation split keep the value -1.
    """
    train_df["fold"] = -1
    fold_col = train_df.columns.get_loc("fold")
    for fold_index, (_, valid_index) in enumerate(kfold.split(X=train_df, y=train_df["label"])):
        # The split indices are positional, so write through .iloc; label-based
        # .loc is only equivalent when the index is the default 0..n-1 range.
        train_df.iloc[valid_index, fold_col] = fold_index
    return train_df


# Build the splitter described by the Hydra `cv` config node, then tag each
# training row with the fold in which it is used for validation.
kfold = instantiate(CFG.cv)
train_df = assign_fold_index(train_df, kfold)
logger.debug(f"train_df.shape: {train_df.shape}")

In [None]:
# Every column except these is used as a feature (i.e. the pred_* columns).
no_feature_cols = ["text", "label", "fold"]
feature_cols = [col for col in train_df.columns if col not in no_feature_cols]

result_dfs = []
base_output_dir = Path(CFG.paths.output_dir)  # store output_dir for later use

try:
    for i_fold in range(CFG.n_splits):
        if i_fold not in CFG.train_folds:
            continue

        is_valid_row = train_df["fold"] == i_fold
        i_train_df = train_df.loc[~is_valid_row].reset_index(drop=True)
        i_valid_df = train_df.loc[is_valid_row].reset_index(drop=True)

        # CFG.paths.output_dir is pointed at the fold directory while the fold
        # runs; it is restored in the `finally` clause below.
        fold_output_dir = base_output_dir / f"fold{i_fold}"
        CFG.paths.output_dir = str(fold_output_dir)
        weights_path = fold_output_dir / "best.pkl"
        weights_path.parent.mkdir(parents=True, exist_ok=True)

        # training
        estimator = instantiate(CFG.stacking.level_0.model.estimator)
        estimator.fit(X=i_train_df[feature_cols], y=i_train_df["label"])
        joblib.dump(estimator, weights_path)

        # validation: predictions are clipped into [0, 1]
        val_predictions = estimator.predict(X=i_valid_df[feature_cols]).clip(0, 1)

        # save dataframe assigned validation predictions
        i_result_df = i_valid_df.assign(pred=val_predictions)
        joblib.dump(i_result_df, fold_output_dir / "val_predictions.pkl")
        result_dfs.append(i_result_df)

        # evaluate
        scores = binary_classification_metrics(y_true=i_valid_df["label"], y_pred=val_predictions)
        # Context manager so the file is flushed and closed before the next fold.
        with open(fold_output_dir / "valid_scores.json", "w") as scores_file:
            json.dump(scores, scores_file)
        logger.info(f"fold{i_fold} scores: {scores}")
finally:
    # Restore even when a fold fails; otherwise a re-run of this cell would
    # read a fold directory as the base and nest its outputs under it.
    CFG.paths.output_dir = str(base_output_dir)  # restore output_dir

if not result_dfs:
    raise ValueError(
        f"no fold was trained: train_folds={list(CFG.train_folds)}, n_splits={CFG.n_splits}"
    )

# Out-of-fold predictions of all trained folds, stacked row-wise.
valid_results_df = pd.concat(result_dfs, axis=0).reset_index(drop=True)
valid_results_df.to_csv(base_output_dir / "valid_results.csv", index=False)

In [None]:
# Display the concatenated per-fold validation predictions written to valid_results.csv.
valid_results_df