In [None]:
import logging
import os
from pathlib import Path
import numpy as np
import imker
import nltk
import pandas as pd
import wandb
import hydra
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig
from hydra.utils import instantiate
from kaggle import KaggleApi
from lightning import seed_everything
from sklearn.metrics import roc_auc_score, log_loss
from src.utils.metrics import opt_acc_score, opt_f1_score
from src.utils.kaggle_utils import download_kaggle_competition_dataset, download_kaggle_datasets
from collections import defaultdict
from typing import DefaultDict
from imker.types import ArrayLike

In [None]:
def parse_overrides(raw: str) -> list[str]:
    """Split a comma-separated override string into a clean list.

    Surrounding whitespace is stripped from every entry and blank entries
    (produced by an empty string, a trailing comma, or a doubled comma) are
    dropped, so only non-empty override strings are returned.

    Note: the split is a plain ``,`` split, so an override whose *value*
    contains a comma is still broken into several entries.

    Args:
        raw: Comma-separated overrides, e.g. ``"experiment=foo, debug=true"``.

    Returns:
        The non-empty, stripped entries in their original order.
    """
    return [entry.strip() for entry in raw.split(",") if entry.strip()]


# Overrides passed to the config composition below; falls back to the
# 004-tabular experiment when the OVERRIDES environment variable is unset.
OVERRIDES: list[str] = parse_overrides(os.getenv("OVERRIDES", "experiment=004-tabular"))
WANDB_KEY = os.getenv("WANDB_KEY", None)  # input your wandb key as environment variable

In [None]:
# OVERRIDES is built with `str.split`, which always returns a list, so it can
# never be None. A missing/blank value shows up as an empty list or a list of
# empty strings instead, which is what this guard has to detect.
if not any(OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config from ../../configs with the requested overrides.
with initialize(version_base=None, config_path="../../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Only attach a stream handler when the root logger has none, so re-running
# this cell does not stack duplicate handlers.
if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)

# Authenticated Kaggle client used by the download cell.
KAGGLE_CLIENT = KaggleApi()
KAGGLE_CLIENT.authenticate()

INPUT_DIR = Path(CFG.paths.input_dir)

logger.info(f"start {OVERRIDES} 🚀")
seed_everything(CFG.seed)
wandb.login(key=WANDB_KEY)

# Download the "punkt" resource into the output dir and add that dir to the
# NLTK search path so it can be found afterwards.
nltk.data.path.append(CFG.paths.output_dir)
nltk.download("punkt", download_dir=CFG.paths.output_dir)

# NOTE(review): presumably lowers cuML's log verbosity for the tasks
# instantiated later — confirm that cuML reads this variable at that point.
os.environ["CUML_LOG_LEVEL"] = "error"

### Load Data


In [None]:
# Fetch the competition files, then the external datasets listed in the
# config; both land in the same input directory.
download_kaggle_competition_dataset(
    client=KAGGLE_CLIENT, competition=CFG.meta.competition, out_dir=INPUT_DIR
)
download_kaggle_datasets(
    client=KAGGLE_CLIENT, datasets=CFG.kaggle.external_datasets, out_dir=INPUT_DIR
)

In [None]:
# Training data comes from the external dataset downloaded above.
TRAIN_CSV = INPUT_DIR / "thedrcat/daigt-v2-train-dataset/train_v2_drcat_02.csv"
DEBUG_SAMPLE_SIZE = 100  # number of rows kept when CFG.debug is enabled

train_df = pd.read_csv(TRAIN_CSV)

if CFG.debug:
    # `DataFrame.sample` raises when asked for more rows than exist (without
    # replacement), so cap the debug sample at the size of the frame.
    n_debug_rows = min(DEBUG_SAMPLE_SIZE, len(train_df))
    train_df = train_df.sample(n_debug_rows, random_state=CFG.seed).reset_index(drop=True)

    # Tag the wandb group so debug runs are kept apart; an unset (None) group
    # is treated as "not tagged yet" instead of failing the `in` check.
    wandb_group = CFG.lightning.logger.wandb.group or ""
    if "debug" not in wandb_group:
        CFG.lightning.logger.wandb.group = CFG.experiment_name + "_debug"

logger.debug(f"train shape : {train_df.shape}")
logger.debug(f"train generated label : {train_df['label'].sum()}")

### Imker


In [None]:
class Preprocessor(imker.BasePreProcessor):
    """Feature-building stage of the pipeline.

    Produces one feature frame by placing the raw-feature task's output next
    to TF-IDF features computed on the cleansed input. All three tasks are
    instantiated from ``CFG.imker.tasks``.
    """

    def __init__(self):
        """Instantiate the raw-feature, text-cleansing and TF-IDF tasks from the config."""
        tasks_cfg = CFG.imker.tasks
        self.extract_raw_features_task = instantiate(tasks_cfg.extract_raw_features_task)
        self.text_cleansing_task = instantiate(tasks_cfg.text_cleansing_task)
        self.extract_tfidf_features_task = instantiate(tasks_cfg.extract_tfidf_features_task)

    def forward(self, X, y=None):
        """Build the feature frame for ``X``.

        Args:
            X: Input passed to both the raw-feature and text-cleansing tasks.
            y: Optional target; returned unchanged.

        Returns:
            A ``(features, y)`` tuple where ``features`` is the column-wise
            concatenation of the raw features and the TF-IDF features.
        """
        raw_features = self.extract_raw_features_task(X)
        # TF-IDF is computed on the cleansed text, not on the raw input.
        tfidf_features = self.extract_tfidf_features_task(self.text_cleansing_task(X))

        features = pd.concat([raw_features, tfidf_features], axis=1)
        return features, y


class Splitter(imker.BaseSplitter):
    """Cross-validation splitter configured from ``CFG.cv``."""

    def __init__(self):
        """Wrap the class named by ``CFG.cv._target_`` in an imker task.

        Every other key of ``CFG.cv`` is forwarded as an init parameter.
        """
        cv_class = hydra.utils.get_class(CFG.cv._target_)
        cv_params = {key: value for key, value in CFG.cv.items() if key != "_target_"}
        self.splitter = imker.Task(imker.TaskConfig(task=cv_class, init_params=cv_params))

    def get_n_splits(self):
        """Return the number of splits reported by the wrapped splitter task."""
        return self.splitter.get_n_splits()

    def split(self, X, y=None):
        """Delegate the split of ``X`` (and optional ``y``) to the wrapped task."""
        return self.splitter(X, y)


class Classifier(imker.BaseModel):
    """Model stage holding a logistic-regression task and a kNN task."""

    def __init__(self):
        """Instantiate both classifier tasks from ``CFG.imker.tasks``."""
        tasks_cfg = CFG.imker.tasks
        self.lr = instantiate(tasks_cfg.logistic_regression_task)
        self.knn = instantiate(tasks_cfg.knn_classifier_task)

    def forward(self, X, y=None, proba=False):
        """Run both tasks on the same input.

        Args:
            X: Features passed to each task.
            y: Optional target passed to each task.
            proba: Forwarded to each task as the ``proba`` keyword.

        Returns:
            A dict with the logistic-regression output under ``"lr"`` and the
            kNN output under ``"knn"``.
        """
        lr_output = self.lr(X, y, proba=proba)
        knn_output = self.knn(X, y, proba=proba)
        return {"lr": lr_output, "knn": knn_output}


class Scorer(imker.BaseScorer):
    """Scores every model's predictions with every metric in ``self.metrics``."""

    def calc_metrics(self, y_true: ArrayLike, y_pred: dict[str, ArrayLike]) -> pd.Series:
        """Evaluate each metric for each model.

        Args:
            y_true: Ground-truth labels passed as first argument to each metric.
            y_pred: Mapping from model name to that model's predictions.

        Returns:
            A Series built by concatenating one per-model Series (keyed by
            metric ``__name__``) under the model names.
        """
        # NOTE(review): `self.metrics` is not assigned in this class; presumably
        # it is populated through `pipe.set_metrics` — confirm against imker.
        per_model: dict[str, pd.Series] = {}

        for model_name, scores in y_pred.items():
            # For 2-D predictions only column index 1 is scored
            # (presumably the positive-class probability — TODO confirm).
            scored = scores[:, 1] if np.ndim(scores) == 2 else scores
            per_model[model_name] = pd.Series(
                {metric.__name__: metric(y_true, scored) for metric in self.metrics}
            )

        return pd.concat(per_model)

In [None]:
# Create the pipeline; its repo dir, experiment name and pipeline name all
# come from the composed config.
pipe = imker.Pipeline(
    repo_dir=CFG.paths.output_dir,
    exp_name=CFG.experiment_name,
    pipeline_name=CFG.meta.competition,
)

# Register the stages defined above (the classes themselves are passed in).
pipe.set_preprocessor(Preprocessor)
pipe.set_splitter(Splitter)
pipe.set_model(Classifier)

# Metrics evaluated by the custom Scorer.
pipe.set_metrics(
    metrics=[roc_auc_score, log_loss, opt_acc_score, opt_f1_score],
    scorer=Scorer,
)

In [None]:
# Train on the loaded frame, using its "label" column as the target.
pipe.train(
    X=train_df,
    y=train_df["label"],
)

# Validate on the same frame, requesting probability outputs (proba=True).
val_preds = pipe.validate(
    X=train_df,
    y=train_df["label"],
    proba=True,
)

In [None]:
# Last expression of the cell, so the pipeline's scores are shown as the cell output.
pipe.get_scores()