In [None]:
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import DefaultDict

import cupy as cp
import hydra
import imker
import numpy as np
import pandas as pd
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig
from hydra.utils import instantiate
from imker.types import ArrayLike

In [None]:
# Hydra overrides for this run, read from the OVERRIDES environment variable as a
# comma-separated string; falls back to the 004-tabular experiment when unset.
OVERRIDES: list[str] = os.environ.get("OVERRIDES", "experiment=004-tabular").split(sep=",")

In [None]:
# Fail fast when no usable override was supplied. Splitting an empty environment
# value yields [""] rather than None, so blank entries are checked explicitly;
# the None check is kept for backward compatibility.
if OVERRIDES is None or not any(override.strip() for override in OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra configuration in-process (no @hydra.main entry point in a notebook).
with initialize(version_base=None, config_path="../../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Configure the root logger so messages from every module are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Attach a stream handler only once so re-running the cell does not duplicate output.
if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)

INPUT_DIR = Path(CFG.paths.input_dir)

logger.info(f"start {OVERRIDES} 🚀")

### Load Data


In [None]:
# Read the essays to score from the configured input directory.
test_essays_path = INPUT_DIR / "test_essays.csv"
test_df = pd.read_csv(test_essays_path)

### Imker


In [None]:
class Preprocessor(imker.BasePreProcessor):
    """Build SVD-reduced TF-IDF and count features from the ``text`` column.

    All tasks are instantiated from ``CFG.imker.tasks``. The TF-IDF and count
    branches each get their own vectorizer and their own SVD task instance.

    NOTE(review): the task attributes and output column names must match the
    definition used when the pipeline was fitted. An earlier version of
    ``forward`` ran the TF-IDF branch twice; artifacts fitted with that version
    need to be re-fitted with this definition — confirm against the training
    notebook.
    """

    def __init__(self):
        self.text_cleansing_task = instantiate(CFG.imker.tasks.text_cleansing_task)
        self.tfidf_vectorize_task_01 = instantiate(CFG.imker.tasks.tfidf_vectorize_task_01)
        self.count_vectorize_task_01 = instantiate(CFG.imker.tasks.count_vectorize_task_01)

        # Same SVD config, but one independent task instance per feature branch.
        self.svd_decompose_task_01_tfidf_01 = instantiate(CFG.imker.tasks.svd_decompose_task_01)
        self.svd_decompose_task_01_count_01 = instantiate(CFG.imker.tasks.svd_decompose_task_01)

    @staticmethod
    def ngram_range_to_tuple(cfg):
        """Replace ``cfg.config.init_params`` with ``tuple(cfg.config.init_params)`` and return ``cfg``.

        NOTE(review): the name suggests only an ``ngram_range`` entry should be
        converted, but the whole ``init_params`` object is replaced. This helper
        is not called anywhere in this notebook — confirm intent before using it.
        """
        cfg.config.init_params = tuple(cfg.config.init_params)
        return cfg

    def to_dataframe(self, X, feature_name=""):
        """Wrap a 2-D array in a DataFrame with columns ``f_<feature_name>_<index>``.

        The index is zero-padded to three digits; the ``f_`` prefix is what
        ``Classifier.forward`` uses to select feature columns.
        """
        return pd.DataFrame(X, columns=[f"f_{feature_name}_{i:03}" for i in range(X.shape[1])])

    def _svd_features(self, texts, vectorize_task, svd_task, feature_name):
        """Vectorize ``texts``, densify on the host, reduce with SVD and label the columns."""
        # The vectorizer output is densified and copied to host memory before SVD.
        vecs = cp.asnumpy(vectorize_task(pd.Series(texts)).toarray())
        vecs = svd_task(vecs)
        return self.to_dataframe(vecs, feature_name=feature_name)

    def forward(self, X, y=None):
        """Return ``(features, y)`` where ``features`` concatenates both SVD feature sets.

        Args:
            X: frame with a ``text`` column.
            y: optional target, passed through unchanged.
        """
        cleansed_texts = self.text_cleansing_task(X["text"])

        # tfidf
        x_tfidf_vecs = self._svd_features(
            cleansed_texts,
            self.tfidf_vectorize_task_01,
            self.svd_decompose_task_01_tfidf_01,
            feature_name="tfidf_svd",
        )

        # count — uses the count vectorizer, its own SVD task, and its own column
        # prefix so the two feature sets do not share column labels.
        x_count_vecs = self._svd_features(
            cleansed_texts,
            self.count_vectorize_task_01,
            self.svd_decompose_task_01_count_01,
            feature_name="count_svd",
        )

        x_out = pd.concat([x_tfidf_vecs, x_count_vecs], axis=1)
        y_out = y
        return x_out, y_out


class Splitter(imker.BaseSplitter):
    """Cross-validation splitter built from the ``CFG.cv`` configuration node."""

    def __init__(self):
        # ``_target_`` names the splitter class; every other key is a constructor argument.
        cv_class = hydra.utils.get_class(CFG.cv._target_)
        cv_params = {}
        for key, value in CFG.cv.items():
            if key == "_target_":
                continue
            cv_params[key] = value

        task_config = imker.TaskConfig(task=cv_class, init_params=cv_params)
        self.splitter = imker.Task(task_config)

    def get_n_splits(self):
        """Return the number of splits reported by the wrapped splitter task."""
        n_splits = self.splitter.get_n_splits()
        return n_splits

    def split(self, X, y=None):
        """Delegate to the wrapped splitter task with ``X`` and ``y``."""
        folds = self.splitter(X, y)
        return folds


class Classifier(imker.BaseModel):
    """Model stage holding a single KNN classifier task configured via ``CFG``."""

    def __init__(self):
        self.knn_01 = instantiate(CFG.imker.tasks.knn_classifier_task_01)

    def forward(self, X, y=None, proba=False):
        """Run the KNN task on the ``f_``-prefixed columns of ``X``.

        Returns a dict keyed by model name (``"knn_01"``) holding the task output.
        """
        # Only columns produced by the preprocessor (prefixed with "f_") are features.
        feature_columns = list(filter(lambda name: name.startswith("f_"), X.columns))
        knn_output = self.knn_01(X[feature_columns], y, proba=proba)
        return {"knn_01": knn_output}


class Scorer(imker.BaseScorer):
    """Evaluate every metric in ``self.metrics`` for each model's predictions."""

    def calc_metrics(self, y_true: ArrayLike, y_pred: dict[str, ArrayLike]) -> pd.Series:
        """Return a Series of scores indexed by ``(model name, metric name)``.

        Args:
            y_true: ground-truth labels.
            y_pred: mapping from model name to its predictions.
        """
        scores_by_model: dict[str, pd.Series] = {}

        for model_name, prediction in y_pred.items():
            # 2-D predictions are reduced to their second column
            # (presumably the positive-class probability — confirm).
            scores = prediction[:, 1] if np.ndim(prediction) == 2 else prediction

            scores_by_model[model_name] = pd.Series(
                {metric.__name__: metric(y_true, scores) for metric in self.metrics}
            )

        # Concatenating a dict uses its keys as the outer index level.
        return pd.concat(scores_by_model)

In [None]:
# Assemble the pipeline from the component classes defined above; the
# repository location and names come from the Hydra configuration.
pipe = imker.Pipeline(
    preprocessor=Preprocessor,
    splitter=Splitter,
    model=Classifier,
    pipeline_name=CFG.meta.competition,
    exp_name=CFG.experiment_name,
    repo_dir=CFG.paths.output_dir,
)

In [None]:
# Run inference with probability outputs enabled.
test_preds = pipe.inference(X_test=test_df, proba=True)

# Keep only the second column of the KNN output
# (presumably the positive-class probability — confirm).
knn_01_output = test_preds.knn_01
test_predictions = knn_01_output[:, 1]

### Make Submission


In [None]:
# Fill the sample submission template with the predictions.
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test_essays.csv — confirm.
submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")
submission_df["generated"] = test_predictions

# Create the output directory first so to_csv does not fail on a fresh checkout.
submission_dir = Path(CFG.paths.submission_dir)
submission_dir.mkdir(parents=True, exist_ok=True)

submission_df.to_csv(submission_dir / "submission.csv", index=False)
submission_df