In [None]:
import os
import hydra
import logging
import pandas as pd
import warnings
import sklearn
import rootutils
import re
import joblib
import numpy as np
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig
import category_encoders as ce
from xgboost import XGBModel
from sklearn.base import clone
from sklearn.utils.class_weight import compute_sample_weight

rootutils.setup_root(search_from="../", indicator=".project-root", pythonpath=True)

from src.experiment.utils import assign_fold_index, make_uid, visualize_feature_importance
from src.experiment.base import BaseFeatureExtractor
from src.experiment.runner import run_extractors
from src.experiment.feature.tabular import AggregatedFeatureExtractor
from src.experiment.metrics import macro_f1_from_proba

In [None]:
# Hydra overrides for this run: a comma-separated string read from the OVERRIDES
# environment variable, falling back to the "experiment=001-tabular_v01" override.
# The result of str.split is always a list (never None).
OVERRIDES: list[str] = os.getenv("OVERRIDES", "experiment=001-tabular_v01").split(",")

In [None]:
# OVERRIDES comes from str.split, so it can never be None; what can actually go wrong
# is an empty environment variable (-> [""]) or a stray comma (-> a blank entry).
# Reject those up front instead of letting Hydra fail on an unparseable override.
if not OVERRIDES or any(not override.strip() for override in OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config by hand (there is no @hydra.main entry point in a notebook).
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG; a stream handler is attached only once so that re-running
# this cell does not duplicate every log line.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


# NOTE: silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")
INPUT_DIR = Path(CFG.paths.input_dir)

### Load Data


In [None]:
# Read the train / test tables from INPUT_DIR and drop the "Unnamed: 0" column
# (presumably an index column written out when the CSVs were saved — TODO confirm).
train_df = pd.read_csv(INPUT_DIR / "train.csv").drop(columns=["Unnamed: 0"])
test_df = pd.read_csv(INPUT_DIR / "test.csv").drop(columns=["Unnamed: 0"])

### CV Split


In [None]:
# Build the CV splitter described by CFG.cv and let assign_fold_index attach fold
# information to the training rows, using "health" as the target column.
kfold = hydra.utils.instantiate(CFG.cv)
train_df = assign_fold_index(train_df=train_df, kfold=kfold, y_col="health")

# Test rows get the sentinel fold -1; the "data" column lets the two frames be
# separated again after they are concatenated for feature extraction.
test_df["fold"] = -1
train_df["data"] = "train"
test_df["data"] = "test"

### Feature Engineering


In [None]:
class CreatedAtFeatureExtractorV1(BaseFeatureExtractor):
    """Calendar features (year, month, day) parsed from the ``created_at`` column."""

    def transform(self, input_df):
        """Return a frame with ``created_at__year`` / ``__month`` / ``__day`` columns."""
        created = pd.to_datetime(input_df["created_at"])
        calendar_parts = {
            "created_at__year": created.dt.year,
            "created_at__month": created.dt.month,
            "created_at__day": created.dt.day,
        }
        return pd.DataFrame().assign(**calendar_parts)


class CurbLocationFeatureExtractorV1(BaseFeatureExtractor):
    """Binary encoding of ``curb_loc``: ``OnCurb`` -> 1, ``OffsetFromCurb`` -> 0."""

    def __init__(self):
        self.mapping = {"OnCurb": 1, "OffsetFromCurb": 0}

    def transform(self, input_df):
        """Return a single ``curb_loc_binary`` column; values outside the mapping become NaN."""
        encoded = input_df["curb_loc"].map(self.mapping)
        return pd.DataFrame({"curb_loc_binary": encoded.tolist()})


class StreetWidthFeatureExtractorV1(BaseFeatureExtractor):
    """Ordinal encoding of the ``steward`` column (``1or2`` < ``3or4`` < ``4orMore``).

    NOTE(review): the class is named after street width, but the column it reads
    and the feature it emits are both about ``steward`` — confirm the naming.
    """

    def __init__(self):
        self.mapping = {"1or2": 0, "3or4": 1, "4orMore": 2}

    def transform(self, input_df):
        """Return a single ``steward_rank`` column; values outside the mapping become NaN."""
        ranked = input_df["steward"].map(self.mapping)
        return pd.DataFrame({"steward_rank": ranked.tolist()})


class GuardsFeatureExtractorV1(BaseFeatureExtractor):
    """Ordinal encoding of ``guards``: ``Helpful`` -> 0, ``Unsure`` -> 1, ``Harmful`` -> 2."""

    def __init__(self):
        self.mapping = {"Helpful": 0, "Unsure": 1, "Harmful": 2}

    def transform(self, input_df):
        """Return a single ``guards_rank`` column; values outside the mapping become NaN."""
        ranked = input_df["guards"].map(self.mapping)
        return pd.DataFrame({"guards_rank": ranked.tolist()})


class SidewalkFeatureExtractorV1(BaseFeatureExtractor):
    """Binary encoding of ``sidewalk``: ``NoDamage`` -> 0, ``Damage`` -> 1."""

    def __init__(self) -> None:
        self.mapping = {"NoDamage": 0, "Damage": 1}

    def transform(self, input_df):
        """Return a single ``sidewalk_binary`` column; values outside the mapping become NaN."""
        encoded = input_df["sidewalk"].map(self.mapping)
        return pd.DataFrame({"sidewalk_binary": encoded.tolist()})


class UserTypeFeatureExtractorV1(BaseFeatureExtractor):
    """Encodes ``user_type`` as an integer rank plus an ``is_volunteer`` flag."""

    def __init__(self) -> None:
        self.mapping = {"Volunteer": 0, "TreesCount Staff": 1, "NYC Parks Staff": 2}

    def transform(self, input_df):
        """Return ``user_type_rank`` and ``is_volunteer`` columns.

        The rank column is assigned as a Series, so the output keeps the index of
        ``input_df``. Values outside the mapping get a NaN rank and a False flag.
        """
        ranks = input_df["user_type"].map(self.mapping)

        output_df = pd.DataFrame()
        output_df["user_type_rank"] = ranks
        # rank 0 is "Volunteer" in self.mapping
        output_df["is_volunteer"] = ranks.eq(0).tolist()
        return output_df


class ProblemsFeatureExtractorV1(BaseFeatureExtractor):
    """Features from the ``problems`` column, a string of concatenated CamelCase tokens.

    Produces the number of tokens per row plus one indicator column per distinct token.
    """

    def make_num_problems_df(self, problems: pd.Series | list[str]) -> pd.DataFrame:
        """Count the capital-letter-delimited tokens of each entry.

        Entries equal to the string ``"nan"`` (the missing-value placeholder used by
        ``transform``) yield NaN. The returned frame has a default RangeIndex.
        """
        # Splitting before every capital letter leaves a leading "" for strings that
        # start with a capital, hence the [1:].
        num_problems = [
            len(re.split("(?=[A-Z])", problem)[1:]) if problem != "nan" else np.nan for problem in problems
        ]
        return pd.DataFrame({"num_problems": num_problems})

    def make_problems_onehot_df(self, input_df) -> pd.DataFrame:
        """Return 0/1 indicator columns (prefixed ``problem_is_``), one per token.

        Missing values are replaced by ``"Nan"`` and therefore get their own
        ``problem_is_Nan`` indicator. The result keeps the index of ``input_df``;
        columns appear in order of first occurrence.
        """
        records = []
        for problem in input_df["problems"].fillna("Nan"):
            flags = {element: 1 for element in re.split("(?=[A-Z])", problem) if element}
            # The original tested `"Other" in <row Series>`, which checks index labels
            # and was therefore always False; test the string value instead.
            if "Other" in problem:
                flags["Other"] = 1
            records.append(flags)

        # Build the frame once instead of growing columns cell by cell.
        onehot_df = pd.DataFrame(records, index=input_df.index)
        return onehot_df.fillna(0).astype(int).add_prefix("problem_is_")

    def transform(self, input_df):
        """Return the token count and the indicator columns side by side."""
        features_num_problems_df = self.make_num_problems_df(input_df["problems"].fillna("nan"))
        # Align on the input index so that concat(axis=1) pairs rows positionally even
        # when input_df does not carry a default RangeIndex.
        features_num_problems_df.index = input_df.index
        features_problems_onehot_df = self.make_problems_onehot_df(input_df)
        output_df = pd.concat([features_num_problems_df, features_problems_onehot_df], axis=1)
        return output_df


class NtaFeatureExtractorV1(BaseFeatureExtractor):
    """Ordinal-encodes the ``nta`` code together with its first two characters and the remainder."""

    def __init__(self) -> None:
        self.oe = ce.OrdinalEncoder()

    def parse_nta(self, input_df):
        """Return ``nta`` plus ``nta_char`` (first two characters) and ``nta_num`` (the rest)."""
        return input_df[["nta"]].assign(
            nta_char=lambda frame: frame["nta"].str[:2],
            nta_num=lambda frame: frame["nta"].str[2:],
        )

    def fit(self, input_df):
        """Fit the ordinal encoder on the three parsed columns."""
        self.oe.fit(self.parse_nta(input_df))
        return self

    def transform(self, X, y=None):
        """Encode the parsed columns and prefix the result with ``oe_``; ``y`` is unused."""
        parsed = self.parse_nta(X)
        return self.oe.transform(parsed).add_prefix("oe_")


class RawTransformer(BaseFeatureExtractor):
    """Passes the selected columns through unchanged apart from a cast to float32."""

    def __init__(self, cols: list[str]) -> None:
        self.cols = cols

    def transform(self, input_df):
        """Return a new frame holding ``self.cols`` cast to ``np.float32``."""
        selected = input_df[self.cols]
        return selected.astype(np.float32)


class OrdinalFeatureExtractor(BaseFeatureExtractor):
    """Ordinal-encodes the given columns after casting them to ``str``."""

    def __init__(self, cols: list[str] | None = None) -> None:
        """Store the columns to encode.

        Args:
            cols: names of the columns to encode. Required; the ``None`` default
                exists only to keep the parameter optional at the signature level.

        Raises:
            ValueError: if ``cols`` is not provided.
        """
        # The original signature read `cols=list[str]`, which made the *type object*
        # the default value instead of annotating the parameter; a call without
        # `cols` then failed later with an obscure indexing error.
        if cols is None:
            raise ValueError("cols must be provided")
        self.cols = cols
        self.oe = ce.OrdinalEncoder()

    def fit(self, input_df):
        """Fit the encoder on the string-cast columns."""
        self.oe.fit(input_df[self.cols].astype(str))
        return self

    def transform(self, input_df):
        """Return the encoded columns, prefixed with ``oe_``."""
        output_df = self.oe.transform(input_df[self.cols].astype(str)).add_prefix("oe_")
        return output_df

In [None]:
# Every extractor is run on the concatenation of train and test (see run_extractors below).
feature_extractors = [
    CreatedAtFeatureExtractorV1(),
    CurbLocationFeatureExtractorV1(),
    StreetWidthFeatureExtractorV1(),
    GuardsFeatureExtractorV1(),
    SidewalkFeatureExtractorV1(),
    UserTypeFeatureExtractorV1(),
    ProblemsFeatureExtractorV1(),
    NtaFeatureExtractorV1(),
    RawTransformer(
        cols=[
            "tree_dbh",
            "cb_num",
            "st_senate",
            "st_assem",
            "cncldist",
            "borocode",
        ]
    ),
    OrdinalFeatureExtractor(
        cols=[
            "spc_common",
            "spc_latin",
            "cncldist",
            "boro_ct",
            "boroname",
            "zip_city",
        ]
    ),
    # tree_dbh statistics per creation year; the year column is produced by the parent extractor.
    AggregatedFeatureExtractor(
        group_keys=["created_at__year"],
        group_values=["tree_dbh"],
        agg_methods=["min", "max", "std", "mean", "median"],
        extr_agg_methods=["z-score"],
        parents=[CreatedAtFeatureExtractorV1()],
    ),
    # tree_dbh statistics per categorical key and per key pair.
    *[
        AggregatedFeatureExtractor(
            group_keys=keys,
            group_values=["tree_dbh"],
            agg_methods=["min", "max", "std", "mean", "median"],
            extr_agg_methods=["z-score"],
        )
        for keys in [
            ["sidewalk"],
            ["problems"],
            ["zip_city"],
            ["steward"],
            ["cb_num"],
            ["boroname"],
            ["sidewalk", "spc_common"],
            ["problems", "spc_common"],
            ["zip_city", "spc_common"],
            ["cb_num", "spc_common"],
            ["boroname", "spc_common"],
            ["steward", "spc_common"],
            # Was ["steward", "steward"]: a duplicated key that only repeats the
            # single-key ["steward"] grouping. "sidewalk" is the one key missing from
            # the steward pairs, so this is almost certainly the intended combination.
            ["steward", "sidewalk"],
            ["steward", "problems"],
            ["steward", "zip_city"],
            ["steward", "cb_num"],
            ["steward", "boroname"],
        ]
    ],
]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
raw_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
raw_feature_df = run_extractors(
    input_df=raw_df,
    extractors=feature_extractors,
    dirpath=Path(CFG.paths.output_dir) / "features",
    fit=True,
    cache=True,
)

# Put the raw columns next to the extracted features, then split back by the "data" marker.
raw_feature_df = pd.concat([raw_df, raw_feature_df], axis=1)
train_feature_df = raw_feature_df.query("data == 'train'").reset_index(drop=True)
test_feature_df = raw_feature_df.query("data == 'test'").reset_index(drop=True)

# Model inputs are the columns whose name starts with "f_"
# (presumably the prefix run_extractors gives to extracted features — TODO confirm).
feature_columns = [col for col in train_feature_df.columns if col.startswith("f_")]

In [None]:
def train_cv_tabular_v1(
    df: pd.DataFrame,
    estimator,
    feature_columns: list[str],
    target_columns: str | list[str],
    fit_params: dict,
    output_dir: Path,
    train_folds: list[int] | None = None,
    overwrite: bool = False,
    use_sample_weight: bool = False,
):
    """Fit one clone of ``estimator`` per CV fold and return the fitted clones.

    For each fold ``i`` the rows with ``fold != i`` are used for training and the rows
    with ``fold == i`` are passed to ``fit`` as ``eval_set``. Each fitted estimator is
    dumped to ``output_dir / f"fold{i}" / "<ClassName>_<uid>.pkl"``, where the uid is
    derived from the unfitted clone's ``__dict__`` via ``make_uid``.

    Args:
        df: frame containing ``feature_columns``, ``target_columns`` and a ``fold`` column.
        estimator: unfitted estimator; it is cloned for every fold and never fitted itself.
        feature_columns: names of the model input columns.
        target_columns: target column name, or a list of names.
        fit_params: extra keyword arguments for ``fit``. The dict is not modified.
        output_dir: directory under which the per-fold models are stored.
        train_folds: folds to train; defaults to every distinct value of ``df["fold"]``.
        overwrite: when False, an existing model file is loaded instead of refitting.
        use_sample_weight: when True, balanced sample weights are computed for the
            training rows and for the evaluation set of every fold.

    Returns:
        A list with one fitted (or loaded) estimator per entry of ``train_folds``.
    """
    # The per-fold sample weights used to be written straight into `fit_params`,
    # leaking the last fold's weights into the caller's dict. Work on copies instead.
    base_fit_params = dict(fit_params or {})
    estimators = []

    if train_folds is None:
        train_folds = sorted(df["fold"].unique())

    for i_fold in train_folds:
        logger.info(f"start training fold={i_fold} 🚀 ")
        fit_estimator = clone(estimator)

        output_df_fold = output_dir / f"fold{i_fold}"
        output_df_fold.mkdir(exist_ok=True, parents=True)

        estimator_uid = make_uid(fit_estimator.__dict__)
        estimator_name = f"{fit_estimator.__class__.__name__}_{estimator_uid}"
        estimator_path = output_df_fold / f"{estimator_name}.pkl"

        # NOTE(review): the file name depends on the estimator only, not on the data or
        # the feature list, so a cached model may be stale if those changed.
        if estimator_path.exists() and (not overwrite):
            logger.info(f"skip fitting in fold{i_fold}")
            # joblib.load unpickles the file: only point output_dir at trusted locations.
            fit_estimator = joblib.load(estimator_path)
            estimators.append(fit_estimator)
            continue

        # split train and valid
        train_df = df.query(f"fold != {i_fold}").reset_index(drop=True)
        valid_df = df.query(f"fold == {i_fold}").reset_index(drop=True)
        tr_x, tr_y = train_df[feature_columns], train_df[target_columns]
        va_x, va_y = valid_df[feature_columns], valid_df[target_columns]

        logger.info(f"estimator : {estimator_name}")

        fold_fit_params = dict(base_fit_params)
        if use_sample_weight:
            fold_fit_params["sample_weight"] = compute_sample_weight(class_weight="balanced", y=tr_y)
            fold_fit_params["sample_weight_eval_set"] = [compute_sample_weight(class_weight="balanced", y=va_y)]

        fit_estimator.fit(X=tr_x, y=tr_y, eval_set=[(va_x, va_y)], **fold_fit_params)
        estimators.append(fit_estimator)

        joblib.dump(fit_estimator, estimator_path)

    return estimators


def predict_cv_tabular_v1(
    df: pd.DataFrame,
    estimators: list,
    feature_columns: list[str],
    train_folds: list[int] | None = None,
    test: bool = False,
    result_columns: list[str] | None = None,
):
    if result_columns is None:
        result_columns = [col for col in df.columns if col not in feature_columns]

    if train_folds is None:
        train_folds = sorted(df["fold"].unique())

    def _predict_i(df, i_fold, estimator):
        logger.info(f"fold{i_fold} predict : test={test}")
        if not test:
            df = df.query(f"fold == {i_fold}").reset_index(drop=True)

        va_x = df[feature_columns]
        va_pred = estimator.predict(va_x)
        i_result_df = df[result_columns].assign(pred=va_pred.tolist())
        return i_result_df

    valid_result_df = pd.concat(
        [_predict_i(df, i_fold, estimator) for i_fold, estimator in zip(train_folds, estimators)],
        axis=0,
        ignore_index=True,
    )
    return valid_result_df

In [None]:
# Constructor arguments for the XGBoost estimator: a 3-class soft-probability objective
# with early stopping after 100 rounds without improvement on the evaluation set.
model_params = {
    # https://xgboost.readthedocs.io/en/stable/parameter.html#:~:text=Learning%20Task%20Parameters-,%EF%83%81,-Specify%20the%20learning
    "device": "gpu",
    "n_estimators": 10000,
    "max_depth": 8,
    "learning_rate": 0.01,
    "objective": "multi:softprob",
    "eval_metric": "auc",
    "tree_method": "hist",
    "colsample_bytree": 0.2,
    "subsample": 0.2,
    "random_state": 8823,
    "early_stopping_rounds": 100,
    "num_class": 3,
    "importance_type": "gain",
}

# Extra keyword arguments forwarded to estimator.fit by train_cv_tabular_v1.
fit_params = {"verbose": 100}

estimator = XGBModel(**model_params)
model_output_dir = Path(CFG.paths.output_dir) / "models"

# overwrite=True refits every fold even if a saved model exists; note that the target
# is passed as a one-element list, so the label is selected as a one-column frame.
trained_estimators = train_cv_tabular_v1(
    df=train_feature_df,
    estimator=estimator,
    feature_columns=feature_columns,
    target_columns=["health"],
    fit_params=fit_params,
    output_dir=model_output_dir,
    overwrite=True,
    use_sample_weight=True,
)

# Out-of-fold predictions: each estimator predicts the rows of its own fold.
valid_result_df = predict_cv_tabular_v1(
    df=train_feature_df,
    estimators=trained_estimators,
    feature_columns=feature_columns,
)

# "pred" holds one list per row (presumably the class probabilities expected by
# macro_f1_from_proba — TODO confirm against its implementation).
val_score = macro_f1_from_proba(y_true=valid_result_df["health"], y_pred=valid_result_df["pred"].tolist())
logger.info(f"macro f1 score: {val_score}")

In [None]:
# Feature importance across the fold estimators, limited to 50 features via top_n
# (the estimators were configured with importance_type="gain").
fig, importance_df = visualize_feature_importance(
    estimators=trained_estimators,
    feature_columns=feature_columns,
    top_n=50,
)