In [None]:
import os
import hydra
import logging
import pandas as pd
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig

from src.experiment.utils import assign_fold_index

In [None]:
# Hydra overrides for this run, read from the comma-separated OVERRIDES
# environment variable. An unset or empty variable falls back to the baseline
# experiment. Whitespace around entries and blank entries (e.g. from
# "a=1, b=2,") are dropped so that no blank string is passed on as an override.
OVERRIDES: list[str] = [
    override.strip()
    for override in (os.getenv("OVERRIDES") or "experiment=001-tabular_v01").split(",")
    if override.strip()
]

In [None]:
# OVERRIDES is built with str.split(), so it is always a list and an `is None`
# test can never fire. Reject an empty list or blank entries instead.
if not OVERRIDES or any(not override.strip() for override in OVERRIDES):
    raise ValueError(f"OVERRIDES is not set or contains blank entries: {OVERRIDES!r}")

# Compose the config by hand and register it with HydraConfig.
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Attach a stream handler only when none exists, so re-running this cell does
# not stack handlers and duplicate every log line.
if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


# Directory that holds the input CSV files, taken from the composed config.
INPUT_DIR = Path(CFG.paths.input_dir)

### Load Data


In [None]:
# Read the train and test tables from the configured input directory.
train_df = pd.read_csv(INPUT_DIR.joinpath("train.csv"))
test_df = pd.read_csv(INPUT_DIR.joinpath("test.csv"))

### CV Split


In [None]:
# Build the cross-validation splitter described by the `cv` section of the config.
kfold = hydra.utils.instantiate(CFG.cv)
# Hand the training frame, the splitter and the "health" column to the project
# helper. Presumably it adds a per-row fold index — confirm in src.experiment.utils.
# NOTE(review): train_df is rebound in place, so later cells see the returned frame.
train_df = assign_fold_index(train_df=train_df, kfold=kfold, y_col="health")

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class CreatedAtTransformerV01(BaseEstimator, TransformerMixin):
    """Split a timestamp column into calendar-part features.

    Parameters
    ----------
    col : str, default="created_at"
        Name of the column in ``X`` that is parsed with ``pd.to_datetime``.
    """

    def __init__(self, col: str = "created_at") -> None:
        self.col = col

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a frame with ``year``, ``month`` and ``day`` columns."""
        calendar = pd.to_datetime(X[self.col]).dt
        return pd.DataFrame().assign(
            year=calendar.year,
            month=calendar.month,
            day=calendar.day,
        )


class CurbLocationTransformerV01(BaseEstimator, TransformerMixin):
    """Encode the curb-location category as an integer code.

    ``"OnCurb"`` becomes 1 and ``"OffsetFromCurb"`` becomes 0. Values without a
    mapping entry (including missing values) come out as NaN.

    Parameters
    ----------
    col : str, default="curb_loc"
        Column of ``X`` to encode; the output column keeps the same name.
    """

    def __init__(self, col: str = "curb_loc") -> None:
        self.col = col
        self.mapping = {"OnCurb": 1, "OffsetFromCurb": 0}

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a single-column frame holding the encoded values."""
        encoded = X[self.col].map(self.mapping)
        return pd.DataFrame().assign(**{self.col: encoded})


class StreetWidthTransformerV01(BaseEstimator, TransformerMixin):
    """Encode an ordinal count category as an integer code.

    ``"1or2"`` -> 0, ``"3or4"`` -> 1, ``"4orMore"`` -> 2. Values without a
    mapping entry (including missing values) come out as NaN.

    NOTE(review): despite the class name, the default column is ``"steward"``
    and the keys look like steward counts — confirm whether the name is stale.

    Parameters
    ----------
    col : str, default="steward"
        Column of ``X`` to encode; the output column keeps the same name.
    """

    def __init__(self, col: str = "steward") -> None:
        self.col = col
        self.mapping = {"1or2": 0, "3or4": 1, "4orMore": 2}

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a single-column frame holding the encoded values."""
        encoded = X[self.col].map(self.mapping)
        return pd.DataFrame().assign(**{self.col: encoded})


class GuardsTransformerV01(BaseEstimator, TransformerMixin):
    """Encode the tree-guard assessment as an integer code.

    ``"Helpful"`` -> 0, ``"Unsure"`` -> 1, ``"Harmful"`` -> 2. Values without a
    mapping entry (including missing values) come out as NaN.

    Parameters
    ----------
    col : str, default="guards"
        Column of ``X`` to encode; the output column keeps the same name.
    """

    def __init__(self, col: str = "guards") -> None:
        self.col = col
        self.mapping = {"Helpful": 0, "Unsure": 1, "Harmful": 2}

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a single-column frame holding the encoded values."""
        encoded = X[self.col].map(self.mapping)
        return pd.DataFrame().assign(**{self.col: encoded})


class SidewalkTransformerV01(BaseEstimator, TransformerMixin):
    """Encode the sidewalk condition as an integer flag.

    ``"NoDamage"`` -> 0 and ``"Damage"`` -> 1. Values without a mapping entry
    (including missing values) come out as NaN.

    Parameters
    ----------
    col : str, default="sidewalk"
        Column of ``X`` to encode; the output column keeps the same name.
    """

    def __init__(self, col: str = "sidewalk") -> None:
        self.col = col
        self.mapping = {"NoDamage": 0, "Damage": 1}

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a single-column frame holding the encoded values."""
        encoded = X[self.col].map(self.mapping)
        return pd.DataFrame().assign(**{self.col: encoded})


class UserTypeTransformerV01(BaseEstimator, TransformerMixin):
    """Encode the surveyor type and add a volunteer indicator.

    ``"Volunteer"`` -> 0, ``"TreesCount Staff"`` -> 1, ``"NYC Parks Staff"`` -> 2.
    Values without a mapping entry (including missing values) come out as NaN.

    Parameters
    ----------
    col : str, default="user_type"
        Column of ``X`` to encode; the encoded output column keeps the same name.
    """

    def __init__(self, col: str = "user_type") -> None:
        self.col = col
        self.mapping = {"Volunteer": 0, "TreesCount Staff": 1, "NYC Parks Staff": 2}

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the encoded column plus a boolean ``is_volumeer`` column."""
        codes = X[self.col].map(self.mapping)
        features = {
            self.col: codes,
            # Column name spelling is kept exactly as-is: it is part of the
            # output schema. The flag is True where the code equals 0.
            "is_volumeer": codes == 0,
        }
        return pd.DataFrame().assign(**features)

In [None]:
import re


def split_and_lower(s: str, exceptions: list) -> str:
    """Turn a CamelCase token into lower-case, space-separated words.

    The string is cut in front of every upper-case ASCII letter or digit, each
    piece is lower-cased and the pieces are joined with single spaces, e.g.
    ``"OffsetFromCurb"`` -> ``"offset from curb"``.

    Args:
        s: Text to convert.
        exceptions: Values that are returned unchanged (e.g. a null marker).

    Returns:
        The converted text, or ``s`` itself when it is listed in ``exceptions``.
    """
    if s in exceptions:
        return s
    pieces = re.split("(?=[A-Z0-9])", s)
    # A cut at position 0 yields an empty first piece. Skip empty pieces rather
    # than blindly slicing off one character, which used to eat the first
    # letter of strings starting in lower case ("honeylocust" -> "oneylocust").
    return " ".join(piece.lower() for piece in pieces if piece)


def replace_other(s: str) -> str:
    """Rewrite the ``"Other"`` markers of a concatenated label.

    Every ``"Other"`` inside the string is replaced by ``"And"``. When the
    string ends with ``"Other"``, the result ends with ``"AndOther"`` instead,
    e.g. ``"RootOtherTrunkOther"`` -> ``"RootAndTrunkAndOther"``.
    """
    segments = s.split("Other")
    if not s.endswith("Other"):
        # The string does not end with 'Other': just glue the pieces together.
        return "And".join(segments)
    # The string ends with 'Other': the split leaves an empty last piece, which
    # is dropped before the closing "AndOther" is appended.
    return "And".join(segments[:-1]) + "AndOther"


class ColumnsEmbedderV01(BaseEstimator, TransformerMixin):
    """Render each row of the tree table as a natural-language prompt.

    Parameters
    ----------
    model_name : str | None, default=None
        Identifier of the embedding model. It is only stored; nothing in this
        class reads it, so prompts can be built without it.
    prompt : str | None, default=None
        ``str.format`` template whose fields are column names. ``None`` selects
        :attr:`base_prompt`.
    """

    def __init__(self, model_name: str | None = None, prompt: str | None = None) -> None:
        self.prompt = prompt if prompt is not None else self.base_prompt

        # Marker substituted for missing values. It is listed as an exception so
        # that the CamelCase splitter returns it untouched.
        self.null_value = "[UNK]"
        self.exeptions = [self.null_value]  # (sic) attribute name kept for compatibility

        self.model_name = model_name

    @property
    def base_prompt(self):
        """Default single-line template; sentences are separated by one space."""
        # Written as adjacent literals so the separators are explicit. Stripping
        # the newlines of a triple-quoted block used to glue the first two
        # sentences together and leave trailing blanks at the end.
        return (
            "This data, recorded on {created_at}, describes a tree with a diameter at breast height (DBH) of {tree_dbh} inches, located on the {curb_loc} in {boroname}. "
            "It is a {spc_common} ({spc_latin}), commonly found in the area of {nta_name} ({nta}). "
            "The tree is under the stewardship of {steward} and is protected by a {guards} guard. "
            "Various factors including {problems} influence its condition, and the surrounding sidewalk condition is described as {sidewalk}. "
            "This information was collected by a {user_type} from the local area of {zip_city}."
        )

    def parse_default(self, texts: list[str] | pd.Series) -> list[str]:
        """Split CamelCase values into lower-case words.

        Applied by :meth:`make_prompts` to ``curb_loc``, ``steward``, ``guards``,
        ``sidewalk``, ``user_type`` and ``spc_common``. Values listed in
        ``self.exeptions`` are returned unchanged.
        """
        return [split_and_lower(text, exceptions=self.exeptions) for text in texts]

    def parse_problems(self, texts: list[str] | pd.Series) -> list[str]:
        """Like :meth:`parse_default`, after rewriting ``"Other"`` markers.

        Applied by :meth:`make_prompts` to the ``problems`` column.
        """
        return [split_and_lower(replace_other(text), exceptions=self.exeptions) for text in texts]

    def make_prompts(self, df: pd.DataFrame) -> list[str]:
        """Format one prompt per row and return the distinct prompts.

        Missing values are replaced by ``self.null_value`` first. Values are
        combined positionally, so a frame with a non-default index (for example
        a filtered subset) is rendered correctly.

        Args:
            df: Frame containing every column referenced below.

        Returns:
            The distinct prompts, in order of first appearance.

        Raises:
            KeyError: If a required column is missing from ``df`` or the
                template references a field that is not produced here.
        """
        default_parse_targets = [
            "curb_loc",
            "steward",
            "guards",
            "sidewalk",
            "user_type",
            "spc_common",
        ]
        raw_cols = ["created_at", "tree_dbh", "spc_latin", "nta", "zip_city", "nta_name", "boroname"]

        # Only the columns that feed the template are copied and filled.
        source_df = df[default_parse_targets + ["problems"] + raw_cols].fillna(self.null_value)

        fields: dict[str, list[str]] = {
            col: self.parse_default(source_df[col]) for col in default_parse_targets
        }
        fields["problems"] = self.parse_problems(source_df["problems"])
        for col in raw_cols:
            # Plain lists avoid index alignment against the positional columns above.
            fields[col] = source_df[col].astype(str).tolist()

        names = list(fields)
        rendered = [
            self.prompt.format(**dict(zip(names, row)))
            for row in zip(*fields.values())
        ]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(rendered))

    def fit(self, X, y=None):
        """Build the prompts for ``X`` and keep them in ``self.prompts_``.

        Returns:
            ``self``.
        """
        self.prompts_ = self.make_prompts(X)

        return self

In [None]:
# Only the prompt text is built here and make_prompts() never reads model_name,
# so the constructor argument is passed explicitly as None. Calling the
# constructor with no arguments raised a TypeError for the missing parameter.
prompt = ColumnsEmbedderV01(model_name=None).make_prompts(train_df)

In [None]:
# Show the first generated prompt as a quick sanity check of the template.
print(prompt[0])