In [3]:
import os
import re
import hydra
import logging
import pandas as pd
import numpy as np
from pathlib import Path
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig

from src.experiment.utils import assign_fold_index
from sentence_transformers import SentenceTransformer
import category_encoders as ce

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def parse_overrides(raw: str) -> list[str]:
    """Split a comma-separated override string into a list of override entries.

    Surrounding whitespace is stripped from every entry and blank entries are
    dropped, so an empty string or a trailing comma does not produce an empty
    override.

    Args:
        raw: Comma-separated overrides, e.g. ``"experiment=001,seed=1"``.

    Returns:
        The non-empty entries in their original order (possibly an empty list).
    """
    return [entry.strip() for entry in raw.split(",") if entry.strip()]


# Overrides come from the OVERRIDES environment variable; the default selects a single experiment.
OVERRIDES: list[str] = parse_overrides(os.getenv("OVERRIDES", "experiment=001-tabular_v01"))

In [5]:
# OVERRIDES is produced by ``str.split`` and therefore is never None; what can actually
# go wrong is a list with no usable entry (e.g. the environment variable set to "").
if not OVERRIDES or not any(OVERRIDES):
    raise ValueError("OVERRIDES is not set")

# Compose the Hydra config inside the notebook (no @hydra.main entry point here).
with initialize(version_base=None, config_path="../configs"):
    CFG = compose(
        config_name="config.yaml",
        return_hydra_config=True,
        overrides=OVERRIDES,
    )
    HydraConfig.instance().set_config(CFG)  # use HydraConfig for notebook to use hydra job

# Root logger at DEBUG level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Attach a stream handler only once so re-running the cell does not duplicate log lines.
if not logger.handlers:
    handler = logging.StreamHandler()
    logger.addHandler(handler)


INPUT_DIR = Path(CFG.paths.input_dir)

### Load Data


In [6]:
# Read the train/test CSV files from the input directory taken from CFG.paths.input_dir.
train_df = pd.read_csv(INPUT_DIR / "train.csv")
test_df = pd.read_csv(INPUT_DIR / "test.csv")

### CV Split


In [7]:
# Instantiate the cross-validation splitter described by CFG.cv through Hydra.
kfold = hydra.utils.instantiate(CFG.cv)
# train_df is rebound to the helper's return value; "health" is passed as the y column.
# NOTE(review): presumably this adds a fold-index column -- confirm in src.experiment.utils.
train_df = assign_fold_index(train_df=train_df, kfold=kfold, y_col="health")

In [221]:
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureNamesMixin:
    """Mixin exposing a ``feature_names`` property backed by ``_feature_names``.

    The class-level default means the property reads as ``None`` on an instance
    that has not assigned it yet, instead of raising ``AttributeError``.
    """

    # Default until an instance assigns its own value through the setter.
    _feature_names = None

    @property
    def feature_names(self):
        """Return the stored feature names, or ``None`` if none were set."""
        return self._feature_names

    @feature_names.setter
    def feature_names(self, value):
        """Store ``value`` as this instance's feature names."""
        self._feature_names = value


class CreatedAtTransformerV01(BaseEstimator, TransformerMixin):
    """Break a date column into ``year``, ``month`` and ``day`` columns.

    Args:
        col: Name of the column parsed with ``pd.to_datetime``.
    """

    def __init__(self, col: str = "created_at") -> None:
        self.col = col

    @property
    def feature_names(self):
        """Output column names, in the order they are returned."""
        return ["year", "month", "day"]

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return a frame with the calendar parts of ``X[self.col]``."""
        timestamps = pd.to_datetime(X[self.col])
        calendar_df = pd.DataFrame().assign(
            year=timestamps.dt.year,
            month=timestamps.dt.month,
            day=timestamps.dt.day,
        )
        # Select through the property so the column order always matches feature_names.
        return calendar_df[self.feature_names]


class CurbLocationTransformerV01(BaseEstimator, TransformerMixin, FeatureNamesMixin):
    """Encode the curb-location column as 1 (``"OnCurb"``) or 0 (``"OffsetFromCurb"``).

    Values outside the mapping, including missing values, come out as NaN.

    Args:
        col: Name of the column to encode.
    """

    def __init__(self, col: str = "curb_loc") -> None:
        self.col = col
        self.mapping = {"OnCurb": 1, "OffsetFromCurb": 0}
        # Goes through the mixin's setter; filled in on the first transform call.
        self.feature_names = None

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return a one-column frame named ``self.col`` with the mapped codes."""
        encoded = X[self.col].map(self.mapping)
        output_df = pd.DataFrame().assign(**{self.col: encoded})
        if self._feature_names is None:
            self._feature_names = output_df.columns.tolist()
        return output_df


class StreetWidthTransformerV01(BaseEstimator, TransformerMixin):
    """Ordinal-encode a column holding ``"1or2"`` / ``"3or4"`` / ``"4orMore"`` values.

    Note that, despite the class name, the default column is ``"steward"``.
    Values outside the mapping, including missing values, come out as NaN.

    Args:
        col: Name of the column to encode.
    """

    def __init__(self, col: str = "steward") -> None:
        self.col = col
        self.mapping = {"1or2": 0, "3or4": 1, "4orMore": 2}
        # Filled in on the first transform call.
        self.feature_names = None

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return a one-column frame named ``self.col`` with the mapped codes."""
        encoded = X[self.col].map(self.mapping)
        output_df = pd.DataFrame().assign(**{self.col: encoded})
        if self.feature_names is None:
            self.feature_names = output_df.columns.tolist()
        return output_df


class GuardsTransformerV01(BaseEstimator, TransformerMixin):
    """Ordinal-encode the tree-guard column.

    ``"Helpful"`` -> 0, ``"Unsure"`` -> 1, ``"Harmful"`` -> 2. Values outside the
    mapping, including missing values, come out as NaN.

    Args:
        col: Name of the column to encode.
    """

    def __init__(self, col: str = "guards") -> None:
        self.col = col
        self.mapping = {"Helpful": 0, "Unsure": 1, "Harmful": 2}
        # Output column names; filled in on the first transform call.
        self.feature_names = None

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return a one-column frame named ``self.col`` with the mapped codes."""
        output_df = pd.DataFrame()
        output_df = output_df.assign(**{self.col: X[self.col].map(self.mapping)})
        # Record the output columns so code that reads ``feature_names`` after a
        # transform gets a list instead of the initial None.
        if self.feature_names is None:
            self.feature_names = output_df.columns.tolist()
        return output_df


class SidewalkTransformerV01(BaseEstimator, TransformerMixin):
    """Binary-encode the sidewalk column: ``"NoDamage"`` -> 0, ``"Damage"`` -> 1.

    Values outside the mapping, including missing values, come out as NaN.

    Args:
        col: Name of the column to encode.
    """

    def __init__(self, col: str = "sidewalk") -> None:
        self.col = col
        self.mapping = {"NoDamage": 0, "Damage": 1}
        # Output column names; filled in on the first transform call.
        self.feature_names = None

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return a one-column frame named ``self.col`` with the mapped codes."""
        output_df = pd.DataFrame()
        output_df = output_df.assign(**{self.col: X[self.col].map(self.mapping)})
        # Record the output columns so code that reads ``feature_names`` after a
        # transform gets a list instead of the initial None.
        if self.feature_names is None:
            self.feature_names = output_df.columns.tolist()
        return output_df


class UserTypeTransformerV01(BaseEstimator, TransformerMixin):
    """Encode the surveyor type and add a volunteer flag.

    ``"Volunteer"`` -> 0, ``"TreesCount Staff"`` -> 1, ``"NYC Parks Staff"`` -> 2.
    Values outside the mapping, including missing values, come out as NaN in the
    code column and ``False`` in the flag column.

    Args:
        col: Name of the column to encode.
    """

    def __init__(self, col: str = "user_type") -> None:
        self.col = col
        self.mapping = {"Volunteer": 0, "TreesCount Staff": 1, "NYC Parks Staff": 2}
        # Output column names; filled in on the first transform call.
        self.feature_names = None

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return the mapped codes plus a boolean column marking code 0 (volunteers)."""
        output_df = pd.DataFrame()

        user_types = X[self.col].map(self.mapping)
        # NOTE(review): "is_volumeer" looks like a typo for "is_volunteer"; the name is
        # kept because downstream code may already reference this column.
        output_df = output_df.assign(**{self.col: user_types, "is_volumeer": user_types == 0})
        # Record the output columns so code that reads ``feature_names`` after a
        # transform gets a list instead of the initial None.
        if self.feature_names is None:
            self.feature_names = output_df.columns.tolist()
        return output_df


class ProblemsTransformerV01(BaseEstimator, TransformerMixin):
    """Derive a problem count and one-hot token columns from a CamelCase problems string.

    The string is split in front of every uppercase letter (``"BranchLights"`` ->
    ``"Branch"``, ``"Lights"``); each distinct token becomes a 0/1 column. Missing
    values get a dedicated ``"Nan"`` column and a NaN ``num_problems``.

    Args:
        col: Name of the column holding the problems string.

    Attributes learned in ``fit``:
        onehot_columns_: token columns seen during ``fit``; ``transform`` aligns its
            one-hot output to them so train and test frames share the same columns.
    """

    def __init__(self, col: str = "problems") -> None:
        self.col = col
        # Output column names; refreshed on every transform call.
        self.feature_names = None

    def fit(self, X, y=None):
        """Remember which token columns exist in ``X`` and return ``self``."""
        self.onehot_columns_ = self._encode_tokens(X).columns.tolist()
        return self

    def make_num_problems(self, problems: pd.Series | list[str]) -> list:
        """Count the tokens of each string; the sentinel ``"nan"`` yields NaN."""
        num_problems = [len(re.split("(?=[A-Z])", problem)[1:]) if problem != "nan" else np.nan for problem in problems]
        return num_problems

    def _encode_tokens(self, X) -> pd.DataFrame:
        """One-hot encode the tokens found in ``X[self.col]`` (columns in first-seen order)."""
        # "Nan" (capitalised) survives the split as its own token, giving missing
        # values an indicator column.
        texts = X[self.col].fillna("Nan")
        records = [{token: 1 for token in re.split("(?=[A-Z])", text) if token} for text in texts]
        onehot = pd.DataFrame(records, index=X.index)
        return onehot.fillna(0).astype(int)

    def make_problems_onehot(self, X) -> pd.DataFrame:
        """Return the 0/1 token frame for ``X``, aligned to the fitted columns if available."""
        onehot = self._encode_tokens(X)
        fitted_columns = getattr(self, "onehot_columns_", None)
        if fitted_columns is not None:
            # Tokens unseen in fit are dropped; tokens absent from X become all-zero columns.
            onehot = onehot.reindex(columns=fitted_columns, fill_value=0)
        return onehot

    def transform(self, X, y=None):
        """Return ``num_problems`` followed by the one-hot token columns."""
        num_problems = self.make_num_problems(X[self.col].fillna("nan"))
        # Build on X's index so the concat below lines rows up even when the index
        # is not the default RangeIndex.
        output_df = pd.DataFrame({"num_problems": num_problems}, index=X.index)
        problems_df = self.make_problems_onehot(X)
        output_df = pd.concat([output_df, problems_df], axis=1)
        self.feature_names = output_df.columns.tolist()
        return output_df


class NtaTransformerV01(BaseEstimator, TransformerMixin):
    """Ordinal-encode the NTA code together with its two-character prefix and its remainder.

    Args:
        col: Name of the column holding the NTA code string.
    """

    def __init__(self, col: str = "nta") -> None:
        self.col = col
        self.oe = ce.OrdinalEncoder()
        # Output column names; filled in on the first transform call.
        self.feature_names = None

    def fit(self, X, y=None):
        """Fit the ordinal encoder on the parsed NTA frame and return ``self``."""
        df = self.parse_nta(X)
        self.oe.fit(df)
        return self

    def parse_nta(self, X):
        """Return a copy of ``X[[self.col]]`` with ``nta_char`` and ``nta_num`` added.

        ``nta_char`` is the first two characters of the code, ``nta_num`` the rest.
        """
        df = X[[self.col]].copy()
        df["nta_char"] = df[self.col].str[:2]
        df["nta_num"] = df[self.col].str[2:]
        return df

    def transform(self, X, y=None):
        """Return the encoder's output for the parsed NTA frame."""
        df = self.parse_nta(X)
        output_df = self.oe.transform(df)
        # Record the output columns so code that reads ``feature_names`` after a
        # transform gets a list instead of the initial None.
        # Assumes the encoder returns a DataFrame (its default) -- TODO confirm.
        if self.feature_names is None:
            self.feature_names = list(output_df.columns)
        return output_df


class RawTransformer(BaseEstimator, TransformerMixin, FeatureNamesMixin):
    """Pass the selected columns through unchanged.

    Args:
        cols: Column names to select from the input frame.
    """

    # Class-level default: feature names are unknown until the first transform.
    _feature_names = None

    def __init__(self, cols: list[str]) -> None:
        self.cols = cols

    def fit(self, X, y=None):
        """Print a marker and return ``self``; nothing is learned from ``X``."""
        # Presumably a marker for checking when the step is actually re-fitted -- confirm.
        print("cache")
        return self

    def transform(self, X, y=None):
        """Return ``X[self.cols]`` and remember its column names on first use."""
        selected = X[self.cols]
        if self._feature_names is None:
            self._feature_names = selected.columns.tolist()
        return selected


class OrdinalTransformer(BaseEstimator, TransformerMixin):
    """Ordinal-encode a set of columns with ``category_encoders.OrdinalEncoder``.

    Args:
        cols: Column names to encode. ``None`` (the default) hands the whole input
            frame to the encoder. The previous default was the type object
            ``list[str]`` (``=`` written instead of ``:``), which could never be
            used to index a frame.
    """

    def __init__(self, cols: list[str] | None = None) -> None:
        self.cols = cols
        self.oe = ce.OrdinalEncoder()
        # Output column names; filled in on the first transform call.
        self.feature_names = None

    def _select(self, X):
        """Return the part of ``X`` that is handed to the encoder."""
        return X if self.cols is None else X[self.cols]

    def fit(self, X, y=None):
        """Fit the encoder on the selected columns, print a marker and return ``self``."""
        self.oe.fit(self._select(X))
        # Presumably a marker for checking when the step is actually re-fitted -- confirm.
        print("cache")
        return self

    def transform(self, X, y=None):
        """Return the encoder's output for the selected columns."""
        output_df = self.oe.transform(self._select(X))
        # Record the output columns so code that reads ``feature_names`` after a
        # transform gets a list instead of the initial None.
        # Assumes the encoder returns a DataFrame (its default) -- TODO confirm.
        if self.feature_names is None:
            self.feature_names = list(output_df.columns)
        return output_df

In [222]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

In [223]:
class CustomColumnTransformer(ColumnTransformer, TransformerMixin):
    """ColumnTransformer that returns a DataFrame labelled with the sub-transformers' names.

    Every fitted transformer (or, for a Pipeline, its last step that exposes them)
    must provide a ``feature_names`` attribute listing its output columns. The
    returned frame gets a fresh default index.
    """

    @staticmethod
    def _pipeline_feature_names(pipeline) -> list:
        """Return the names of the last pipeline step that exposes ``feature_names``.

        Only the final step's columns reach the pipeline output, so names of earlier
        steps must not be appended as well.
        """
        for _, step in reversed(pipeline.steps):
            names = getattr(step, "feature_names", None)
            if names is not None:
                return list(names)
        raise AttributeError(f"No step of {pipeline!r} exposes feature_names")

    def _custom_feature_names(self) -> list:
        """Collect output column names from the fitted transformers, in output order."""
        feature_names = []
        for trans in self.transformers_:
            # Debug trace of each fitted (name, transformer, columns) entry.
            print(trans)
            estimator = trans[1]
            if isinstance(estimator, Pipeline):
                feature_names.extend(self._pipeline_feature_names(estimator))
            elif isinstance(estimator, TransformerMixin):
                # String entries such as "drop" fail both checks and add no names.
                feature_names.extend(estimator.feature_names)
        return feature_names

    def transform(self, X):
        """Transform ``X`` and wrap the result in a DataFrame with named columns."""
        result = super().transform(X)
        return pd.DataFrame(result, columns=self._custom_feature_names())

    def fit_transform(self, X, y=None):
        """Fit on ``X``, transform it and wrap the result in a DataFrame with named columns."""
        result = super().fit_transform(X, y)
        return pd.DataFrame(result, columns=self._custom_feature_names())

In [224]:
# Column-wise feature extraction: date parts from "created_at" and the encoded "curb_loc";
# every other input column is dropped.
column_transformer = CustomColumnTransformer(
    transformers=[
        ["op1", CreatedAtTransformerV01(), ["created_at"]],
        ["op2", CurbLocationTransformerV01(), ["curb_loc"]],
    ],
    remainder="drop",
)

# Chain the column transformer with three column-selection steps; the last one keeps only
# "curb_loc". `memory` points the Pipeline's transformer cache at ./cache (relative to the
# working directory).
pipe1 = Pipeline(
    steps=[
        ("column_transformer", column_transformer),
        ("raw", RawTransformer(cols=["curb_loc", "year"])),
        ("raw1", RawTransformer(cols=["curb_loc", "year"])),
        ("raw2", RawTransformer(cols=["curb_loc"])),
    ],
    memory="./cache",
)

In [225]:
# Fit every step of pipe1 on the training frame and display the transformed result.
pipe1.fit_transform(train_df)

cache
cache
cache


Unnamed: 0,curb_loc
0,1
1,1
2,1
3,1
4,1
...,...
19979,1
19980,1
19981,1
19982,1


In [162]:
# Fit the column transformer on its own, then display its output for the training frame.
column_transformer.fit(train_df)
column_transformer.transform(train_df)

('op1', CreatedAtTransformerV01(), ['created_at'])
('op2', CurbLocationTransformerV01(), ['curb_loc'])
('remainder', 'drop', [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])
('op1', CreatedAtTransformerV01(), ['created_at'])
('op2', CurbLocationTransformerV01(), ['curb_loc'])
('remainder', 'drop', [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])


Unnamed: 0,year,month,day,curb_loc
0,2015,6,29,1
1,2016,9,21,1
2,2015,9,13,1
3,2016,5,9,1
4,2016,6,24,1
...,...,...,...,...
19979,2016,7,15,1
19980,2016,7,8,1
19981,2015,8,20,1
19982,2016,6,20,1


In [119]:
# Apply the second pipeline step (index 1 of pipe1.steps) directly to the raw training frame.
# NOTE(review): this cell's execution count is lower than the cell defining pipe1, so the
# recorded output may come from an earlier pipeline definition -- re-run to confirm.
pipe1.steps[1][1].transform(train_df)

Unnamed: 0,curb_loc
0,OnCurb
1,OnCurb
2,OnCurb
3,OnCurb
4,OnCurb
...,...
19979,OnCurb
19980,OnCurb
19981,OnCurb
19982,OnCurb


In [7]:
def split_and_lower(s: str, exceptions: list) -> str:
    """Turn a CamelCase / digit-delimited token into lower-case, space-separated words.

    The string is split in front of every uppercase letter or digit, each piece is
    lower-cased and the pieces are joined with single spaces, e.g.
    ``"OnCurb"`` -> ``"on curb"`` and ``"4orMore"`` -> ``"4or more"``.

    Args:
        s: Text to convert.
        exceptions: Values returned untouched (e.g. a null placeholder).

    Returns:
        The converted text, or ``s`` itself when it is listed in ``exceptions``.
    """
    if s in exceptions:
        return s
    pieces = re.split("(?=[A-Z0-9])", s)
    # The split yields an empty first piece only when ``s`` starts with an uppercase
    # letter or digit. Dropping empty pieces (instead of slicing off the first
    # character of the joined text) keeps strings such as "honeylocust" intact.
    return " ".join(piece.lower() for piece in pieces if piece)


def replace_other(s: str) -> str:
    """Replace every ``"Other"`` separator in ``s`` with ``"And"``.

    When ``s`` ends with ``"Other"`` that final occurrence is kept and introduced
    by ``"And"``, e.g. ``"RootOtherTrunkOther"`` -> ``"RootAndTrunkAndOther"``.
    An ``"Other"`` in the middle is replaced, e.g. ``"RootOtherStones"`` ->
    ``"RootAndStones"``.
    """
    segments = s.split("Other")
    # Does not end with "Other": simply glue the segments back together with "And".
    if not s.endswith("Other"):
        return "And".join(segments)
    # Ends with "Other": the last segment is the empty tail after the final "Other",
    # so swap it for a literal "Other" before joining.
    return "And".join(segments[:-1] + ["Other"])


class ColumnsEmbedderV01(BaseEstimator, TransformerMixin):
    """Render each row as a natural-language prompt and embed it with a SentenceTransformer.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.
        max_seq_length: Value assigned to the model's ``max_seq_length``.
        batch_size: Batch size passed to ``model.encode``.
        prompt: ``str.format`` template whose fields are column names; defaults to
            ``base_prompt``.
    """

    def __init__(
        self, model_name: str, max_seq_length: int = 512, batch_size: int = 16, prompt: str | None = None
    ) -> None:
        if prompt is not None:
            self.prompt = prompt
        else:
            self.prompt = self.base_prompt

        # Placeholder substituted for missing values; it is exempt from word splitting.
        self.null_value = "[UNK]"
        self.exeptions = [self.null_value]

        self.model_name = model_name
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        # The model is loaded eagerly when the transformer is constructed.
        self.model = SentenceTransformer(self.model_name)
        self.model.max_seq_length = self.max_seq_length

    @property
    def base_prompt(self):
        """Default prompt template with newlines and tabs removed."""
        # NOTE(review): the first sentence has no trailing space, so after newline removal
        # it is glued to the second one ("...{boroname}.It is ..."). The text is left
        # untouched because the rendered prompts are the keys returned by ``transform``.
        prompts = """
This data, recorded on {created_at}, describes a tree with a diameter at breast height (DBH) of {tree_dbh} inches, located on the {curb_loc} in {boroname}.
It is a {spc_common} ({spc_latin}), commonly found in the area of {nta_name} ({nta}). 
The tree is under the stewardship of {steward} and is protected by a {guards} guard. 
Various factors including {problems} influence its condition, and the surrounding sidewalk condition is described as {sidewalk}. 
This information was collected by a {user_type} from the local area of {zip_city}.
        """
        return prompts.replace("\n", "").replace("\t", "")

    def parse_default(self, texts: list[str] | pd.Series) -> list[str]:
        """Convert CamelCase values to lower-case words.

        Used for:
        - curb_loc
        - steward
        - guards
        - sidewalk
        - user_type
        - spc_common
        - boroname
        """
        texts = [split_and_lower(text, exceptions=self.exeptions) for text in texts]
        return texts

    def parse_problems(self, texts: list[str] | pd.Series) -> list[str]:
        """Rewrite ``"Other"`` separators, then convert to lower-case words.

        Used for:
        - problems
        """
        texts = [split_and_lower(replace_other(text), exceptions=self.exeptions) for text in texts]
        return texts

    def make_prompts(self, df: pd.DataFrame) -> list[str]:
        """Render one prompt per row and return the distinct prompts in first-seen order."""
        source_df = df.fillna(self.null_value)
        # Share the source index: the parsed columns below are assigned positionally
        # (lists) while the raw columns are assigned as index-aligned Series, so both
        # only line up for arbitrary indexes if the target frame uses the same index.
        output_df = pd.DataFrame(index=source_df.index)
        default_parse_targets = [
            "curb_loc",
            "steward",
            "guards",
            "sidewalk",
            "user_type",
            "spc_common",
        ]
        for col in default_parse_targets:
            output_df[col] = self.parse_default(source_df[col])

        output_df["problems"] = self.parse_problems(source_df["problems"])
        raw_cols = ["created_at", "tree_dbh", "spc_latin", "nta", "zip_city", "nta_name", "boroname"]

        for col in raw_cols:
            output_df[col] = source_df[col].astype(str)

        # Format row by row from plain dicts; unlike ``apply(axis=1)`` this also
        # handles an empty frame.
        prompts = [self.prompt.format(**row) for row in output_df.to_dict(orient="records")]
        # dict.fromkeys removes duplicates while keeping first-appearance order.
        return list(dict.fromkeys(prompts))

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return a dict mapping each distinct prompt to its embedding."""
        prompts = self.make_prompts(X)
        embeddings = self.model.encode(prompts, show_progress_bar=True, batch_size=self.batch_size)
        return dict(zip(prompts, embeddings))