In [1]:
import polars as pl
from pathlib import Path


dataset_path = Path("../dataset/")
unnecessary_columns = ["posting_date", "id"]


def _load_split(csv_name: str) -> pl.DataFrame:
    """Read one CSV from ``dataset_path`` and drop ``unnecessary_columns``."""
    frame = pl.read_csv(dataset_path / csv_name)
    return frame.drop(unnecessary_columns)


# Load the train / validation / test splits.
train_df = _load_split("projectA_vehicle_train.csv")
val_df = _load_split("projectA_vehicle_val.csv")
test_df = _load_split("projectA_vehicle_test.csv")

In [8]:
import optuna
import lightgbm as lgb
from src.features.preprocess import Preprocessor
from src.config.preprocess import PreprocessorConfig
from src.suggest_params.preprocess import suggest_preprocessor_config
from src.suggest_params.anomaly_detection import suggest_lgb_params
from sklearn.metrics import roc_auc_score


def objective(
    trial: optuna.Trial,
) -> float:
    """Optuna objective: fit a LightGBM anomaly classifier and return validation AUC.

    A row is labelled anomalous when its ``price`` exceeds 40,000. The model is
    fitted on the preprocessed training split and scored on a validation subset
    made of a 10% sample (fixed seed) of the non-anomalous rows plus every
    anomalous row.

    Args:
        trial: Optuna trial used to sample the LightGBM and preprocessor
            parameters.

    Returns:
        ROC AUC of the model predictions on the sampled validation subset.

    Raises:
        AssertionError: If predictions and labels do not have the same shape.
    """
    # Draw this trial's parameters from Optuna.
    lgb_params = suggest_lgb_params(trial)
    preprocessor_config = suggest_preprocessor_config(trial, task="anomaly_detection")
    preprocessor = Preprocessor(**preprocessor_config.to_dict())
    # The preprocessed test split is not used during tuning.
    train_df_preprocessed, val_df_preprocessed, _ = preprocessor.run(
        train_df, val_df, test_df
    )

    # Anomaly annotation: a row is anomalous when its price exceeds 40,000.
    anomaly_expr = (pl.col("price") > 40_000).alias("is_anomaly")
    train_df_preprocessed = train_df_preprocessed.with_columns(anomaly_expr)
    val_df_preprocessed = val_df_preprocessed.with_columns(anomaly_expr)

    # Sampling: keep 10% of the non-anomalous validation rows (fixed seed) and
    # every anomalous row.
    is_anomaly = pl.col("is_anomaly")
    val_df_sampled = pl.concat(
        [
            val_df_preprocessed.filter(~is_anomaly).sample(fraction=0.1, seed=1),
            val_df_preprocessed.filter(is_anomaly),
        ],
        how="vertical",
    )

    # `price` defines the label, so it is removed together with the label itself.
    # Each feature matrix is converted to pandas once and reused below.
    non_feature_columns = ["price", "is_anomaly"]
    train_features = train_df_preprocessed.drop(non_feature_columns).to_pandas()
    val_features = val_df_sampled.drop(non_feature_columns).to_pandas()

    # LightGBM lets an `n_estimators` entry in params override `num_boost_round`
    # (and warns about it). Resolve the round count here so the value that is
    # actually used is visible; 1000 applies only when the suggested params
    # carry no `n_estimators`.
    # NOTE(review): `suggest_lgb_params` is not visible here — confirm whether it
    # returns `n_estimators`.
    train_params = dict(lgb_params)
    num_boost_round = train_params.pop("n_estimators", 1000)

    # Train the LightGBM model.
    train_set = lgb.Dataset(
        train_features,
        train_df_preprocessed["is_anomaly"].to_pandas(),
    )
    val_set = lgb.Dataset(
        val_features,
        val_df_sampled["is_anomaly"].to_pandas(),
        reference=train_set,
    )
    model = lgb.train(
        train_params,
        train_set,
        num_boost_round=num_boost_round,
        valid_sets=[val_set],
    )

    # Predict on the sampled validation subset.
    val_pred = model.predict(val_features)
    val_label = val_df_sampled["is_anomaly"].to_numpy()

    # Check that predictions and labels line up before scoring.
    assert val_pred.shape == val_label.shape, (
        "Shape mismatch between predictions and labels"
    )

    # AUC on the sampled validation subset is the value Optuna maximizes.
    return roc_auc_score(val_label, val_pred)


# Seed the sampler so the hyperparameter search can be reproduced on a rerun.
# TPESampler is the sampler `create_study` uses when none is given, so only the
# seed changes.
# NOTE(review): full reproducibility also requires `Preprocessor` to be
# deterministic for a given config — confirm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)


[I 2025-07-23 22:25:58,004] A new study created in memory with name: no-name-fcda2cea-dd7b-4f68-bbec-732c3a38a7d0
[I 2025-07-23 22:25:58,662] Trial 0 finished with value: 0.9057945254177036 and parameters: {'num_leaves': 75, 'max_depth': 42, 'learning_rate': 0.13803184853923406, 'n_estimators': 92, 'min_child_samples': 64, 'feature_fraction': 0.8022762170655268, 'bagging_fraction': 0.5099965488954892, 'bagging_freq': 10, 'smoothing': 0.1038081695616358, 'min_samples_leaf': 6, 'noise_level': 0.5128790617747492}. Best is trial 0 with value: 0.9057945254177036.
[I 2025-07-23 22:25:58,799] Trial 1 finished with value: 0.9057945254177034 and parameters: {'num_leaves': 58, 'max_depth': 1, 'learning_rate': 0.04297304278022884, 'n_estimators': 186, 'min_child_samples': 43, 'feature_fraction': 0.6506136961473634, 'bagging_fraction': 0.4844232851259148, 'bagging_freq': 4, 'smoothing': 0.6370461898768259, 'min_samples_leaf': 2, 'noise_level': 0.643264331664785}. Best is trial 0 with value: 0.9057

In [None]:
# save best preprocessor config as yaml
from src.config.preprocess import TargetEncoderConfig
from src.suggest_params.preprocess import (
    create_condition_config,
    create_cylinder_config,
    create_fuel_config,
    create_transmission_config,
    create_drive_config,
    create_manufacturer_config,
    create_paint_color_config,
    create_state_config,
    create_type_config,
)


def create_preprocessor_config_from_study(study: optuna.Study) -> PreprocessorConfig:
    """Build a ``PreprocessorConfig`` from the best parameters of ``study``.

    The best ``smoothing``, ``min_samples_leaf`` and ``noise_level`` values form
    a single ``TargetEncoderConfig`` that is handed to every per-column
    ``create_*_config`` factory. ``price_lower_bound`` is 0.0,
    ``price_upper_bound`` is infinity and ``remove_outliers_val`` is ``False``.

    Args:
        study: Study whose ``best_params`` hold the tuned target-encoder values.

    Returns:
        The preprocessor configuration assembled from those values.
    """
    best = study.best_params
    target_encoder_config = TargetEncoderConfig(
        smoothing=best["smoothing"],
        min_samples_leaf=best["min_samples_leaf"],
        noise_level=best["noise_level"],
    )

    # Every column encoder is created from the same target-encoder settings.
    return PreprocessorConfig(
        condition_encoder_config=create_condition_config(target_encoder_config),
        cylinder_encoder_config=create_cylinder_config(target_encoder_config),
        fuel_encoder_config=create_fuel_config(target_encoder_config),
        transmission_encoder_config=create_transmission_config(target_encoder_config),
        drive_encoder_config=create_drive_config(target_encoder_config),
        manufacturer_encoder_config=create_manufacturer_config(target_encoder_config),
        paint_color_encoder_config=create_paint_color_config(target_encoder_config),
        state_encoder_config=create_state_config(target_encoder_config),
        type_encoder_config=create_type_config(target_encoder_config),
        price_lower_bound=0.0,
        price_upper_bound=float("inf"),
        remove_outliers_val=False,
    )


def create_lgb_params_from_study(study: optuna.Study) -> dict:
    """Assemble the LightGBM parameter dict from the best parameters of ``study``.

    The fixed settings (binary objective, AUC metric, verbosity -1, GBDT
    boosting) come first, followed by the tuned values copied from
    ``study.best_params`` under the same keys.

    Args:
        study: Study whose ``best_params`` hold the tuned LightGBM values.

    Returns:
        A new dict with the fixed settings and the tuned values.

    Raises:
        KeyError: If a tuned key is missing from ``study.best_params``.
    """
    tuned_keys = (
        "num_leaves",
        "max_depth",
        "learning_rate",
        "n_estimators",
        "min_child_samples",
        "feature_fraction",
        "bagging_fraction",
        "bagging_freq",
    )
    best = study.best_params

    lgb_params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }
    # Insertion order is kept: fixed settings first, tuned keys in tuple order.
    lgb_params.update((key, best[key]) for key in tuned_keys)
    return lgb_params


def save_preprocessor_config(study: optuna.Study, output_path: Path) -> None:
    """Save the best preprocessor configuration of ``study`` via its ``to_yaml``.

    Args:
        study: Study the configuration is derived from.
        output_path: Directory to write into; created (with parents) if missing.
            The file is named ``best_preprocessor_config_anomaly.yaml``.
    """
    output_path.mkdir(parents=True, exist_ok=True)

    yaml_file = output_path / "best_preprocessor_config_anomaly.yaml"
    create_preprocessor_config_from_study(study).to_yaml(yaml_file)


def _format_yaml_scalar(value: object) -> str:
    """Render ``value`` as the text that follows ``key: `` in the output file.

    ``str()`` prints some floats without a decimal point (e.g. ``1e-05``).
    YAML 1.1 loaders such as PyYAML only resolve a scalar to a float when the
    mantissa contains a dot, so that text would be read back as a string. A
    ``.0`` is inserted before the exponent in that case; every other value is
    written exactly as ``str()`` renders it.
    """
    text = str(value)
    if isinstance(value, float) and "e" in text and "." not in text:
        text = text.replace("e", ".0e", 1)
    return text


def save_lgb_params(study: optuna.Study, output_path: Path) -> None:
    """Write the best LightGBM parameters of ``study`` as ``key: value`` lines.

    Args:
        study: Study the parameters are derived from.
        output_path: Directory to write into; created (with parents) if missing.
            The file is named ``best_lgb_params_anomaly.yaml``.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    best_lgb_params = create_lgb_params_from_study(study)

    lines = [
        f"{key}: {_format_yaml_scalar(value)}\n"
        for key, value in best_lgb_params.items()
    ]
    # Explicit encoding keeps the file independent of the platform default.
    with open(
        output_path / "best_lgb_params_anomaly.yaml", "w", encoding="utf-8"
    ) as f:
        f.writelines(lines)


# Persist the best preprocessor config and LightGBM params in the same directory.
params_output_dir = Path("../params/")
save_preprocessor_config(study, params_output_dir)
save_lgb_params(study, params_output_dir)

In [None]:
# Rebuild the best preprocessor configuration from the study.
best_preprocessor_config = create_preprocessor_config_from_study(study)
# Rebuild the best LightGBM parameters from the study.
best_lgb_params = create_lgb_params_from_study(study)

preprocessor = Preprocessor(**best_preprocessor_config.to_dict())
train_df_preprocessed, val_df_preprocessed, test_df_preprocessed = preprocessor.run(
    train_df, val_df, test_df
)

# Anomaly annotation: a row is anomalous when its price exceeds 40,000.
anomaly_expr = (pl.col("price") > 40_000).alias("is_anomaly")
train_df_preprocessed = train_df_preprocessed.with_columns(anomaly_expr)
val_df_preprocessed = val_df_preprocessed.with_columns(anomaly_expr)

# `price` defines the label, so it is removed together with the label itself.
# Each feature matrix is converted to pandas once and reused for training and
# prediction.
non_feature_columns = ["price", "is_anomaly"]
train_features = train_df_preprocessed.drop(non_feature_columns).to_pandas()
val_features = val_df_preprocessed.drop(non_feature_columns).to_pandas()

# `best_lgb_params` contains `n_estimators`, which LightGBM treats as an alias
# of the boosting-round count: when present in params it overrides
# `num_boost_round` (with a warning). Pass the tuned value explicitly so the
# number of rounds actually trained is visible; `best_lgb_params` itself is
# left untouched.
num_boost_round = best_lgb_params["n_estimators"]
train_params = {
    key: value for key, value in best_lgb_params.items() if key != "n_estimators"
}

# Train the LightGBM model on the full training split.
train_set = lgb.Dataset(
    train_features,
    train_df_preprocessed["is_anomaly"].to_pandas(),
)
val_set = lgb.Dataset(
    val_features,
    val_df_preprocessed["is_anomaly"].to_pandas(),
    reference=train_set,
)
model = lgb.train(
    train_params,
    train_set,
    num_boost_round=num_boost_round,
    valid_sets=[val_set],
)

# Predictions and labels for the train and validation splits.
train_pred = model.predict(train_features)
train_label = train_df_preprocessed["is_anomaly"].to_numpy()

val_pred = model.predict(val_features)
val_label = val_df_preprocessed["is_anomaly"].to_numpy()

[0.00683592 0.00697974 0.00696662 0.00849757 0.01011643 0.08395109
 0.00681766 0.01455772 0.00686763 0.01319871 0.6413734  0.01040518
 0.00683592 0.00681766 0.15880052 0.00695444 0.02793188 0.0068197
 0.00681766 0.00844863 0.00968382 0.00681766 0.00773584 0.00697884
 0.00903487 0.00881187 0.00734197 0.00782372 0.06707708 0.0203202
 0.00864273 0.0068197  0.00699861 0.00783512 0.01082331 0.0076656
 0.0079003  0.0068142  0.00684323 0.01485116 0.01242797 0.00843759
 0.00681766 0.00684323 0.02118944 0.02473158 0.01200031 0.00738487
 0.00683592 0.0070683  0.0068197  0.00714996 0.00821736 0.00804266
 0.00776673 0.00710389 0.00697649 0.04395076 0.00983148 0.07615679
 0.03118235 0.00687506 0.00740416 0.12140152 0.04063284 0.00775243
 0.00681766 0.05130535 0.00729735 0.00919157 0.00694554 0.19137058
 0.00693587 0.00703314 0.00699368 0.01962918 0.0068197  0.00684073
 0.08925947 0.00916722 0.00742226 0.03231415 0.00716335 0.03836413
 0.06838192 0.00682723 0.07466982 0.00970741 0.01141968 0.0068176