In [None]:
import polars as pl
import numpy as np

import lightgbm as lgb
from src.features.preprocess import Preprocessor
from src.metrics import rmse
from src.config.default_config import get_default_config

train_df = pl.read_csv("../dataset/projectA_vehicle_train.csv").drop(
    "posting_date", "id"
)
val_df = pl.read_csv("../dataset/projectA_vehicle_val.csv").drop("posting_date", "id")
test_df = pl.read_csv("../dataset/projectA_vehicle_test.csv").drop("posting_date", "id")


# regression model with outlier removed dataset


In [None]:
def train_regression_model(
    train_df, val_df, test_df
) -> tuple[lgb.Booster, pl.Series, pl.Series, pl.Series]:
    # デフォルト設定を取得
    config = get_default_config()

    # 異常検知用に価格の境界を調整（外れ値を除去しない）_config,
    config.price_lower_bound = 1_000
    config.price_upper_bound = 40_000
    config.remove_outliers_val = True

    # Preprocessorを作成
    preprocessor = Preprocessor(**config.to_dict())

    train_df_preprocessed, val_df_preprocessed, test_df_preprocessed = preprocessor.run(
        train_df, val_df, test_df
    )

    # dataset
    train_set = lgb.Dataset(
        train_df_preprocessed.drop("price").to_pandas(),
        train_df_preprocessed["price"].to_pandas(),
    )
    val_set = lgb.Dataset(
        val_df_preprocessed.drop("price").to_pandas(),
        val_df_preprocessed["price"].to_pandas(),
        reference=train_set,
    )

    # モデルの学習
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": 31,
        "learning_rate": 0.05,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbose": -1,
    }

    # モデルの学習
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
    )

    # 予測
    y_pred_train = model.predict(train_df_preprocessed.drop("price").to_pandas())
    y_pred_val = model.predict(val_df_preprocessed.drop("price").to_pandas())
    y_pred_test = model.predict(test_df_preprocessed.drop("price").to_pandas())

    # 評価
    train_rmse = rmse(train_df_preprocessed["price"].to_numpy(), y_pred_train)
    val_rmse = rmse(val_df_preprocessed["price"].to_numpy(), y_pred_val)
    test_rmse = rmse(test_df_preprocessed["price"].to_numpy(), y_pred_test)

    print(f"Train RMSE: {train_rmse:,.0f}")
    print(f"Validation RMSE: {val_rmse:,.0f}")
    print(f"Test RMSE: {test_rmse:,.0f}")

    return model


In [100]:
regression_model = train_regression_model(train_df, val_df, test_df)

Train RMSE: 1,313
Validation RMSE: 4,358
Test RMSE: 7,508


# anomaly detection model

In [104]:
from sklearn.metrics import precision_score, recall_score


def train_anomaly_detection_model(
    train_df, val_df, test_df
) -> tuple[lgb.Booster, pl.Series, pl.Series, pl.Series]:
    # デフォルト設定を取得
    config = get_default_config()

    # 異常検知用に価格の境界を調整（外れ値を除去しない）_config,
    config.price_lower_bound = 0
    config.price_upper_bound = float("inf")  # 無限大に設定
    config.remove_outliers_val = False

    # Preprocessorを作成
    preprocessor = Preprocessor(**config.to_dict())
    print(f"Price bounds: {config.price_lower_bound} - {config.price_upper_bound}")
    print(f"Remove outliers: {config.remove_outliers_val}")

    # preprocess the data again
    train_df_preprocessed, val_df_preprocessed, test_df_preprocessed = preprocessor.run(
        train_df, val_df, test_df
    )

    # anomaly anotation
    anomaly_expr = (pl.col("price") > 40_000).alias(
        "is_anomaly"
    )  # 価格が40,000を超える場合は外れ値とする

    train_df_preprocessed = train_df_preprocessed.with_columns(anomaly_expr)
    val_df_preprocessed = val_df_preprocessed.with_columns(anomaly_expr)
    test_df_preprocessed = test_df_preprocessed.with_columns(anomaly_expr)

    # モデルの学習
    train_set = lgb.Dataset(
        train_df_preprocessed.drop("is_anomaly", "price").to_pandas(),
        train_df_preprocessed["is_anomaly"].to_pandas(),
    )
    val_set = lgb.Dataset(
        val_df_preprocessed.drop("is_anomaly", "price").to_pandas(),
        val_df_preprocessed["is_anomaly"].to_pandas(),
        reference=train_set,
    )

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
        "num_leaves": 31,
        "learning_rate": 0.05,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbose": -1,
    }

    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
    )

    # 予測
    y_pred_train = model.predict(
        train_df_preprocessed.drop("is_anomaly", "price").to_pandas()
    )
    y_pred_val = model.predict(
        val_df_preprocessed.drop("is_anomaly", "price").to_pandas()
    )
    y_pred_test = model.predict(
        test_df_preprocessed.drop("is_anomaly", "price").to_pandas()
    )

    # 閾値
    threshold = 0.2
    print(f"Threshold for anomaly detection: {threshold}")

    # precision
    train_precision = precision_score(
        train_df_preprocessed["is_anomaly"].to_numpy(),
        (y_pred_train > threshold).astype(int),
    )
    val_precision = precision_score(
        val_df_preprocessed["is_anomaly"].to_numpy(),
        (y_pred_val > threshold).astype(int),
    )
    test_precision = precision_score(
        test_df_preprocessed["is_anomaly"].to_numpy(),
        (y_pred_test > threshold).astype(int),
    )
    print("=============Precision=============")
    print(f"Train Precision: {train_precision:.2f}")
    print(f"Validation Precision: {val_precision:.2f}")
    print(f"Test Precision: {test_precision:.2f}")

    # recall
    train_recall = recall_score(
        train_df_preprocessed["is_anomaly"].to_numpy(),
        (y_pred_train > threshold).astype(int),
    )
    val_recall = recall_score(
        val_df_preprocessed["is_anomaly"].to_numpy(),
        (y_pred_val > threshold).astype(int),
    )
    test_recall = recall_score(
        test_df_preprocessed["is_anomaly"].to_numpy(),
        (y_pred_test > threshold).astype(int),
    )
    print("=============Recall=============")
    print(f"Train Recall: {train_recall:.2f}")
    print(f"Validation Recall: {val_recall:.2f}")
    print(f"Test Recall: {test_recall:.2f}")

    return model


anomaly_detection_model = train_anomaly_detection_model(train_df, val_df, test_df)

Price bounds: 0 - inf
Remove outliers: False
Threshold for anomaly detection: 0.2
Train Precision: 1.00
Validation Precision: 0.52
Test Precision: 0.80
Train Recall: 1.00
Validation Recall: 0.38
Test Recall: 0.57


In [105]:
# preprocess the data again
train_df_preprocessed, val_df_preprocessed, test_df_preprocessed = preprocessor.run(
    train_df, val_df, test_df
)

# anomaly anotation
anomaly_expr = (pl.col("price") > 40_000).alias(
    "is_anomaly"
)  # 価格が40,000を超える場合は外れ値とする

train_df_preprocessed = train_df_preprocessed.with_columns(anomaly_expr)
val_df_preprocessed = val_df_preprocessed.with_columns(anomaly_expr)
test_df_preprocessed = test_df_preprocessed.with_columns(anomaly_expr)


# モデルの学習
train_set = lgb.Dataset(
    train_df_preprocessed.drop("is_anomaly", "price").to_pandas(),
    train_df_preprocessed["is_anomaly"].to_pandas(),
)
val_set = lgb.Dataset(
    val_df_preprocessed.drop("is_anomaly", "price").to_pandas(),
    val_df_preprocessed["is_anomaly"].to_pandas(),
    reference=train_set,
)

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1,
}

model = lgb.train(
    params,
    train_set,
    num_boost_round=1000,
    valid_sets=[val_set],
)

# Emsamble

In [89]:
constant_prediction = train_df.filter(pl.col("price") > 40_000)["price"].quantile(0.9)
print(f"Constant prediction for anomalies: {constant_prediction:,.0f}")

Constant prediction for anomalies: 69,500


In [None]:
alpha = 0.5  # 定数の重み付け係数
weight = y_pred_val**alpha

ensemble_prediction_train = weight * y_pred_train + (1 - weight) * constant_prediction
ensemble_prediction_val = weight * y_pred_val + (1 - weight) * constant_prediction

# rmse
train_rmse = rmse(
    train_df_preprocessed["price"].to_pandas(),
    ensemble_prediction_train,
)
val_rmse = rmse(
    val_df_preprocessed["price"].to_pandas(),
    ensemble_prediction_val,
)

In [None]:
def main():
    # Load the datasets
    train_df = pl.read_csv("../dataset/projectA_vehicle_train.csv").drop(
        "posting_date", "id"
    )
    val_df = pl.read_csv("../dataset/projectA_vehicle_val.csv").drop(
        "posting_date", "id"
    )
    test_df = pl.read_csv("../dataset/projectA_vehicle_test.csv").drop(
        "posting_date", "id"
    )

    # data for regression
    train_df_reg, val_df_reg, test_df_reg = preprocess_for_regression(
        train_df, val_df, test_df
    )
    price_pred_train, price_pred_val, price_pred_test = run_regression_model(
        train_df_reg, val_df_reg, test_df_reg
    )

    # data for anomaly detection
    train_df_anomaly, val_df_anomaly, test_df_anomaly = (
        preprocess_for_anomaly_detection(train_df, val_df, test_df)
    )
    anomaly_pred_train, anomaly_pred_val, anomaly_pred_test = (
        run_anomaly_detection_model(train_df_anomaly, val_df_anomaly, test_df_anomaly)
    )

    constant_prediction = 60_000
    alpha = 0.5  # 定数の重み付け係数

    ensembled_prediction_train = ensemble_predictions(
        price_pred_train, anomaly_pred_train, constant_prediction, alpha
    )
    ensembled_prediction_val = ensemble_predictions(
        price_pred_val, anomaly_pred_val, constant_prediction, alpha
    )
    ensembled_prediction_test = ensemble_predictions(
        price_pred_test, anomaly_pred_test, constant_prediction, alpha
    )

    # rmse
    train_rmse = rmse(
        train_df_reg["price"].to_numpy(),
        ensembled_prediction_train,
    )
    val_rmse = rmse(
        val_df_reg["price"].to_numpy(),
        ensembled_prediction_val,
    )
    test_rmse = rmse(
        test_df_reg["price"].to_numpy(),
        ensembled_prediction_test,
    )

    print(f"Ensembled Train RMSE: {train_rmse:,.0f}")
    print(f"Ensembled Validation RMSE: {val_rmse:,.0f}")
    print(f"Ensembled Test RMSE: {test_rmse:,.0f}")


def preprocess_for_regression(
    train_df: pl.DataFrame,
    val_df: pl.DataFrame,
    test_df: pl.DataFrame,
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
    config = get_default_config()

    # 異常検知用に価格の境界を調整（外れ値を除去しない）_config,
    config.price_lower_bound = 0
    config.price_upper_bound = float("inf")  # 無限大に設定
    config.remove_outliers_val = False

    # Preprocessorを作成
    preprocessor = Preprocessor(**config.to_dict())

    train_df_preprocessed, val_df_preprocessed, test_df_preprocessed = preprocessor.run(
        train_df, val_df, test_df
    )

    return train_df_preprocessed, val_df_preprocessed, test_df_preprocessed


def preprocess_for_anomaly_detection(
    train_df: pl.DataFrame,
    val_df: pl.DataFrame,
    test_df: pl.DataFrame,
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
    config = get_default_config()

    # 異常検知用に価格の境界を調整（外れ値を除去しない）_config,
    config.price_lower_bound = 0
    config.price_upper_bound = float("inf")  # 無限大に設定
    config.remove_outliers_val = False

    # Preprocessorを作成
    preprocessor = Preprocessor(**config.to_dict())
    print(f"Price bounds: {config.price_lower_bound} - {config.price_upper_bound}")
    print(f"Remove outliers: {config.remove_outliers_val}")

    # preprocess the data again
    train_df_preprocessed, val_df_preprocessed, test_df_preprocessed = preprocessor.run(
        train_df, val_df, test_df
    )

    # anomaly anotation
    anomaly_expr = (pl.col("price") > 40_000).alias(
        "is_anomaly"
    )  # 価格が40,000を超える場合は外れ値とする

    train_df_preprocessed = train_df_preprocessed.with_columns(anomaly_expr).drop(
        "price"
    )
    val_df_preprocessed = val_df_preprocessed.with_columns(anomaly_expr).drop("price")
    test_df_preprocessed = test_df_preprocessed.with_columns(anomaly_expr).drop("price")

    return train_df_preprocessed, val_df_preprocessed, test_df_preprocessed


def remove_outliers(
    df: pl.DataFrame,
    lower_bound: float,
    upper_bound: float,
) -> pl.DataFrame:
    return df.filter(
        (pl.col("price") >= lower_bound) & (pl.col("price") <= upper_bound)
    )


def run_regression_model(
    train_df: pl.DataFrame,
    val_df: pl.DataFrame,
    test_df: pl.DataFrame,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    train_df_filtered = remove_outliers(train_df, lower_bound=1_000, upper_bound=40_000)
    val_df_filtered = remove_outliers(val_df, lower_bound=1_000, upper_bound=40_000)

    # dataset
    train_set = lgb.Dataset(
        train_df_filtered.drop("price").to_pandas(),
        train_df_filtered["price"].to_pandas(),
    )
    val_set = lgb.Dataset(
        val_df_filtered.drop("price").to_pandas(),
        val_df_filtered["price"].to_pandas(),
        reference=train_set,
    )

    # モデルの学習
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": 31,
        "learning_rate": 0.05,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbose": -1,
    }

    # モデルの学習
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
    )

    # 予測
    y_pred_train = model.predict(train_df.drop("price").to_pandas())
    y_pred_val = model.predict(val_df.drop("price").to_pandas())
    y_pred_test = model.predict(test_df.drop("price").to_pandas())

    # 評価
    train_rmse = rmse(train_df["price"].to_numpy(), y_pred_train)
    val_rmse = rmse(val_df["price"].to_numpy(), y_pred_val)
    test_rmse = rmse(test_df["price"].to_numpy(), y_pred_test)

    print(f"Train RMSE: {train_rmse:,.0f}")
    print(f"Validation RMSE: {val_rmse:,.0f}")
    print(f"Test RMSE: {test_rmse:,.0f}")

    return y_pred_train, y_pred_val, y_pred_test


def run_anomaly_detection_model(
    train_df: pl.DataFrame,
    val_df: pl.DataFrame,
    test_df: pl.DataFrame,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    # モデルの学習
    train_set = lgb.Dataset(
        train_df.drop("is_anomaly").to_pandas(),
        train_df["is_anomaly"].to_pandas(),
    )
    val_set = lgb.Dataset(
        val_df.drop("is_anomaly").to_pandas(),
        val_df["is_anomaly"].to_pandas(),
        reference=train_set,
    )

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
        "num_leaves": 31,
        "learning_rate": 0.05,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbose": -1,
    }

    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
    )

    # 予測
    y_pred_train = model.predict(train_df.drop("is_anomaly").to_pandas())
    y_pred_val = model.predict(val_df.drop("is_anomaly").to_pandas())
    y_pred_test = model.predict(test_df.drop("is_anomaly").to_pandas())

    # 閾値
    threshold = 0.2
    print(f"Threshold for anomaly detection: {threshold}")

    # precision
    train_precision = precision_score(
        train_df["is_anomaly"].to_numpy(),
        (y_pred_train > threshold).astype(int),
    )
    val_precision = precision_score(
        val_df["is_anomaly"].to_numpy(),
        (y_pred_val > threshold).astype(int),
    )
    test_precision = precision_score(
        test_df["is_anomaly"].to_numpy(),
        (y_pred_test > threshold).astype(int),
    )
    print("=============Precision=============")
    print(f"Train Precision: {train_precision:.2f}")
    print(f"Validation Precision: {val_precision:.2f}")
    print(f"Test Precision: {test_precision:.2f}")

    # recall
    train_recall = recall_score(
        train_df["is_anomaly"].to_numpy(),
        (y_pred_train > threshold).astype(int),
    )
    val_recall = recall_score(
        val_df["is_anomaly"].to_numpy(),
        (y_pred_val > threshold).astype(int),
    )
    test_recall = recall_score(
        test_df["is_anomaly"].to_numpy(),
        (y_pred_test > threshold).astype(int),
    )
    print("=============Recall=============")
    print(f"Train Recall: {train_recall:.2f}")
    print(f"Validation Recall: {val_recall:.2f}")
    print(f"Test Recall: {test_recall:.2f}")

    return y_pred_train, y_pred_val, y_pred_test


def ensemble_predictions(
    price_predictions: np.ndarray,
    anomaly_predictions: np.ndarray,
    constant_prediction: float,
    alpha: float = 0.9,
) -> np.ndarray:
    """
    アンサンブル予測を行う関数
    :param price_predictions: 価格予測の配列
    :param anomaly_predictions: 異常検知の予測値の配列
    :param constant_prediction: 定数予測値
    :param alpha: 異常検知の重み付け係数
    0 <= alpha <= 1
    :return: アンサンブル予測の配列
    """
    weight = anomaly_predictions**alpha

    ensemble_prediction = (
        weight * price_predictions + (1 - weight) * constant_prediction
    )

    return ensemble_prediction

In [109]:
main()

Train RMSE: 12,422,463
Validation RMSE: 9,315
Test RMSE: 7,589
Price bounds: 0 - inf
Remove outliers: False
Threshold for anomaly detection: 0.2
Train Precision: 1.00
Validation Precision: 0.52
Test Precision: 0.79
Train Recall: 1.00
Validation Recall: 0.38
Test Recall: 0.53
Ensembled Train RMSE: 12,422,545
Ensembled Validation RMSE: 46,996
Ensembled Test RMSE: 47,500
