original: https://www.guruguru.science/competitions/25/discussions/8b97734b-1f76-4075-b1af-5d227d6b70e8/ (@yururoi)


In [1]:
!rm -r /kaggle/working/*
%cd /kaggle/working

/bin/bash: line 1: /usr/bin/rm: Argument list too long
/kaggle/working


In [2]:
import os
import sys

PACKAGE_DIR = "/kaggle/src"
sys.path.append(PACKAGE_DIR)
sys.path.append(os.path.join(PACKAGE_DIR, "Penguin-ML-Library"))

In [3]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

MODEL_NAME = "xgboost"
CFG = yaml.safe_load(open(os.path.join(PACKAGE_DIR, "config.yaml"), "r"))
print(CFG[MODEL_NAME]["execution"]["exp_id"])
CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

init_logger(f"{ CFG[MODEL_NAME]['execution']['exp_id']}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-24 02:36:46.721357: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-24 02:36:46.747362: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_025


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import glob
import os
import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


In [5]:
class Config:
    N_FOLD = 5
    RANDOM_SATE = 42


NB = "exp1015"

In [6]:
ROOT_DIR = Path("/kaggle")
DATA_DIR = ROOT_DIR / Path("input/atmaCup#18_dataset")

In [7]:
train_df = pl.read_csv(DATA_DIR / "train_features.csv")
test_df = pl.read_csv(DATA_DIR / "test_features.csv")

## 特徴量生成


In [8]:
# train_df と test_dfを結合（特徴量エンジニアリングをしやすくするため）
_all_df = pl.concat([train_df, test_df], how="diagonal")

In [9]:
agg_cols = [
    "vEgo",
    "aEgo",
    "steeringAngleDeg",
    "steeringTorque",
    "gas",
]  # 同一シーンから集計する値のカラム名

# 同一シーンから特徴量作成
exprs = []
exprs += [
    pl.col(agg_col).shift(-1).over("scene").alias(f"{agg_col}_shift-1") for agg_col in agg_cols
]  # 1ステップ前の時間の値
exprs += [
    pl.col(agg_col).shift(1).over("scene").alias(f"{agg_col}_shift1") for agg_col in agg_cols
]  # 1ステップ後の時間の値
exprs += [
    pl.col(agg_col).diff(-1).over("scene").alias(f"{agg_col}_diff-1") for agg_col in agg_cols
]  # 1ステップ前の時間の値との差分
exprs += [
    pl.col(agg_col).diff(1).over("scene").alias(f"{agg_col}_diff1") for agg_col in agg_cols
]  # 1ステップ後の時間の値との差分
exprs += [pl.col(agg_col).mean().over("scene").alias(f"{agg_col}_mean") for agg_col in agg_cols]  # 同一シーンの平均値
exprs += [pl.col(agg_col).std().over("scene").alias(f"{agg_col}_std") for agg_col in agg_cols]  # 同一シーンの標準偏差
exprs += [pl.col(agg_col).max().over("scene").alias(f"{agg_col}_max") for agg_col in agg_cols]  # 同一シーンの最大値
exprs += [pl.col(agg_col).min().over("scene").alias(f"{agg_col}_min") for agg_col in agg_cols]  # 同一シーンの最小値

_all_df = (
    _all_df.with_columns(
        # ID からシーンとデシ秒を作成
        pl.col("ID").str.split("_").list.get(0).alias("scene"),
        pl.col("ID").str.split("_").list.get(1).cast(pl.Int32).alias("decisecond"),
    )
    .sort(
        # shiftと diffが時系列順に並んでいる必要があるためシーンごとに時間軸でソート
        "scene",
        "decisecond",
    )
    .with_columns(exprs)
)

In [10]:
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"]).rename({"sceneID": "scene"})
_all_df = _all_df.join(train_folds, how="left", on="scene")
# assert train_df["fold"].null_count() == 0

In [11]:
# YOLOの検出結果
import json

yolo_paths = glob.glob("/kaggle/input/yolo-det/det/*.json")
yolo_dfs = []
for path in tqdm(yolo_paths):
    ID = os.path.basename(path).split(".")[0]
    with open(path, "r") as f:
        data = json.load(f)

    yolo_feature = {
        "ID": ID,
        "num_objects": len(data),
    }
    for bbox in data:
        if bbox["x1"] == bbox["x2"] or bbox["y1"] == bbox["y2"]:
            continue

        if bbox["cls"] != "car":
            continue

        # count
        if bbox["cls"] not in yolo_feature:
            yolo_feature[bbox["cls"]] = 0
        yolo_feature[bbox["cls"]] += 1

        # 最も横方向が中央にあるものの情報
        if f"center_x_{bbox['cls']}" not in yolo_feature:
            yolo_feature[f"center_x_{bbox['cls']}"] = -1
        current_dist = abs(yolo_feature[f"center_x_{bbox['cls']}"] - 64)
        now_center_x = (bbox["x1"] + bbox["x2"]) / 2
        now_dist = abs(now_center_x - 64)
        if now_dist < current_dist:
            yolo_feature[f"center_x_{bbox['cls']}"] = now_center_x
            yolo_feature[f"center_y_{bbox['cls']}"] = (bbox["y1"] + bbox["y2"]) / 2
            yolo_feature[f"width_{bbox['cls']}"] = bbox["x2"] - bbox["x1"]
            yolo_feature[f"height_{bbox['cls']}"] = bbox["y2"] - bbox["y1"]
            yolo_feature[f"bottom_{bbox['cls']}"] = bbox["y2"]
            yolo_feature[f"area_{bbox['cls']}"] = (bbox["x2"] - bbox["x1"]) * (bbox["y2"] - bbox["y1"])
            yolo_feature[f"aspect_ratio_{bbox['cls']}"] = (bbox["x2"] - bbox["x1"]) / (bbox["y2"] - bbox["y1"])
            yolo_feature[f"conf_{bbox['cls']}"] = bbox["conf"]
    yolo_dfs.append(yolo_feature)
yolo_df = pl.DataFrame(yolo_dfs)

_all_df = _all_df.join(yolo_df, how="left", on="ID")

  0%|          | 0/45098 [00:00<?, ?it/s]

100%|██████████| 45098/45098 [00:00<00:00, 78671.65it/s]


In [12]:
# depth
import cv2

depth_features = []
for ID in tqdm(_all_df["ID"]):
    path = f"/kaggle/input/depth_image/depth/{ID}/0.png"
    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)

    patch_size = 16
    this_features = {"ID": ID}
    for i in range(0, 128, patch_size):
        for j in range(0, 64, patch_size):
            patch = image[i : i + patch_size, j : j + patch_size]
            this_features[f"patch_{i}_{j}_mean"] = np.mean(patch)
            this_features[f"patch_{i}_{j}_median"] = np.median(patch)
    depth_features.append(this_features)
depth_df = pl.DataFrame(depth_features)
_all_df = _all_df.join(depth_df, how="left", on="ID")

  0%|          | 0/45098 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 45098/45098 [00:18<00:00, 2479.01it/s]


## feature and target


In [13]:
targets = [
    "x_0",
    "y_0",
    "z_0",
    "x_1",
    "y_1",
    "z_1",
    "x_2",
    "y_2",
    "z_2",
    "x_3",
    "y_3",
    "z_3",
    "x_4",
    "y_4",
    "z_4",
    "x_5",
    "y_5",
    "z_5",
]

# 使う特徴量を指定するより使わない特徴量を指定するほうが試行錯誤が楽
del_columns = targets + ["ID", "scene", "gearShifter", "fold"]

features = list(set(_all_df.columns) - set(del_columns))
features.sort()

print(features)

['aEgo', 'aEgo_diff-1', 'aEgo_diff1', 'aEgo_max', 'aEgo_mean', 'aEgo_min', 'aEgo_shift-1', 'aEgo_shift1', 'aEgo_std', 'area_car', 'aspect_ratio_car', 'bottom_car', 'brake', 'brakePressed', 'car', 'center_x_car', 'center_y_car', 'conf_car', 'decisecond', 'gas', 'gasPressed', 'gas_diff-1', 'gas_diff1', 'gas_max', 'gas_mean', 'gas_min', 'gas_shift-1', 'gas_shift1', 'gas_std', 'height_car', 'leftBlinker', 'num_objects', 'patch_0_0_mean', 'patch_0_0_median', 'patch_0_16_mean', 'patch_0_16_median', 'patch_0_32_mean', 'patch_0_32_median', 'patch_0_48_mean', 'patch_0_48_median', 'patch_112_0_mean', 'patch_112_0_median', 'patch_112_16_mean', 'patch_112_16_median', 'patch_112_32_mean', 'patch_112_32_median', 'patch_112_48_mean', 'patch_112_48_median', 'patch_16_0_mean', 'patch_16_0_median', 'patch_16_16_mean', 'patch_16_16_median', 'patch_16_32_mean', 'patch_16_32_median', 'patch_16_48_mean', 'patch_16_48_median', 'patch_32_0_mean', 'patch_32_0_median', 'patch_32_16_mean', 'patch_32_16_median', 

In [14]:
# MAEを計算
def evaluation(true_values, pred_values):
    abs_diff = abs(true_values - pred_values)
    mae = np.mean(
        abs_diff.reshape(
            -1,
        )
    )
    return mae

### encoding


In [15]:
# label encdoding
categorical_columns = ["gearShifter"]

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    _all_df = _all_df.with_columns(pl.Series(le.fit_transform(_all_df[col])).alias(f"{col}_le"))
cate_features = [f"{col}_le" for c in categorical_columns]
features = list(set(features) | set(cate_features))

# count encoding
count_enc = ["gearShifter"]
_all_df = _all_df.with_columns([pl.col(c).count().over(c).alias(f"{c}_count") for c in count_enc])
count_features = [f"{c}_count" for c in count_enc]
features = list(set(features) | set(count_features))


train_df = train_df.join(_all_df, how="left", on="ID")
test_df = test_df.join(_all_df, how="left", on="ID")

In [16]:
def train_lgbm(target):
    params = {
        "objective": "reg:squarederror",
        "random_state": Config.RANDOM_SATE,
        "learning_rate": 0.01,
        # "max_depth": 7,
        # "colsample_bytree": 0.7,
        "reg_alpha": 1,
        "reg_lambda": 1,
        # "subsample": 0.7,
        "tree_method": "hist",
        "device": "cuda",
    }

    oof_pred = np.zeros(len(train_df))
    y_pred = np.zeros(len(test_df))
    models = []
    cv_scores = {}

    for fold in range(5):
        print(f"fold{fold}: ", end="")

        # TrainとTestに分割
        x_train = train_df.filter(pl.col("fold") != fold).select(features)
        x_val = train_df.filter(pl.col("fold") == fold).select(features)
        y_train = train_df.filter(pl.col("fold") != fold).select(target)
        y_val = train_df.filter(pl.col("fold") == fold).select(target)

        test = test_df[features]

        # create Dataset
        train_set = xgb.DMatrix(
            x_train.to_pandas(),
            label=y_train.to_pandas(),
            enable_categorical=False,
        )
        val_set = xgb.DMatrix(
            x_val.to_pandas(),
            label=y_val.to_pandas(),
            enable_categorical=False,
        )

        # train
        model = xgb.train(
            params,
            train_set,
            evals=[(val_set, "eval")],
            num_boost_round=10000,
            early_stopping_rounds=100,
            verbose_eval=500000,
        )

        models.append(model)

        fold_pred = model.predict(xgb.DMatrix(x_val.to_pandas(), enable_categorical=False))

        score = evaluation(y_val.to_numpy().reshape(-1), fold_pred)
        cv_scores[f"cv{fold}"] = score

        # oof_pred[test_index] = fold_pred
        oof_pred[train_df["fold"].to_numpy() == fold] = fold_pred

        y_pred += model.predict(xgb.DMatrix(test.to_pandas(), enable_categorical=False)) / Config.N_FOLD

        print(f"{score}")

    oof_score = evaluation(train_df[target].to_numpy().reshape(-1), oof_pred)
    print(f"OOF score is {oof_score}")

    return oof_pred, y_pred, models

## 学習


In [17]:
# def add_dt_features(train: pl.DataFrame):
#     """dt秒後の特徴"""
#     train = train.with_columns(
#         # vt
#         (pl.col("vEgo") * pl.col("dt").cast(pl.Float32)).alias("linear_movement@dt"),
#         # vt + 0.5at^2
#         ((pl.col("vEgo") + 0.5 * pl.col("aEgo") * pl.col("dt").cast(pl.Float32) ** 2).alias("movement@dt")),
#         # v + at
#         (pl.col("vEgo") + pl.col("aEgo") * pl.col("dt").cast(pl.Float32)).alias("velocity@dt"),
#     )
#     return train

sub_col_names = [
    "x_0",
    "y_0",
    "z_0",
    "x_1",
    "y_1",
    "z_1",
    "x_2",
    "y_2",
    "z_2",
    "x_3",
    "y_3",
    "z_3",
    "x_4",
    "y_4",
    "z_4",
    "x_5",
    "y_5",
    "z_5",
]

for seed in range(10):
    # cv split
    all_scenes = train_df["scene"].unique().to_numpy()
    np.random.seed(seed)
    np.random.shuffle(all_scenes)
    scene2fold = {}
    for i, scene in enumerate(all_scenes):
        scene2fold[scene] = i % 5
    fold_df = pl.DataFrame({"scene": all_scenes, "fold": [scene2fold[s] for s in all_scenes]})
    train_df = train_df.drop("fold").join(fold_df, how="left", on="scene")
    assert train_df["fold"].null_count() == 0

    # train
    models_dict = {}
    test_pred = []
    oof_pred = []
    for target in targets:
        print("=" * 50)
        print(f"# {target}")
        print("=" * 50)

        # # dt features
        # dt = float(target.split("_")[-1]) * 0.5 + 0.5
        # train_df = train_df.with_columns(pl.lit(dt).alias("dt"))
        # test_df = test_df.with_columns(pl.lit(dt).alias("dt"))
        # train_df = add_dt_features(train_df)
        # test_df = add_dt_features(test_df)
        # features = list(set(features) | set(["linear_movement@dt", "movement@dt", "velocity@dt"]))

        oof_preds_partial, y_pred_partial, models_partial = train_lgbm(target)
        models_dict[target] = models_partial
        oof_pred.append(oof_preds_partial)
        test_pred.append(y_pred_partial)

    sub_df = pl.DataFrame(np.vstack(test_pred).T, schema=sub_col_names)
    sub_df = sub_df.with_columns(pl.Series("ID", test_df["ID"]))
    sub_df.write_csv(os.path.join(CFG["output_dir"], f"submission_seed{seed}.csv"))

    oof_df = pl.DataFrame(np.vstack(oof_pred).T, schema=sub_col_names)
    oof_df = oof_df.with_columns(pl.Series("ID", train_df["ID"]))
    oof_df.write_csv(os.path.join(CFG["output_dir"], f"oof_seed{seed}.csv"))
    print(evaluation(train_df[targets].to_numpy(), np.vstack(oof_pred).T))

# x_0
fold0: [0]	eval-rmse:3.21905
[940]	eval-rmse:0.09770
0.061770284278885264
fold1: [0]	eval-rmse:3.22951
[1180]	eval-rmse:0.10247
0.061804641267924766
fold2: [0]	eval-rmse:3.20037
[1992]	eval-rmse:0.10656
0.06356412970802873
fold3: [0]	eval-rmse:3.28369
[1243]	eval-rmse:0.09831
0.062105755253847206
fold4: [0]	eval-rmse:3.23773
[1324]	eval-rmse:0.09880
0.06222952644471928
OOF score is 0.06229452575801168
# y_0
fold0: [0]	eval-rmse:0.11279
[619]	eval-rmse:0.05579
0.032283938168854595
fold1: [0]	eval-rmse:0.10932
[626]	eval-rmse:0.05076
0.032288995949071655
fold2: [0]	eval-rmse:0.12053
[792]	eval-rmse:0.06867
0.03303467549680738
fold3: [0]	eval-rmse:0.12298
[1160]	eval-rmse:0.07039
0.03247051210817366
fold4: [0]	eval-rmse:0.11434
[713]	eval-rmse:0.06290
0.0318219850904081
OOF score is 0.032379859154523676
# z_0
fold0: [0]	eval-rmse:0.04074
[231]	eval-rmse:0.04020
0.02612758132845812
fold1: [0]	eval-rmse:0.03948
[796]	eval-rmse:0.03831
0.026089707011542184
fold2: [0]	eval-rmse:0.04305


In [18]:
files = glob.glob(os.path.join(CFG["output_dir"], "oof*.csv"))
oof_df = pl.read_csv(files[0])
for c in sub_col_names:
    oof_df = oof_df.with_columns(pl.lit(0).alias(c))
for f in files:
    df = pl.read_csv(f)
    for c in sub_col_names:
        oof_df = oof_df.with_columns(oof_df[c] + df[c] / len(files))

oof_df.write_csv(os.path.join(CFG["output_dir"], "oof.csv"))
mae = evaluation(train_df[targets].to_numpy(), oof_df[sub_col_names].to_numpy())
mae

0.2036799677680276

In [19]:
files = glob.glob(os.path.join(CFG["output_dir"], "sub*.csv"))
oof_df = pl.read_csv(files[0])
for c in sub_col_names:
    oof_df = oof_df.with_columns(pl.lit(0).alias(c))
for f in files:
    df = pl.read_csv(f)
    for c in sub_col_names:
        oof_df = oof_df.with_columns(oof_df[c] + df[c] / len(files))

oof_df.write_csv(os.path.join(CFG["output_dir"], "submission.csv"))