In [1]:
!rm -r /kaggle/working/*
%cd /kaggle/working

/kaggle/working


In [2]:
import os
import sys

PACKAGE_DIR = "/kaggle/src"
sys.path.append(PACKAGE_DIR)
sys.path.append(os.path.join(PACKAGE_DIR, "Penguin-ML-Library"))

In [3]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

MODEL_NAME = "xgboost"
CFG = yaml.safe_load(open(os.path.join(PACKAGE_DIR, "config.yaml"), "r"))
print(CFG[MODEL_NAME]["execution"]["exp_id"])
CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

init_logger(f"{ CFG[MODEL_NAME]['execution']['exp_id']}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-16 09:52:02.674959: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-16 09:52:02.701368: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_006


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import warnings

import numpy as np
import polars as pl

warnings.filterwarnings("ignore")

In [5]:
train = pl.read_csv(os.path.join(CFG["dataset"]["competition_dir"], "train_features.csv"))
train = (
    train.with_columns(
        pl.col("ID").str.split_exact("_", n=1).struct.rename_fields(["sceneID", "offset"]).alias("fields")
    )
    .unnest("fields")
    .with_columns(pl.col("offset").cast(pl.Float32))
)
print(train.shape)
train.head(1)

(43371, 32)


ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,sceneID,offset
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f32
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320.0


## 特徴量生成


In [6]:
from penguinml.utils.contena import FeatureContena

features = FeatureContena()
basic_features = [
    "vEgo",
    "aEgo",
    "steeringAngleDeg",
    "steeringTorque",
    "brake",
    "brakePressed",
    "gas",
    "gasPressed",
    "leftBlinker",
    "rightBlinker",
    "offset",
]
for c in basic_features:
    train = train.with_columns(pl.col(c).cast(pl.Float32))
features.add_num_features(basic_features)
features.add_cat_features(["gearShifter"])

In [None]:
# 前後のフレームとのラグ特徴量
train = train.sort(["sceneID", "offset"])
for c in basic_features:
    for diff in range(-1, 0):
        if diff == 0:
            continue
        train = train.with_columns(
            pl.col(c).diff(n=diff).over("sceneID").alias(f"{c}_diff_{diff}"),
            pl.col(c).diff(n=diff).over("sceneID").alias(f"{c}_shift_{diff}"),
        )
        features.add_num_features([f"{c}_diff_{diff}", f"{c}_shift_{diff}"])

In [8]:
# train = train.with_columns(
#     (pl.col("vEgo") / pl.col("aEgo")).alias("vEgo/aEgo"),
# )
# features.add_num_features(["vEgo/aEgo"])

## ターゲット列を分解


In [9]:
from const import TARGET_COLS

train = (
    train.unpivot(index="ID", on=TARGET_COLS, variable_name="target_name", value_name="target")
    .join(
        train.drop(TARGET_COLS),
        on="ID",
        how="left",
    )
    .with_columns(
        (
            pl.col("target_name").map_elements(lambda x: float(x.split("_")[1]), return_dtype=pl.Float32) * 0.5 + 0.5
        ).alias("dt"),
        pl.col("target_name").map_elements(lambda x: x.split("_")[0], return_dtype=str).alias("xyz"),
    )
    .with_columns(
        pl.col("target_name").alias("target_name_original"),
    )
)
features.add_cat_features(["target_name", "xyz"])
train.head(1)

ID,target_name,target,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,sceneID,offset,vEgo_diff_-1,vEgo_shift_-1,aEgo_diff_-1,aEgo_shift_-1,steeringAngleDeg_diff_-1,steeringAngleDeg_shift_-1,steeringTorque_diff_-1,steeringTorque_shift_-1,brake_diff_-1,brake_shift_-1,brakePressed_diff_-1,brakePressed_shift_-1,gas_diff_-1,gas_shift_-1,gasPressed_diff_-1,gasPressed_shift_-1,leftBlinker_diff_-1,leftBlinker_shift_-1,rightBlinker_diff_-1,rightBlinker_shift_-1,offset_diff_-1,offset_shift_-1,dt,xyz,target_name_original
str,str,f64,f32,f32,f32,f32,f32,f32,f32,f32,str,f32,f32,str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,str,str
"""00066be8e20318869c38c66be46663…","""x_0""",2.82959,5.701526,1.538456,-2.165777,-139.0,0.0,0.0,0.25,1.0,"""drive""",0.0,0.0,"""00066be8e20318869c38c66be46663…",320.0,-5.474767,-5.474767,1.258575,1.258575,9.45992,9.45992,-95.0,-95.0,0.0,0.0,0.0,0.0,0.25,0.25,1.0,1.0,0.0,0.0,-1.0,-1.0,-100.0,-100.0,0.5,"""x""","""x_0"""


In [10]:
# dt秒後の特徴
train = train.with_columns(
    # vt
    (pl.col("vEgo") * pl.col("dt").cast(pl.Float32)).alias("linear_movement@dt"),
    # vt + 0.5at^2
    ((pl.col("vEgo") + 0.5 * pl.col("aEgo") * pl.col("dt").cast(pl.Float32) ** 2).alias("movement@dt")),
    # v + at
    (pl.col("vEgo") + pl.col("aEgo") * pl.col("dt").cast(pl.Float32)).alias("velocity@dt"),
    # # cos
    # (pl.col("steeringAngleDeg").map_elements(lambda x: np.cos(np.deg2rad(x)), return_dtype=pl.Float32)).alias("cos"),
    # # sin
    # (pl.col("steeringAngleDeg").map_elements(lambda x: np.sin(np.deg2rad(x)), return_dtype=pl.Float32)).alias("sin"),
).with_columns(
    # (pl.col("movement@dt") * pl.col("cos")).alias("movement@dt*cos"),
    # (pl.col("movement@dt") * pl.col("sin")).alias("movement@dt*sin"),
    # (pl.col("linear_movement@dt") * pl.col("cos")).alias("linear_movement@dt*cos"),
    # (pl.col("linear_movement@dt") * pl.col("sin")).alias("linear_movement@dt*sin"),
    # (pl.col("velocity@dt") * pl.col("cos")).alias("velocity@dt*cos"),
    # (pl.col("velocity@dt") * pl.col("sin")).alias("velocity@dt*sin"),
)

features.add_num_features(
    [
        "linear_movement@dt",
        "movement@dt",
        "velocity@dt",
        # "cos",
        # "sin",
        # "movement@dt*cos",
        # "movement@dt*sin",
        # "linear_movement@dt*cos",
        # "linear_movement@dt*sin",
        # "velocity@dt*cos",
        # "velocity@dt*sin",
    ]
)

In [11]:
# シーン内の集約特徴量
for c in features.num_features():
    train = train.with_columns(
        pl.col(c).mean().over("sceneID").alias(f"{c}_mean"),
        pl.col(c).std().over("sceneID").alias(f"{c}_std"),
        pl.col(c).max().over("sceneID").alias(f"{c}_max"),
        pl.col(c).min().over("sceneID").alias(f"{c}_min"),
    )
    features.add_num_features([f"{c}_mean", f"{c}_std", f"{c}_max", f"{c}_min"])

In [12]:
from const import CATEGORY_MAPPING

for c in features.num_features():
    train = train.with_columns(pl.col(c).cast(pl.Float32))

for c in features.cat_features():
    mapping = CATEGORY_MAPPING[c]
    train = train.with_columns(pl.col(c).replace_strict(mapping).cast(pl.Int32))

## CV Split


In [13]:
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"])
train = train.join(train_folds, on="sceneID", how="left")
assert train["fold"].null_count() == 0

## Training


In [14]:
from penguinml.gbdt.xgboost import fit_xgb, inference_xgb
from tqdm import tqdm

dfs = []
for c in tqdm(sorted(TARGET_COLS, key=lambda x: x.split("_")[1], reverse=True)):
    print("==" * 10 + c + "==" * 10)
    this_df = train.filter(pl.col("target_name_original") == c)
    oof, models = fit_xgb(
        data=this_df,
        features=features,
        params=CFG[MODEL_NAME]["params"],
        target_col="target",
        fold_col="fold",
        target_type="regression",
        verbose=50000,
    )
    this_df = this_df.with_columns(pl.Series("oof", oof))
    dfs.append(this_df)
train = pl.concat(dfs)

  0%|          | 0/18 [00:00<?, ?it/s]

== fold 0 ==
[0]	validation_0-mae:17.66787
[1748]	validation_0-mae:0.69594
== fold 1 ==
[0]	validation_0-mae:17.69330
[2630]	validation_0-mae:0.72663
== fold 2 ==
[0]	validation_0-mae:17.90410
[3804]	validation_0-mae:0.71260
== fold 3 ==
[0]	validation_0-mae:17.77282
[2623]	validation_0-mae:0.70398
== fold 4 ==
[0]	validation_0-mae:17.38763
[2279]	validation_0-mae:0.71572


  6%|▌         | 1/18 [01:19<22:34, 79.65s/it]

== fold 0 ==
[0]	validation_0-mae:0.99784
[1597]	validation_0-mae:0.46579
== fold 1 ==
[0]	validation_0-mae:1.07323
[1729]	validation_0-mae:0.48548
== fold 2 ==
[0]	validation_0-mae:1.06209
[1664]	validation_0-mae:0.48569
== fold 3 ==
[0]	validation_0-mae:1.00584
[2655]	validation_0-mae:0.48122
== fold 4 ==
[0]	validation_0-mae:1.09607
[1657]	validation_0-mae:0.49961


 11%|█         | 2/18 [02:13<17:13, 64.61s/it]

== fold 0 ==
[0]	validation_0-mae:0.18292
[473]	validation_0-mae:0.17740
== fold 1 ==
[0]	validation_0-mae:0.18178
[681]	validation_0-mae:0.17457
== fold 2 ==
[0]	validation_0-mae:0.18551
[795]	validation_0-mae:0.17843
== fold 3 ==
[0]	validation_0-mae:0.18896
[777]	validation_0-mae:0.18310
== fold 4 ==
[0]	validation_0-mae:0.18486
[788]	validation_0-mae:0.17884


 17%|█▋        | 3/18 [02:35<11:18, 45.22s/it]

== fold 0 ==
[0]	validation_0-mae:14.67363
[3967]	validation_0-mae:0.52296
== fold 1 ==
[0]	validation_0-mae:14.69701
[2503]	validation_0-mae:0.53930
== fold 2 ==
[0]	validation_0-mae:14.86732
[5088]	validation_0-mae:0.53086
== fold 3 ==
[0]	validation_0-mae:14.76484
[3331]	validation_0-mae:0.52452
== fold 4 ==
[0]	validation_0-mae:14.43916
[3253]	validation_0-mae:0.53314


 22%|██▏       | 4/18 [04:19<15:57, 68.37s/it]

== fold 0 ==
[0]	validation_0-mae:0.72716
[2419]	validation_0-mae:0.32325
== fold 1 ==
[0]	validation_0-mae:0.77817
[1755]	validation_0-mae:0.33433
== fold 2 ==
[0]	validation_0-mae:0.77434
[1698]	validation_0-mae:0.33663
== fold 3 ==
[0]	validation_0-mae:0.72850
[2090]	validation_0-mae:0.33235
== fold 4 ==
[0]	validation_0-mae:0.79461
[1865]	validation_0-mae:0.34520


 28%|██▊       | 5/18 [05:18<14:04, 64.97s/it]

== fold 0 ==
[0]	validation_0-mae:0.14817
[683]	validation_0-mae:0.14396
== fold 1 ==
[0]	validation_0-mae:0.14718
[809]	validation_0-mae:0.14194
== fold 2 ==
[0]	validation_0-mae:0.15077
[435]	validation_0-mae:0.14561
== fold 3 ==
[0]	validation_0-mae:0.15403
[644]	validation_0-mae:0.14948
== fold 4 ==
[0]	validation_0-mae:0.15036
[829]	validation_0-mae:0.14538


 33%|███▎      | 6/18 [05:40<10:03, 50.26s/it]

== fold 0 ==
[0]	validation_0-mae:11.68011
[3463]	validation_0-mae:0.36888
== fold 1 ==
[0]	validation_0-mae:11.69991
[2270]	validation_0-mae:0.37380
== fold 2 ==
[0]	validation_0-mae:11.83293
[2718]	validation_0-mae:0.36994
== fold 3 ==
[0]	validation_0-mae:11.75277
[4264]	validation_0-mae:0.36635
== fold 4 ==
[0]	validation_0-mae:11.48912
[2331]	validation_0-mae:0.37293


 39%|███▉      | 7/18 [07:06<11:22, 62.04s/it]

== fold 0 ==
[0]	validation_0-mae:0.49626
[1950]	validation_0-mae:0.21119
== fold 1 ==
[0]	validation_0-mae:0.52888
[1544]	validation_0-mae:0.21731
== fold 2 ==
[0]	validation_0-mae:0.52808
[2413]	validation_0-mae:0.21833
== fold 3 ==
[0]	validation_0-mae:0.49421
[1647]	validation_0-mae:0.21704
== fold 4 ==
[0]	validation_0-mae:0.53921
[3097]	validation_0-mae:0.22368


 44%|████▍     | 8/18 [08:08<10:19, 61.92s/it]

== fold 0 ==
[0]	validation_0-mae:0.11537
[897]	validation_0-mae:0.11243
== fold 1 ==
[0]	validation_0-mae:0.11423
[496]	validation_0-mae:0.11040
== fold 2 ==
[0]	validation_0-mae:0.11769
[431]	validation_0-mae:0.11404
== fold 3 ==
[0]	validation_0-mae:0.12034
[805]	validation_0-mae:0.11701
== fold 4 ==
[0]	validation_0-mae:0.11718
[627]	validation_0-mae:0.11386


 50%|█████     | 9/18 [08:29<07:21, 49.06s/it]

== fold 0 ==
[0]	validation_0-mae:8.68596
[2811]	validation_0-mae:0.23631
== fold 1 ==
[0]	validation_0-mae:8.70019
[2196]	validation_0-mae:0.23757
== fold 2 ==
[0]	validation_0-mae:8.79898
[1804]	validation_0-mae:0.23564
== fold 3 ==
[0]	validation_0-mae:8.74252
[4585]	validation_0-mae:0.23667
== fold 4 ==
[0]	validation_0-mae:8.54055
[1837]	validation_0-mae:0.24001


 56%|█████▌    | 10/18 [09:46<07:42, 57.84s/it]

== fold 0 ==
[0]	validation_0-mae:0.30688
[1898]	validation_0-mae:0.12912
== fold 1 ==
[0]	validation_0-mae:0.32413
[1890]	validation_0-mae:0.13041
== fold 2 ==
[0]	validation_0-mae:0.32518
[2614]	validation_0-mae:0.13334
== fold 3 ==
[0]	validation_0-mae:0.30338
[1453]	validation_0-mae:0.13180
== fold 4 ==
[0]	validation_0-mae:0.33145
[1665]	validation_0-mae:0.13522


 61%|██████    | 11/18 [10:43<06:42, 57.43s/it]

== fold 0 ==
[0]	validation_0-mae:0.08426
[352]	validation_0-mae:0.08255
== fold 1 ==
[0]	validation_0-mae:0.08309
[593]	validation_0-mae:0.08079
== fold 2 ==
[0]	validation_0-mae:0.08595
[457]	validation_0-mae:0.08374
== fold 3 ==
[0]	validation_0-mae:0.08784
[950]	validation_0-mae:0.08549
== fold 4 ==
[0]	validation_0-mae:0.08493
[440]	validation_0-mae:0.08281


 67%|██████▋   | 12/18 [11:01<04:33, 45.51s/it]

== fold 0 ==
[0]	validation_0-mae:5.69131
[1228]	validation_0-mae:0.13827
== fold 1 ==
[0]	validation_0-mae:5.70208
[1557]	validation_0-mae:0.13917
== fold 2 ==
[0]	validation_0-mae:5.76656
[1493]	validation_0-mae:0.14054
== fold 3 ==
[0]	validation_0-mae:5.72882
[2765]	validation_0-mae:0.13850
== fold 4 ==
[0]	validation_0-mae:5.59519
[2144]	validation_0-mae:0.13917


 72%|███████▏  | 13/18 [11:56<04:01, 48.33s/it]

== fold 0 ==
[0]	validation_0-mae:0.16016
[2099]	validation_0-mae:0.07271
== fold 1 ==
[0]	validation_0-mae:0.16662
[1625]	validation_0-mae:0.07102
== fold 2 ==
[0]	validation_0-mae:0.16934
[1712]	validation_0-mae:0.07405
== fold 3 ==
[0]	validation_0-mae:0.15773
[1904]	validation_0-mae:0.07329
== fold 4 ==
[0]	validation_0-mae:0.17199
[1789]	validation_0-mae:0.07473


 78%|███████▊  | 14/18 [12:46<03:16, 49.04s/it]

== fold 0 ==
[0]	validation_0-mae:0.05457
[352]	validation_0-mae:0.05366
== fold 1 ==
[0]	validation_0-mae:0.05381
[488]	validation_0-mae:0.05260
== fold 2 ==
[0]	validation_0-mae:0.05571
[548]	validation_0-mae:0.05445
== fold 3 ==
[0]	validation_0-mae:0.05646
[347]	validation_0-mae:0.05528
== fold 4 ==
[0]	validation_0-mae:0.05474
[507]	validation_0-mae:0.05359


 83%|████████▎ | 15/18 [13:01<01:56, 38.73s/it]

== fold 0 ==
[0]	validation_0-mae:2.69628
[1343]	validation_0-mae:0.06444
== fold 1 ==
[0]	validation_0-mae:2.70247
[1997]	validation_0-mae:0.06337
== fold 2 ==
[0]	validation_0-mae:2.73223
[1393]	validation_0-mae:0.06607
== fold 3 ==
[0]	validation_0-mae:2.71474
[1313]	validation_0-mae:0.06478
== fold 4 ==
[0]	validation_0-mae:2.65160
[1407]	validation_0-mae:0.06476


 89%|████████▉ | 16/18 [13:44<01:19, 39.91s/it]

== fold 0 ==
[0]	validation_0-mae:0.05734
[1631]	validation_0-mae:0.03201
== fold 1 ==
[0]	validation_0-mae:0.05893
[1674]	validation_0-mae:0.03135
== fold 2 ==
[0]	validation_0-mae:0.06033
[1554]	validation_0-mae:0.03257
== fold 3 ==
[0]	validation_0-mae:0.05632
[1170]	validation_0-mae:0.03232
== fold 4 ==
[0]	validation_0-mae:0.06154
[1872]	validation_0-mae:0.03321


 94%|█████████▍| 17/18 [14:31<00:41, 41.99s/it]

== fold 0 ==
[0]	validation_0-mae:0.02616
[334]	validation_0-mae:0.02577
== fold 1 ==
[0]	validation_0-mae:0.02584
[540]	validation_0-mae:0.02540
== fold 2 ==
[0]	validation_0-mae:0.02683
[432]	validation_0-mae:0.02635
== fold 3 ==
[0]	validation_0-mae:0.02686
[424]	validation_0-mae:0.02639
== fold 4 ==
[0]	validation_0-mae:0.02610
[508]	validation_0-mae:0.02560


100%|██████████| 18/18 [14:45<00:00, 49.17s/it]


In [15]:
mae = np.abs(train["oof"] - train["target"]).mean()
print(f"MAE: {mae}")

MAE: 0.2180115893876533


In [16]:
rev_dict = {v: "oof_" + k for k, v in CATEGORY_MAPPING["target_name"].items()}
oof_df = (
    train.select(["ID", "target_name", "oof"])
    .with_columns(pl.col("target_name").replace_strict(rev_dict))
    .pivot(index="ID", columns="target_name", values="oof")
)
oof_df.head()

ID,oof_x_5,oof_y_5,oof_z_5,oof_x_4,oof_y_4,oof_z_4,oof_x_3,oof_y_3,oof_z_3,oof_x_2,oof_y_2,oof_z_2,oof_x_1,oof_y_1,oof_z_1,oof_x_0,oof_y_0,oof_z_0
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""00066be8e20318869c38c66be46663…",22.464357,-0.4018,0.188004,18.046217,-0.277361,0.119448,13.619493,-0.124755,0.101725,9.590807,-0.051544,0.042231,5.902345,-0.014749,0.019444,2.634666,0.00116,0.010888
"""00066be8e20318869c38c66be46663…",31.490473,-0.665967,0.121405,26.373646,-0.819956,0.146707,21.2544,-0.593322,0.113506,15.86466,-0.415264,0.057983,10.524738,-0.202545,0.029712,4.987913,-0.052315,0.017456
"""00066be8e20318869c38c66be46663…",32.024368,-0.366316,0.08195,26.741051,-0.280077,0.122031,21.160831,-0.168629,0.096127,15.562232,-0.116979,0.058289,10.162181,-0.053695,0.028867,4.733961,-0.014774,0.016387
"""000fb056f97572d384bae4f5fc1e0f…",15.165918,-0.864976,0.060343,11.628304,-0.617608,0.035401,9.022715,-0.43137,0.033625,6.327863,-0.243893,0.011998,3.859671,-0.135009,0.005838,1.613916,-0.050572,0.001827
"""000fb056f97572d384bae4f5fc1e0f…",18.061884,1.006505,-0.03569,14.987482,0.718921,-0.023693,11.832363,0.500763,-0.00857,8.563321,0.28102,-0.005479,5.775146,0.113057,-0.002916,2.759885,0.02393,-0.001398


In [17]:
oof_df.write_csv(os.path.join(CFG["output_dir"], "oof.csv"))