In [1]:
!rm -r /kaggle/working/*
%cd /kaggle/working

rm: cannot remove '/kaggle/working/*': No such file or directory
/kaggle/working


In [2]:
import os
import sys

PACKAGE_DIR = "/kaggle/src"
sys.path.append(PACKAGE_DIR)
sys.path.append(os.path.join(PACKAGE_DIR, "Penguin-ML-Library"))

In [3]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

MODEL_NAME = "xgboost"
CFG = yaml.safe_load(open(os.path.join(PACKAGE_DIR, "config.yaml"), "r"))
print(CFG[MODEL_NAME]["execution"]["exp_id"])
CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

init_logger(f"{ CFG[MODEL_NAME]['execution']['exp_id']}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-16 09:27:04.641922: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-16 09:27:04.670105: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_005


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import warnings

import numpy as np
import polars as pl

warnings.filterwarnings("ignore")

In [5]:
train = pl.read_csv(os.path.join(CFG["dataset"]["competition_dir"], "train_features.csv"))
train = (
    train.with_columns(
        pl.col("ID").str.split_exact("_", n=1).struct.rename_fields(["sceneID", "offset"]).alias("fields")
    )
    .unnest("fields")
    .with_columns(pl.col("offset").cast(pl.Float32))
)
print(train.shape)
train.head(1)

(43371, 32)


ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,sceneID,offset
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f32
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320.0


## 特徴量生成


In [6]:
from penguinml.utils.contena import FeatureContena

features = FeatureContena()
basic_features = [
    "vEgo",
    "aEgo",
    "steeringAngleDeg",
    "steeringTorque",
    "brake",
    "brakePressed",
    "gas",
    "gasPressed",
    "leftBlinker",
    "rightBlinker",
    "offset",
]
for c in basic_features:
    train = train.with_columns(pl.col(c).cast(pl.Float32))
features.add_num_features(basic_features)
features.add_cat_features(["gearShifter"])

In [None]:
# 前後のフレームとのラグ特徴量
train = train.sort(["sceneID", "offset"])
for c in basic_features:
    for diff in [-1, 1]:
        train = train.with_columns(
            pl.col(c).diff(n=diff).over("sceneID").alias(f"{c}_diff_{diff}"),
            pl.col(c).diff(n=diff).over("sceneID").alias(f"{c}_shift_{diff}"),
        )
        features.add_num_features([f"{c}_diff_{diff}", f"{c}_shift_{diff}"])

In [8]:
# train = train.with_columns(
#     (pl.col("vEgo") / pl.col("aEgo")).alias("vEgo/aEgo"),
# )
# features.add_num_features(["vEgo/aEgo"])

## ターゲット列を分解


In [9]:
from const import TARGET_COLS

train = (
    train.unpivot(index="ID", on=TARGET_COLS, variable_name="target_name", value_name="target")
    .join(
        train.drop(TARGET_COLS),
        on="ID",
        how="left",
    )
    .with_columns(
        (
            pl.col("target_name").map_elements(lambda x: float(x.split("_")[1]), return_dtype=pl.Float32) * 0.5 + 0.5
        ).alias("dt"),
        pl.col("target_name").map_elements(lambda x: x.split("_")[0], return_dtype=str).alias("xyz"),
    )
    .with_columns(
        pl.col("target_name").alias("target_name_original"),
    )
)
features.add_cat_features(["target_name", "xyz"])
train.head(1)

ID,target_name,target,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,sceneID,offset,vEgo_diff_-1,vEgo_shift_-1,vEgo_diff_1,vEgo_shift_1,aEgo_diff_-1,aEgo_shift_-1,aEgo_diff_1,aEgo_shift_1,steeringAngleDeg_diff_-1,steeringAngleDeg_shift_-1,steeringAngleDeg_diff_1,steeringAngleDeg_shift_1,steeringTorque_diff_-1,steeringTorque_shift_-1,steeringTorque_diff_1,steeringTorque_shift_1,brake_diff_-1,brake_shift_-1,brake_diff_1,brake_shift_1,brakePressed_diff_-1,brakePressed_shift_-1,brakePressed_diff_1,brakePressed_shift_1,gas_diff_-1,gas_shift_-1,gas_diff_1,gas_shift_1,gasPressed_diff_-1,gasPressed_shift_-1,gasPressed_diff_1,gasPressed_shift_1,leftBlinker_diff_-1,leftBlinker_shift_-1,leftBlinker_diff_1,leftBlinker_shift_1,rightBlinker_diff_-1,rightBlinker_shift_-1,rightBlinker_diff_1,rightBlinker_shift_1,offset_diff_-1,offset_shift_-1,offset_diff_1,offset_shift_1,dt,xyz,target_name_original
str,str,f64,f32,f32,f32,f32,f32,f32,f32,f32,str,f32,f32,str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,str,str
"""00066be8e20318869c38c66be46663…","""x_0""",2.82959,5.701526,1.538456,-2.165777,-139.0,0.0,0.0,0.25,1.0,"""drive""",0.0,0.0,"""00066be8e20318869c38c66be46663…",320.0,-5.474767,-5.474767,,,1.258575,1.258575,,,9.45992,9.45992,,,-95.0,-95.0,,,0.0,0.0,,,0.0,0.0,,,0.25,0.25,,,1.0,1.0,,,0.0,0.0,,,-1.0,-1.0,,,-100.0,-100.0,,,0.5,"""x""","""x_0"""


In [10]:
# dt秒後の特徴
train = train.with_columns(
    # vt
    (pl.col("vEgo") * pl.col("dt").cast(pl.Float32)).alias("linear_movement@dt"),
    # vt + 0.5at^2
    ((pl.col("vEgo") + 0.5 * pl.col("aEgo") * pl.col("dt").cast(pl.Float32) ** 2).alias("movement@dt")),
    # v + at
    (pl.col("vEgo") + pl.col("aEgo") * pl.col("dt").cast(pl.Float32)).alias("velocity@dt"),
    # # cos
    # (pl.col("steeringAngleDeg").map_elements(lambda x: np.cos(np.deg2rad(x)), return_dtype=pl.Float32)).alias("cos"),
    # # sin
    # (pl.col("steeringAngleDeg").map_elements(lambda x: np.sin(np.deg2rad(x)), return_dtype=pl.Float32)).alias("sin"),
).with_columns(
    # (pl.col("movement@dt") * pl.col("cos")).alias("movement@dt*cos"),
    # (pl.col("movement@dt") * pl.col("sin")).alias("movement@dt*sin"),
    # (pl.col("linear_movement@dt") * pl.col("cos")).alias("linear_movement@dt*cos"),
    # (pl.col("linear_movement@dt") * pl.col("sin")).alias("linear_movement@dt*sin"),
    # (pl.col("velocity@dt") * pl.col("cos")).alias("velocity@dt*cos"),
    # (pl.col("velocity@dt") * pl.col("sin")).alias("velocity@dt*sin"),
)

features.add_num_features(
    [
        "linear_movement@dt",
        "movement@dt",
        "velocity@dt",
        # "cos",
        # "sin",
        # "movement@dt*cos",
        # "movement@dt*sin",
        # "linear_movement@dt*cos",
        # "linear_movement@dt*sin",
        # "velocity@dt*cos",
        # "velocity@dt*sin",
    ]
)

In [11]:
# シーン内の集約特徴量
for c in features.num_features():
    train = train.with_columns(
        pl.col(c).mean().over("sceneID").alias(f"{c}_mean"),
        pl.col(c).std().over("sceneID").alias(f"{c}_std"),
        pl.col(c).max().over("sceneID").alias(f"{c}_max"),
        pl.col(c).min().over("sceneID").alias(f"{c}_min"),
    )
    features.add_num_features([f"{c}_mean", f"{c}_std", f"{c}_max", f"{c}_min"])

In [12]:
from const import CATEGORY_MAPPING

for c in features.num_features():
    train = train.with_columns(pl.col(c).cast(pl.Float32))

for c in features.cat_features():
    mapping = CATEGORY_MAPPING[c]
    train = train.with_columns(pl.col(c).replace_strict(mapping).cast(pl.Int32))

## CV Split


In [13]:
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"])
train = train.join(train_folds, on="sceneID", how="left")
assert train["fold"].null_count() == 0

## Training


In [14]:
from penguinml.gbdt.xgboost import fit_xgb, inference_xgb
from tqdm import tqdm

dfs = []
for c in tqdm(sorted(TARGET_COLS, key=lambda x: x.split("_")[1], reverse=True)):
    print("==" * 10 + c + "==" * 10)
    this_df = train.filter(pl.col("target_name_original") == c)
    oof, models = fit_xgb(
        data=this_df,
        features=features,
        params=CFG[MODEL_NAME]["params"],
        target_col="target",
        fold_col="fold",
        target_type="regression",
        verbose=50000,
    )
    this_df = this_df.with_columns(pl.Series("oof", oof))
    dfs.append(this_df)
train = pl.concat(dfs)

  0%|          | 0/18 [00:00<?, ?it/s]

== fold 0 ==
[0]	validation_0-mae:17.66778
[1895]	validation_0-mae:0.70401
== fold 1 ==
[0]	validation_0-mae:17.69517
[5535]	validation_0-mae:0.73028
== fold 2 ==
[0]	validation_0-mae:17.90453
[2608]	validation_0-mae:0.71290
== fold 3 ==
[0]	validation_0-mae:17.77304
[3810]	validation_0-mae:0.70447
== fold 4 ==
[0]	validation_0-mae:17.38742
[2172]	validation_0-mae:0.71601


  6%|▌         | 1/18 [01:42<29:05, 102.66s/it]

== fold 0 ==
[0]	validation_0-mae:0.99735
[2055]	validation_0-mae:0.46758
== fold 1 ==
[0]	validation_0-mae:1.07295
[1731]	validation_0-mae:0.48668
== fold 2 ==
[0]	validation_0-mae:1.06187
[1978]	validation_0-mae:0.48810
== fold 3 ==
[0]	validation_0-mae:1.00575
[2095]	validation_0-mae:0.48441
== fold 4 ==
[0]	validation_0-mae:1.09593
[2027]	validation_0-mae:0.50476


 11%|█         | 2/18 [02:45<21:09, 79.35s/it] 

== fold 0 ==
[0]	validation_0-mae:0.18287
[596]	validation_0-mae:0.17701
== fold 1 ==
[0]	validation_0-mae:0.18177
[686]	validation_0-mae:0.17385
== fold 2 ==
[0]	validation_0-mae:0.18548
[576]	validation_0-mae:0.17827
== fold 3 ==
[0]	validation_0-mae:0.18892
[466]	validation_0-mae:0.18261
== fold 4 ==
[0]	validation_0-mae:0.18483
[771]	validation_0-mae:0.17858


 17%|█▋        | 3/18 [03:07<13:18, 53.23s/it]

== fold 0 ==
[0]	validation_0-mae:14.67618
[2323]	validation_0-mae:0.52349
== fold 1 ==
[0]	validation_0-mae:14.69742
[1592]	validation_0-mae:0.54280
== fold 2 ==
[0]	validation_0-mae:14.86726
[1239]	validation_0-mae:0.53287
== fold 3 ==
[0]	validation_0-mae:14.76400
[3341]	validation_0-mae:0.52501
== fold 4 ==
[0]	validation_0-mae:14.43963
[2350]	validation_0-mae:0.53366


 22%|██▏       | 4/18 [04:20<14:11, 60.80s/it]

== fold 0 ==
[0]	validation_0-mae:0.72769
[1552]	validation_0-mae:0.32610
== fold 1 ==
[0]	validation_0-mae:0.77965
[3120]	validation_0-mae:0.33300
== fold 2 ==
[0]	validation_0-mae:0.77476
[1400]	validation_0-mae:0.33933
== fold 3 ==
[0]	validation_0-mae:0.72906
[2408]	validation_0-mae:0.33388
== fold 4 ==
[0]	validation_0-mae:0.79535
[1676]	validation_0-mae:0.34648


 28%|██▊       | 5/18 [05:29<13:52, 64.01s/it]

== fold 0 ==
[0]	validation_0-mae:0.14811
[691]	validation_0-mae:0.14367
== fold 1 ==
[0]	validation_0-mae:0.14714
[674]	validation_0-mae:0.14117
== fold 2 ==
[0]	validation_0-mae:0.15075
[873]	validation_0-mae:0.14517
== fold 3 ==
[0]	validation_0-mae:0.15399
[677]	validation_0-mae:0.14911
== fold 4 ==
[0]	validation_0-mae:0.15034
[805]	validation_0-mae:0.14519


 33%|███▎      | 6/18 [05:57<10:18, 51.56s/it]

== fold 0 ==
[0]	validation_0-mae:11.68096
[2559]	validation_0-mae:0.36533
== fold 1 ==
[0]	validation_0-mae:11.69995
[2397]	validation_0-mae:0.37446
== fold 2 ==
[0]	validation_0-mae:11.83326
[1981]	validation_0-mae:0.36907
== fold 3 ==
[0]	validation_0-mae:11.75288
[3807]	validation_0-mae:0.36548
== fold 4 ==
[0]	validation_0-mae:11.49014
[2062]	validation_0-mae:0.37295


 39%|███▉      | 7/18 [07:20<11:20, 61.89s/it]

== fold 0 ==
[0]	validation_0-mae:0.49661
[2010]	validation_0-mae:0.21205
== fold 1 ==
[0]	validation_0-mae:0.52882
[1683]	validation_0-mae:0.21776
== fold 2 ==
[0]	validation_0-mae:0.52847
[1548]	validation_0-mae:0.22016
== fold 3 ==
[0]	validation_0-mae:0.49444
[1473]	validation_0-mae:0.21790
== fold 4 ==
[0]	validation_0-mae:0.54011
[1757]	validation_0-mae:0.22390


 44%|████▍     | 8/18 [08:16<09:59, 59.96s/it]

== fold 0 ==
[0]	validation_0-mae:0.11533
[460]	validation_0-mae:0.11209
== fold 1 ==
[0]	validation_0-mae:0.11418
[521]	validation_0-mae:0.10985
== fold 2 ==
[0]	validation_0-mae:0.11767
[474]	validation_0-mae:0.11366
== fold 3 ==
[0]	validation_0-mae:0.12030
[641]	validation_0-mae:0.11651
== fold 4 ==
[0]	validation_0-mae:0.11713
[362]	validation_0-mae:0.11371


 50%|█████     | 9/18 [08:35<07:03, 47.11s/it]

== fold 0 ==
[0]	validation_0-mae:8.68598
[1667]	validation_0-mae:0.23298
== fold 1 ==
[0]	validation_0-mae:8.70024
[2581]	validation_0-mae:0.23865
== fold 2 ==
[0]	validation_0-mae:8.79932
[3435]	validation_0-mae:0.23523
== fold 3 ==
[0]	validation_0-mae:8.74253
[2187]	validation_0-mae:0.23697
== fold 4 ==
[0]	validation_0-mae:8.54056
[1725]	validation_0-mae:0.23952


 56%|█████▌    | 10/18 [09:50<07:25, 55.70s/it]

== fold 0 ==
[0]	validation_0-mae:0.30712
[1872]	validation_0-mae:0.12904
== fold 1 ==
[0]	validation_0-mae:0.32426
[2012]	validation_0-mae:0.13055
== fold 2 ==
[0]	validation_0-mae:0.32630
[1731]	validation_0-mae:0.13390
== fold 3 ==
[0]	validation_0-mae:0.30386
[1586]	validation_0-mae:0.13224
== fold 4 ==
[0]	validation_0-mae:0.33216
[1800]	validation_0-mae:0.13598


 61%|██████    | 11/18 [10:53<06:45, 57.97s/it]

== fold 0 ==
[0]	validation_0-mae:0.08423
[401]	validation_0-mae:0.08225
== fold 1 ==
[0]	validation_0-mae:0.08306
[597]	validation_0-mae:0.08014
== fold 2 ==
[0]	validation_0-mae:0.08593
[491]	validation_0-mae:0.08332
== fold 3 ==
[0]	validation_0-mae:0.08781
[832]	validation_0-mae:0.08522
== fold 4 ==
[0]	validation_0-mae:0.08491
[670]	validation_0-mae:0.08253


 67%|██████▋   | 12/18 [11:15<04:42, 47.02s/it]

== fold 0 ==
[0]	validation_0-mae:5.69131
[1527]	validation_0-mae:0.13818
== fold 1 ==
[0]	validation_0-mae:5.70207
[1754]	validation_0-mae:0.13845
== fold 2 ==
[0]	validation_0-mae:5.76657
[1291]	validation_0-mae:0.13919
== fold 3 ==
[0]	validation_0-mae:5.72881
[2013]	validation_0-mae:0.13818
== fold 4 ==
[0]	validation_0-mae:5.59519
[2531]	validation_0-mae:0.13929


 72%|███████▏  | 13/18 [12:20<04:22, 52.58s/it]

== fold 0 ==
[0]	validation_0-mae:0.16012
[1495]	validation_0-mae:0.07266
== fold 1 ==
[0]	validation_0-mae:0.16660
[1514]	validation_0-mae:0.07166
== fold 2 ==
[0]	validation_0-mae:0.16936
[1642]	validation_0-mae:0.07423
== fold 3 ==
[0]	validation_0-mae:0.15779
[1650]	validation_0-mae:0.07331
== fold 4 ==
[0]	validation_0-mae:0.17225
[1598]	validation_0-mae:0.07508


 78%|███████▊  | 14/18 [13:12<03:30, 52.51s/it]

== fold 0 ==
[0]	validation_0-mae:0.05454
[358]	validation_0-mae:0.05341
== fold 1 ==
[0]	validation_0-mae:0.05378
[466]	validation_0-mae:0.05214
== fold 2 ==
[0]	validation_0-mae:0.05570
[701]	validation_0-mae:0.05419
== fold 3 ==
[0]	validation_0-mae:0.05644
[1202]	validation_0-mae:0.05489
== fold 4 ==
[0]	validation_0-mae:0.05472
[596]	validation_0-mae:0.05335


 83%|████████▎ | 15/18 [13:37<02:12, 44.02s/it]

== fold 0 ==
[0]	validation_0-mae:2.69628
[1260]	validation_0-mae:0.06355
== fold 1 ==
[0]	validation_0-mae:2.70247
[1331]	validation_0-mae:0.06334
== fold 2 ==
[0]	validation_0-mae:2.73223
[1649]	validation_0-mae:0.06575
== fold 3 ==
[0]	validation_0-mae:2.71458
[1776]	validation_0-mae:0.06487
== fold 4 ==
[0]	validation_0-mae:2.65165
[1849]	validation_0-mae:0.06465


 89%|████████▉ | 16/18 [14:28<01:32, 46.10s/it]

== fold 0 ==
[0]	validation_0-mae:0.05734
[1563]	validation_0-mae:0.03171
== fold 1 ==
[0]	validation_0-mae:0.05896
[786]	validation_0-mae:0.03160
== fold 2 ==
[0]	validation_0-mae:0.06033
[1310]	validation_0-mae:0.03235
== fold 3 ==
[0]	validation_0-mae:0.05630
[1614]	validation_0-mae:0.03187
== fold 4 ==
[0]	validation_0-mae:0.06149
[1337]	validation_0-mae:0.03315


 94%|█████████▍| 17/18 [15:13<00:45, 45.89s/it]

== fold 0 ==
[0]	validation_0-mae:0.02615
[401]	validation_0-mae:0.02559
== fold 1 ==
[0]	validation_0-mae:0.02583
[475]	validation_0-mae:0.02514
== fold 2 ==
[0]	validation_0-mae:0.02682
[376]	validation_0-mae:0.02618
== fold 3 ==
[0]	validation_0-mae:0.02685
[683]	validation_0-mae:0.02625
== fold 4 ==
[0]	validation_0-mae:0.02609
[454]	validation_0-mae:0.02551


100%|██████████| 18/18 [15:31<00:00, 51.73s/it]


In [15]:
mae = np.abs(train["oof"] - train["target"]).mean()
print(f"MAE: {mae}")

MAE: 0.21828784498770645


In [16]:
rev_dict = {v: "oof_" + k for k, v in CATEGORY_MAPPING["target_name"].items()}
oof_df = (
    train.select(["ID", "target_name", "oof"])
    .with_columns(pl.col("target_name").replace_strict(rev_dict))
    .pivot(index="ID", columns="target_name", values="oof")
)
oof_df.head()

ID,oof_x_5,oof_y_5,oof_z_5,oof_x_4,oof_y_4,oof_z_4,oof_x_3,oof_y_3,oof_z_3,oof_x_2,oof_y_2,oof_z_2,oof_x_1,oof_y_1,oof_z_1,oof_x_0,oof_y_0,oof_z_0
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""00066be8e20318869c38c66be46663…",21.870636,-0.40243,0.140438,18.210474,-0.282908,0.133933,13.613016,-0.100169,0.067414,9.795164,-0.071299,0.052183,5.879181,-0.033015,0.023066,2.645556,-0.002747,0.016497
"""00066be8e20318869c38c66be46663…",31.578716,-0.569545,0.12726,26.52356,-0.576579,0.152131,21.183722,-0.427999,0.091575,15.8291,-0.386211,0.053785,10.585698,-0.235717,0.036184,5.036321,-0.069186,0.023402
"""00066be8e20318869c38c66be46663…",31.899765,-0.341625,0.100391,26.644663,-0.390511,0.1468,21.071726,-0.178021,0.078273,15.45903,-0.147338,0.039608,10.14452,-0.074818,0.026445,4.71555,-0.023173,0.013426
"""000fb056f97572d384bae4f5fc1e0f…",14.755453,-0.692768,0.056329,11.750469,-0.56795,0.045361,9.07926,-0.342746,0.023428,6.328568,-0.299098,0.016049,3.768057,-0.12705,0.009206,1.601007,-0.049232,0.003031
"""000fb056f97572d384bae4f5fc1e0f…",17.830925,1.074196,-0.000891,14.914308,0.789287,-0.007057,11.813318,0.491941,-0.000635,8.814574,0.264492,0.002926,5.745161,0.107982,0.008025,2.719719,0.020252,0.001921


In [17]:
oof_df.write_csv(os.path.join(CFG["output_dir"], "oof.csv"))