In [1]:
!rm -r /kaggle/working/*
%cd /kaggle/working

/kaggle/working


In [2]:
import os
import sys

PACKAGE_DIR = "/kaggle/src"
sys.path.append(PACKAGE_DIR)
sys.path.append(os.path.join(PACKAGE_DIR, "Penguin-ML-Library"))

In [3]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

MODEL_NAME = "xgboost"
CFG = yaml.safe_load(open(os.path.join(PACKAGE_DIR, "config.yaml"), "r"))
print(CFG[MODEL_NAME]["execution"]["exp_id"])
CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

init_logger(f"{ CFG[MODEL_NAME]['execution']['exp_id']}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-16 09:00:26.884713: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-16 09:00:26.911527: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_004


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import warnings

import numpy as np
import polars as pl

warnings.filterwarnings("ignore")

In [5]:
train = pl.read_csv(os.path.join(CFG["dataset"]["competition_dir"], "train_features.csv"))
train = (
    train.with_columns(
        pl.col("ID").str.split_exact("_", n=1).struct.rename_fields(["sceneID", "offset"]).alias("fields")
    )
    .unnest("fields")
    .with_columns(pl.col("offset").cast(pl.Float32))
)
print(train.shape)
train.head(1)

(43371, 32)


ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,sceneID,offset
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f32
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320.0


## 特徴量生成


In [6]:
from penguinml.utils.contena import FeatureContena

features = FeatureContena()
features.add_num_features(
    [
        "vEgo",
        "aEgo",
        "steeringAngleDeg",
        "steeringTorque",
        "brake",
        "brakePressed",
        "gas",
        "gasPressed",
        "leftBlinker",
        "rightBlinker",
        "offset",
    ]
)
features.add_cat_features(["gearShifter"])

In [7]:
# train = train.with_columns(
#     (pl.col("vEgo") / pl.col("aEgo")).alias("vEgo/aEgo"),
# )
# features.add_num_features(["vEgo/aEgo"])

## ターゲット列を分解


In [8]:
from const import TARGET_COLS

train = (
    train.unpivot(index="ID", on=TARGET_COLS, variable_name="target_name", value_name="target")
    .join(
        train.drop(TARGET_COLS),
        on="ID",
        how="left",
    )
    .with_columns(
        (
            pl.col("target_name").map_elements(lambda x: float(x.split("_")[1]), return_dtype=pl.Float32) * 0.5 + 0.5
        ).alias("dt"),
        pl.col("target_name").map_elements(lambda x: x.split("_")[0], return_dtype=str).alias("xyz"),
    )
    .with_columns(
        pl.col("target_name").alias("target_name_original"),
    )
)
features.add_cat_features(["target_name", "xyz"])
train.head(1)

ID,target_name,target,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,sceneID,offset,dt,xyz,target_name_original
str,str,f64,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,str,f32,f32,str,str
"""00066be8e20318869c38c66be46663…","""x_0""",2.82959,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,"""00066be8e20318869c38c66be46663…",320.0,0.5,"""x""","""x_0"""


In [9]:
# dt秒後の特徴
train = train.with_columns(
    # vt
    (pl.col("vEgo") * pl.col("dt").cast(pl.Float32)).alias("linear_movement@dt"),
    # vt + 0.5at^2
    ((pl.col("vEgo") + 0.5 * pl.col("aEgo") * pl.col("dt").cast(pl.Float32) ** 2).alias("movement@dt")),
    # v + at
    (pl.col("vEgo") + pl.col("aEgo") * pl.col("dt").cast(pl.Float32)).alias("velocity@dt"),
    # # cos
    # (pl.col("steeringAngleDeg").map_elements(lambda x: np.cos(np.deg2rad(x)), return_dtype=pl.Float32)).alias("cos"),
    # # sin
    # (pl.col("steeringAngleDeg").map_elements(lambda x: np.sin(np.deg2rad(x)), return_dtype=pl.Float32)).alias("sin"),
).with_columns(
    # (pl.col("movement@dt") * pl.col("cos")).alias("movement@dt*cos"),
    # (pl.col("movement@dt") * pl.col("sin")).alias("movement@dt*sin"),
    # (pl.col("linear_movement@dt") * pl.col("cos")).alias("linear_movement@dt*cos"),
    # (pl.col("linear_movement@dt") * pl.col("sin")).alias("linear_movement@dt*sin"),
    # (pl.col("velocity@dt") * pl.col("cos")).alias("velocity@dt*cos"),
    # (pl.col("velocity@dt") * pl.col("sin")).alias("velocity@dt*sin"),
)

features.add_num_features(
    [
        "linear_movement@dt",
        "movement@dt",
        "velocity@dt",
        # "cos",
        # "sin",
        # "movement@dt*cos",
        # "movement@dt*sin",
        # "linear_movement@dt*cos",
        # "linear_movement@dt*sin",
        # "velocity@dt*cos",
        # "velocity@dt*sin",
    ]
)

In [10]:
# シーン内の集約特徴量
for c in features.num_features():
    train = train.with_columns(
        pl.col(c).mean().over("sceneID").alias(f"{c}_mean"),
        pl.col(c).std().over("sceneID").alias(f"{c}_std"),
        pl.col(c).max().over("sceneID").alias(f"{c}_max"),
        pl.col(c).min().over("sceneID").alias(f"{c}_min"),
    )
    features.add_num_features([f"{c}_mean", f"{c}_std", f"{c}_max", f"{c}_min"])

In [11]:
from const import CATEGORY_MAPPING

for c in features.num_features():
    train = train.with_columns(pl.col(c).cast(pl.Float32))

for c in features.cat_features():
    mapping = CATEGORY_MAPPING[c]
    train = train.with_columns(pl.col(c).replace_strict(mapping).cast(pl.Int32))

## CV Split


In [12]:
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"])
train = train.join(train_folds, on="sceneID", how="left")
assert train["fold"].null_count() == 0

## Training


In [13]:
from penguinml.gbdt.xgboost import fit_xgb, inference_xgb
from tqdm import tqdm

dfs = []
for c in tqdm(sorted(TARGET_COLS, key=lambda x: x.split("_")[1], reverse=True)):
    print("==" * 10 + c + "==" * 10)
    this_df = train.filter(pl.col("target_name_original") == c)
    oof, models = fit_xgb(
        data=this_df,
        features=features,
        params=CFG[MODEL_NAME]["params"],
        target_col="target",
        fold_col="fold",
        target_type="regression",
        verbose=50000,
    )
    this_df = this_df.with_columns(pl.Series("oof", oof))
    dfs.append(this_df)
train = pl.concat(dfs)

  0%|          | 0/18 [00:00<?, ?it/s]

== fold 0 ==
[0]	validation_0-mae:17.67537
[3691]	validation_0-mae:0.81995
== fold 1 ==
[0]	validation_0-mae:17.69783
[2773]	validation_0-mae:0.84933
== fold 2 ==
[0]	validation_0-mae:17.91073
[3519]	validation_0-mae:0.84069
== fold 3 ==
[0]	validation_0-mae:17.77506
[1654]	validation_0-mae:0.83085
== fold 4 ==
[0]	validation_0-mae:17.39640
[4027]	validation_0-mae:0.84291


  6%|▌         | 1/18 [01:11<20:10, 71.19s/it]

== fold 0 ==
[0]	validation_0-mae:0.99782
[1683]	validation_0-mae:0.47558
== fold 1 ==
[0]	validation_0-mae:1.07299
[1600]	validation_0-mae:0.49444
== fold 2 ==
[0]	validation_0-mae:1.06145
[1587]	validation_0-mae:0.49344
== fold 3 ==
[0]	validation_0-mae:1.00633
[2050]	validation_0-mae:0.48799
== fold 4 ==
[0]	validation_0-mae:1.09491
[2210]	validation_0-mae:0.50344


 11%|█         | 2/18 [01:55<14:43, 55.21s/it]

== fold 0 ==
[0]	validation_0-mae:0.18290
[383]	validation_0-mae:0.17759
== fold 1 ==
[0]	validation_0-mae:0.18177
[456]	validation_0-mae:0.17505
== fold 2 ==
[0]	validation_0-mae:0.18544
[448]	validation_0-mae:0.17912
== fold 3 ==
[0]	validation_0-mae:0.18894
[432]	validation_0-mae:0.18332
== fold 4 ==
[0]	validation_0-mae:0.18484
[380]	validation_0-mae:0.17905


 17%|█▋        | 3/18 [02:06<08:49, 35.28s/it]

== fold 0 ==
[0]	validation_0-mae:14.68608
[4487]	validation_0-mae:0.57647
== fold 1 ==
[0]	validation_0-mae:14.70070
[3982]	validation_0-mae:0.59144
== fold 2 ==
[0]	validation_0-mae:14.87083
[3169]	validation_0-mae:0.59043
== fold 3 ==
[0]	validation_0-mae:14.76740
[3249]	validation_0-mae:0.58265
== fold 4 ==
[0]	validation_0-mae:14.44664
[4408]	validation_0-mae:0.59365


 22%|██▏       | 4/18 [03:35<13:09, 56.38s/it]

== fold 0 ==
[0]	validation_0-mae:0.72713
[1828]	validation_0-mae:0.32710
== fold 1 ==
[0]	validation_0-mae:0.77953
[1446]	validation_0-mae:0.33787
== fold 2 ==
[0]	validation_0-mae:0.77445
[1281]	validation_0-mae:0.33937
== fold 3 ==
[0]	validation_0-mae:0.72922
[1941]	validation_0-mae:0.33529
== fold 4 ==
[0]	validation_0-mae:0.79549
[1830]	validation_0-mae:0.34636


 28%|██▊       | 5/18 [04:15<10:55, 50.41s/it]

== fold 0 ==
[0]	validation_0-mae:0.14814
[454]	validation_0-mae:0.14429
== fold 1 ==
[0]	validation_0-mae:0.14715
[541]	validation_0-mae:0.14203
== fold 2 ==
[0]	validation_0-mae:0.15076
[468]	validation_0-mae:0.14589
== fold 3 ==
[0]	validation_0-mae:0.15401
[839]	validation_0-mae:0.14947
== fold 4 ==
[0]	validation_0-mae:0.15034
[412]	validation_0-mae:0.14609


 33%|███▎      | 6/18 [04:29<07:37, 38.15s/it]

== fold 0 ==
[0]	validation_0-mae:11.69217
[1768]	validation_0-mae:0.38390
== fold 1 ==
[0]	validation_0-mae:11.70499
[3265]	validation_0-mae:0.39474
== fold 2 ==
[0]	validation_0-mae:11.84051
[3307]	validation_0-mae:0.38953
== fold 3 ==
[0]	validation_0-mae:11.76079
[5185]	validation_0-mae:0.38773
== fold 4 ==
[0]	validation_0-mae:11.49621
[2864]	validation_0-mae:0.39457


 39%|███▉      | 7/18 [05:44<09:13, 50.28s/it]

== fold 0 ==
[0]	validation_0-mae:0.49677
[1686]	validation_0-mae:0.21280
== fold 1 ==
[0]	validation_0-mae:0.52904
[1700]	validation_0-mae:0.21655
== fold 2 ==
[0]	validation_0-mae:0.52863
[2222]	validation_0-mae:0.21948
== fold 3 ==
[0]	validation_0-mae:0.49479
[1387]	validation_0-mae:0.21776
== fold 4 ==
[0]	validation_0-mae:0.54055
[2357]	validation_0-mae:0.22226


 44%|████▍     | 8/18 [06:29<08:05, 48.60s/it]

== fold 0 ==
[0]	validation_0-mae:0.11534
[365]	validation_0-mae:0.11256
== fold 1 ==
[0]	validation_0-mae:0.11421
[582]	validation_0-mae:0.11063
== fold 2 ==
[0]	validation_0-mae:0.11769
[399]	validation_0-mae:0.11458
== fold 3 ==
[0]	validation_0-mae:0.12032
[667]	validation_0-mae:0.11696
== fold 4 ==
[0]	validation_0-mae:0.11716
[816]	validation_0-mae:0.11385


 50%|█████     | 9/18 [06:44<05:42, 38.03s/it]

== fold 0 ==
[0]	validation_0-mae:8.69555
[1769]	validation_0-mae:0.23963
== fold 1 ==
[0]	validation_0-mae:8.70621
[4899]	validation_0-mae:0.24302
== fold 2 ==
[0]	validation_0-mae:8.80491
[4200]	validation_0-mae:0.24154
== fold 3 ==
[0]	validation_0-mae:8.74942
[4177]	validation_0-mae:0.24044
== fold 4 ==
[0]	validation_0-mae:8.54734
[3761]	validation_0-mae:0.24661


 56%|█████▌    | 10/18 [08:10<07:02, 52.86s/it]

== fold 0 ==
[0]	validation_0-mae:0.30753
[1388]	validation_0-mae:0.12966
== fold 1 ==
[0]	validation_0-mae:0.32486
[1918]	validation_0-mae:0.12937
== fold 2 ==
[0]	validation_0-mae:0.32624
[1886]	validation_0-mae:0.13362
== fold 3 ==
[0]	validation_0-mae:0.30421
[1360]	validation_0-mae:0.13124
== fold 4 ==
[0]	validation_0-mae:0.33226
[2333]	validation_0-mae:0.13400


 61%|██████    | 11/18 [08:53<05:48, 49.72s/it]

== fold 0 ==
[0]	validation_0-mae:0.08426
[417]	validation_0-mae:0.08258
== fold 1 ==
[0]	validation_0-mae:0.08308
[423]	validation_0-mae:0.08088
== fold 2 ==
[0]	validation_0-mae:0.08595
[383]	validation_0-mae:0.08387
== fold 3 ==
[0]	validation_0-mae:0.08784
[660]	validation_0-mae:0.08569
== fold 4 ==
[0]	validation_0-mae:0.08493
[759]	validation_0-mae:0.08277


 67%|██████▋   | 12/18 [09:07<03:52, 38.78s/it]

== fold 0 ==
[0]	validation_0-mae:5.69810
[1636]	validation_0-mae:0.14098
== fold 1 ==
[0]	validation_0-mae:5.70668
[2564]	validation_0-mae:0.14095
== fold 2 ==
[0]	validation_0-mae:5.77049
[5439]	validation_0-mae:0.14185
== fold 3 ==
[0]	validation_0-mae:5.73449
[3527]	validation_0-mae:0.13946
== fold 4 ==
[0]	validation_0-mae:5.60044
[2799]	validation_0-mae:0.14164


 72%|███████▏  | 13/18 [10:19<04:05, 49.01s/it]

== fold 0 ==
[0]	validation_0-mae:0.16069
[1181]	validation_0-mae:0.07308
== fold 1 ==
[0]	validation_0-mae:0.16734
[1926]	validation_0-mae:0.07110
== fold 2 ==
[0]	validation_0-mae:0.17003
[1178]	validation_0-mae:0.07459
== fold 3 ==
[0]	validation_0-mae:0.15821
[1467]	validation_0-mae:0.07302
== fold 4 ==
[0]	validation_0-mae:0.17257
[1939]	validation_0-mae:0.07461


 78%|███████▊  | 14/18 [10:57<03:02, 45.50s/it]

== fold 0 ==
[0]	validation_0-mae:0.05456
[316]	validation_0-mae:0.05368
== fold 1 ==
[0]	validation_0-mae:0.05379
[431]	validation_0-mae:0.05269
== fold 2 ==
[0]	validation_0-mae:0.05571
[400]	validation_0-mae:0.05446
== fold 3 ==
[0]	validation_0-mae:0.05645
[1064]	validation_0-mae:0.05522
== fold 4 ==
[0]	validation_0-mae:0.05473
[395]	validation_0-mae:0.05366


 83%|████████▎ | 15/18 [11:10<01:47, 35.85s/it]

== fold 0 ==
[0]	validation_0-mae:2.69955
[1630]	validation_0-mae:0.06531
== fold 1 ==
[0]	validation_0-mae:2.70526
[1607]	validation_0-mae:0.06434
== fold 2 ==
[0]	validation_0-mae:2.73485
[1899]	validation_0-mae:0.06698
== fold 3 ==
[0]	validation_0-mae:2.71804
[1899]	validation_0-mae:0.06543
== fold 4 ==
[0]	validation_0-mae:2.65464
[1431]	validation_0-mae:0.06587


 89%|████████▉ | 16/18 [11:50<01:13, 36.99s/it]

== fold 0 ==
[0]	validation_0-mae:0.05748
[1089]	validation_0-mae:0.03220
== fold 1 ==
[0]	validation_0-mae:0.05904
[842]	validation_0-mae:0.03153
== fold 2 ==
[0]	validation_0-mae:0.06048
[1011]	validation_0-mae:0.03266
== fold 3 ==
[0]	validation_0-mae:0.05639
[1114]	validation_0-mae:0.03184
== fold 4 ==
[0]	validation_0-mae:0.06161
[905]	validation_0-mae:0.03332


 94%|█████████▍| 17/18 [12:15<00:33, 33.57s/it]

== fold 0 ==
[0]	validation_0-mae:0.02615
[369]	validation_0-mae:0.02576
== fold 1 ==
[0]	validation_0-mae:0.02584
[365]	validation_0-mae:0.02544
== fold 2 ==
[0]	validation_0-mae:0.02683
[405]	validation_0-mae:0.02637
== fold 3 ==
[0]	validation_0-mae:0.02685
[446]	validation_0-mae:0.02646
== fold 4 ==
[0]	validation_0-mae:0.02610
[334]	validation_0-mae:0.02565


100%|██████████| 18/18 [12:26<00:00, 41.46s/it]


In [14]:
mae = np.abs(train["oof"] - train["target"]).mean()
print(f"MAE: {mae}")

MAE: 0.23028796363219692


In [15]:
rev_dict = {v: "oof_" + k for k, v in CATEGORY_MAPPING["target_name"].items()}
oof_df = (
    train.select(["ID", "target_name", "oof"])
    .with_columns(pl.col("target_name").replace_strict(rev_dict))
    .pivot(index="ID", columns="target_name", values="oof")
)
oof_df.head()

ID,oof_x_5,oof_y_5,oof_z_5,oof_x_4,oof_y_4,oof_z_4,oof_x_3,oof_y_3,oof_z_3,oof_x_2,oof_y_2,oof_z_2,oof_x_1,oof_y_1,oof_z_1,oof_x_0,oof_y_0,oof_z_0
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""00066be8e20318869c38c66be46663…",21.455189,-0.437528,0.127063,17.308624,-0.370154,0.085177,13.653478,-0.18745,0.047936,9.444977,-0.097,0.039624,5.862541,-0.042287,0.019358,2.66032,-0.009087,0.0112
"""00066be8e20318869c38c66be46663…",31.392342,-0.869482,0.064034,26.310242,-0.679434,0.088512,21.37081,-0.532647,0.08411,15.887778,-0.414633,0.055777,10.514678,-0.233034,0.035776,5.035405,-0.064468,0.01415
"""00066be8e20318869c38c66be46663…",31.893902,-0.479848,0.117592,26.509876,-0.386165,0.093717,20.977812,-0.236764,0.09642,15.277604,-0.138827,0.053336,10.118437,-0.061507,0.035542,4.717708,-0.016422,0.014091
"""000fb056f97572d384bae4f5fc1e0f…",16.974367,1.198038,-0.007766,14.297729,0.774844,-0.007069,11.57066,0.511329,-0.003318,8.653085,0.271812,-0.007948,5.767129,0.12537,-0.001532,2.725299,0.020627,-0.000482
"""000fb056f97572d384bae4f5fc1e0f…",15.56218,-0.60901,0.05184,11.542909,-0.437678,0.041353,8.824244,-0.426306,0.029741,6.19996,-0.210795,0.003604,3.841709,-0.104585,0.002361,1.598209,-0.039609,0.00284


In [16]:
oof_df.write_csv(os.path.join(CFG["output_dir"], "oof.csv"))