In [12]:
!rm -r /kaggle/working/*
%cd /kaggle/working

/kaggle/working


In [13]:
import os
import sys

PACKAGE_DIR = "/kaggle/src"
sys.path.append(PACKAGE_DIR)
sys.path.append(os.path.join(PACKAGE_DIR, "Penguin-ML-Library"))

In [14]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

MODEL_NAME = "xgboost"
CFG = yaml.safe_load(open(os.path.join(PACKAGE_DIR, "config.yaml"), "r"))
print(CFG[MODEL_NAME]["execution"]["exp_id"])
CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

init_logger(f"{ CFG[MODEL_NAME]['execution']['exp_id']}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

exp_004


set seed: 46


In [15]:
import warnings

import numpy as np
import polars as pl

warnings.filterwarnings("ignore")

In [16]:
train = pl.read_csv(os.path.join(CFG["dataset"]["competition_dir"], "train_features.csv"))
train = (
    train.with_columns(
        pl.col("ID").str.split_exact("_", n=1).struct.rename_fields(["sceneID", "offset"]).alias("fields")
    )
    .unnest("fields")
    .with_columns(pl.col("offset").cast(pl.Float32))
)
print(train.shape)
train.head(1)

(43371, 32)


ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,sceneID,offset
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f32
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320.0


## 特徴量生成


In [17]:
from penguinml.utils.contena import FeatureContena

features = FeatureContena()
features.add_num_features(
    [
        "vEgo",
        "aEgo",
        "steeringAngleDeg",
        "steeringTorque",
        "brake",
        "brakePressed",
        "gas",
        "gasPressed",
        "leftBlinker",
        "rightBlinker",
        "offset",
    ]
)
features.add_cat_features(["gearShifter"])

In [18]:
# train = train.with_columns(
#     (pl.col("vEgo") / pl.col("aEgo")).alias("vEgo/aEgo"),
# )
# features.add_num_features(["vEgo/aEgo"])

## ターゲット列を分解


In [19]:
from const import TARGET_COLS

train = (
    train.unpivot(index="ID", on=TARGET_COLS, variable_name="target_name", value_name="target")
    .join(
        train.drop(TARGET_COLS),
        on="ID",
        how="left",
    )
    .with_columns(
        (
            pl.col("target_name").map_elements(lambda x: float(x.split("_")[1]), return_dtype=pl.Float32) * 0.5 + 0.5
        ).alias("dt"),
        pl.col("target_name").map_elements(lambda x: x.split("_")[0], return_dtype=str).alias("xyz"),
    )
)
features.add_cat_features(["target_name", "xyz"])
train.head(1)

ID,target_name,target,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,sceneID,offset,dt,xyz
str,str,f64,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,str,f32,f32,str
"""00066be8e20318869c38c66be46663…","""x_0""",2.82959,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,"""00066be8e20318869c38c66be46663…",320.0,0.5,"""x"""


In [20]:
# dt秒後の特徴
train = train.with_columns(
    # vt
    (pl.col("vEgo") * pl.col("dt").cast(pl.Float32)).alias("linear_movement@dt"),
    # vt + 0.5at^2
    ((pl.col("vEgo") + 0.5 * pl.col("aEgo") * pl.col("dt").cast(pl.Float32) ** 2).alias("movement@dt")),
    # v + at
    (pl.col("vEgo") + pl.col("aEgo") * pl.col("dt").cast(pl.Float32)).alias("velocity@dt"),
    # # cos
    # (pl.col("steeringAngleDeg").map_elements(lambda x: np.cos(np.deg2rad(x)), return_dtype=pl.Float32)).alias("cos"),
    # # sin
    # (pl.col("steeringAngleDeg").map_elements(lambda x: np.sin(np.deg2rad(x)), return_dtype=pl.Float32)).alias("sin"),
).with_columns(
    # (pl.col("movement@dt") * pl.col("cos")).alias("movement@dt*cos"),
    # (pl.col("movement@dt") * pl.col("sin")).alias("movement@dt*sin"),
    # (pl.col("linear_movement@dt") * pl.col("cos")).alias("linear_movement@dt*cos"),
    # (pl.col("linear_movement@dt") * pl.col("sin")).alias("linear_movement@dt*sin"),
    # (pl.col("velocity@dt") * pl.col("cos")).alias("velocity@dt*cos"),
    # (pl.col("velocity@dt") * pl.col("sin")).alias("velocity@dt*sin"),
)

features.add_num_features(
    [
        "linear_movement@dt",
        "movement@dt",
        "velocity@dt",
        # "cos",
        # "sin",
        # "movement@dt*cos",
        # "movement@dt*sin",
        # "linear_movement@dt*cos",
        # "linear_movement@dt*sin",
        # "velocity@dt*cos",
        # "velocity@dt*sin",
    ]
)

In [21]:
# シーン内の集約特徴量
train = train.with_columns(
    pl.col("offset").max().over("sceneID").alias("offset_max"),
    pl.col("offset").min().over("sceneID").alias("offset_min"),
    pl.col("movement@dt").mean().over("sceneID").alias("movement@dt_mean"),
)
features.add_num_features(["offset_max", "offset_min", "movement@dt_mean"])

In [22]:
from const import CATEGORY_MAPPING

for c in features.num_features():
    train = train.with_columns(pl.col(c).cast(pl.Float32))

for c in features.cat_features():
    mapping = CATEGORY_MAPPING[c]
    train = train.with_columns(pl.col(c).replace_strict(mapping).cast(pl.Int32))

## CV Split


In [23]:
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"])
train = train.join(train_folds, on="sceneID", how="left")
assert train["fold"].null_count() == 0

## Training


In [24]:
from penguinml.gbdt.xgboost import fit_xgb, inference_xgb

oof, models = fit_xgb(
    data=train,
    features=features,
    params=CFG[MODEL_NAME]["params"],
    target_col="target",
    fold_col="fold",
    target_type="regression",
    verbose=500,
)

== fold 0 ==
[0]	validation_0-mae:5.29697
[500]	validation_0-mae:0.25255
[1000]	validation_0-mae:0.24417
[1500]	validation_0-mae:0.24074
[2000]	validation_0-mae:0.23887
[2500]	validation_0-mae:0.23744
[3000]	validation_0-mae:0.23631
[3500]	validation_0-mae:0.23568
[4000]	validation_0-mae:0.23514
[4500]	validation_0-mae:0.23489
[5000]	validation_0-mae:0.23459
[5500]	validation_0-mae:0.23447
[6000]	validation_0-mae:0.23437
[6500]	validation_0-mae:0.23428
[7000]	validation_0-mae:0.23415
[7500]	validation_0-mae:0.23411
[8000]	validation_0-mae:0.23406
[8500]	validation_0-mae:0.23401
[9000]	validation_0-mae:0.23398
[9500]	validation_0-mae:0.23393
[10000]	validation_0-mae:0.23386
[10500]	validation_0-mae:0.23382
[11000]	validation_0-mae:0.23380
[11500]	validation_0-mae:0.23376
[12000]	validation_0-mae:0.23374
[12500]	validation_0-mae:0.23373
[13000]	validation_0-mae:0.23372
[13500]	validation_0-mae:0.23369
[13782]	validation_0-mae:0.23368
== fold 1 ==
[0]	validation_0-mae:5.19917
[500]	valida

In [25]:
train = train.with_columns(pl.Series("oof", oof))
mae = np.abs(train["oof"] - train["target"]).mean()
print(f"MAE: {mae}")

MAE: 0.23793598021436901


In [26]:
rev_dict = {v: "oof_" + k for k, v in CATEGORY_MAPPING["target_name"].items()}
oof_df = (
    train.select(["ID", "target_name", "oof"])
    .with_columns(pl.col("target_name").replace_strict(rev_dict))
    .pivot(index="ID", columns="target_name", values="oof")
)
oof_df.head()

ID,oof_x_0,oof_y_0,oof_z_0,oof_x_1,oof_y_1,oof_z_1,oof_x_2,oof_y_2,oof_z_2,oof_x_3,oof_y_3,oof_z_3,oof_x_4,oof_y_4,oof_z_4,oof_x_5,oof_y_5,oof_z_5
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""00066be8e20318869c38c66be46663…",2.610721,0.01667,0.036748,5.754883,-0.000767,0.070798,9.273259,-0.053855,0.086155,13.400752,-0.093539,0.10472,17.349228,-0.124245,0.154924,21.129898,-0.159528,0.194338
"""00066be8e20318869c38c66be46663…",4.948875,-0.109268,0.018639,10.379826,-0.232712,0.02561,16.258934,-0.489957,0.01099,21.829414,-0.671362,0.027392,27.149614,-0.981284,0.015576,32.848,-1.012712,0.066774
"""00066be8e20318869c38c66be46663…",4.719566,-0.013497,0.050126,10.139306,-0.051783,0.079614,15.471722,-0.11778,0.083318,20.711473,-0.164968,0.102695,26.159332,-0.239052,0.10897,31.790152,-0.297961,0.123865
"""000fb056f97572d384bae4f5fc1e0f…",2.774982,0.015447,-0.005652,5.806592,0.145121,0.003463,8.874857,0.343834,-0.012123,11.639286,0.574048,-0.012894,14.452477,0.90752,-0.030548,17.002306,1.135498,-0.028062
"""000fb056f97572d384bae4f5fc1e0f…",1.612542,-0.037148,0.012182,3.61493,-0.138384,0.033436,6.43594,-0.304285,0.058137,9.121244,-0.434751,0.082854,11.567533,-0.589202,0.109267,14.99752,-0.869779,0.116967


In [27]:
oof_df.write_csv(os.path.join(CFG["output_dir"], "oof.csv"))