In [1]:
# Delete everything currently under /kaggle/working so the run starts from an empty directory.
!rm -r /kaggle/working/*
# Switch the kernel's current directory to /kaggle/working.
%cd /kaggle/working

/kaggle/working


In [2]:
import os
import sys

# Directory that is put on the import path, together with its Penguin-ML-Library subfolder.
PACKAGE_DIR = "/kaggle/src"
# Both entries are appended (not prepended), so already-importable modules keep priority.
sys.path.extend([PACKAGE_DIR, os.path.join(PACKAGE_DIR, "Penguin-ML-Library")])

In [3]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

import shutil

# Key of the config.yaml section used by this notebook.
MODEL_NAME = "xgboost"

# Parse the experiment configuration; the context manager closes the file
# handle as soon as parsing is done instead of leaving it to the garbage collector.
with open(os.path.join(PACKAGE_DIR, "config.yaml"), "r") as config_file:
    CFG = yaml.safe_load(config_file)

EXP_ID = CFG[MODEL_NAME]["execution"]["exp_id"]
print(EXP_ID)

# Recreate the per-experiment output directory from scratch. shutil.rmtree with
# ignore_errors=True is a no-op when the directory does not exist yet and avoids
# interpolating a config value into a shell command.
CFG["output_dir"] = f"/kaggle/output/{EXP_ID}"
shutil.rmtree(CFG["output_dir"], ignore_errors=True)
os.makedirs(CFG["output_dir"], exist_ok=True)

# The log file name is relative, so it is resolved against the current working directory.
init_logger(f"{EXP_ID}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-15 10:25:19.476521: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-15 10:25:19.501146: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_001


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import warnings

import numpy as np
import polars as pl

warnings.filterwarnings("ignore")

In [5]:
# Location of the raw training table inside the competition directory.
train_csv_path = os.path.join(CFG["dataset"]["competition_dir"], "train_features.csv")

# Split "ID" on its first "_" into a two-field struct named (sceneID, offset).
id_parts = pl.col("ID").str.split_exact("_", n=1).struct.rename_fields(["sceneID", "offset"]).alias("fields")

train = pl.read_csv(train_csv_path)
# Expand the struct into real columns, then make the offset part numeric.
train = train.with_columns(id_parts).unnest("fields")
train = train.with_columns(pl.col("offset").cast(pl.Float32))
print(train.shape)
train.head(1)

(43371, 32)


ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,sceneID,offset
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f32
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320.0


## 特徴量生成


In [6]:
from penguinml.utils.contena import FeatureContena

# Columns registered as numeric model inputs (boolean flags are included here
# and are cast to float together with the rest further down).
numeric_columns = [
    "vEgo",
    "aEgo",
    "steeringAngleDeg",
    "steeringTorque",
    "brake",
    "brakePressed",
    "gas",
    "gasPressed",
    "leftBlinker",
    "rightBlinker",
    "offset",
]

# Container that tracks which columns are numeric and which are categorical.
features = FeatureContena()
features.add_num_features(numeric_columns)
features.add_cat_features(["gearShifter"])

## ターゲット列を分解


In [7]:
from const import TARGET_COLS

# Long format: one row per (ID, target column) pair, with the column name in
# "target_name" and its value in "target".
targets_long = train.unpivot(index="ID", on=TARGET_COLS, variable_name="target_name", value_name="target")
# All remaining (non-target) columns, re-attached to every long-format row by ID.
inputs_only = train.drop(TARGET_COLS)
train = targets_long.join(inputs_only, on="ID", how="left")

# The target name itself becomes a categorical model input.
features.add_cat_feature("target_name")
train.head(1)

ID,target_name,target,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,sceneID,offset
str,str,f64,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,str,f32
"""00066be8e20318869c38c66be46663…","""x_0""",2.82959,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,"""00066be8e20318869c38c66be46663…",320.0


In [8]:
from const import CATEGORY_MAPPING

# Cast every registered numeric feature to Float32 in a single pass.
train = train.with_columns([pl.col(name).cast(pl.Float32) for name in features.num_features()])

# Encode every registered categorical feature through its mapping in CATEGORY_MAPPING;
# replace_strict raises on values missing from the mapping instead of passing them through.
train = train.with_columns(
    [pl.col(name).replace_strict(CATEGORY_MAPPING[name]).cast(pl.Int32) for name in features.cat_features()]
)

## CV Split


In [9]:
# Fold assignment table, joined onto the training rows by sceneID.
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"])
# validate="m:1" makes the join fail loudly if a sceneID appears more than once in
# the fold table; without it such duplicates would silently multiply training rows.
train = train.join(train_folds, on="sceneID", how="left", validate="m:1")
# Every training row must have received a fold.
assert train["fold"].null_count() == 0, "some training rows have no fold assignment"

## Training


In [10]:
from penguinml.gbdt.xgboost import fit_xgb, inference_xgb

# Arguments for the cross-validated XGBoost fit; hyper-parameters come from the
# model's "params" section of the config.
fit_kwargs = dict(
    data=train,
    features=features,
    params=CFG[MODEL_NAME]["params"],
    target_col="target",
    fold_col="fold",
    target_type="regression",
    verbose=500,
)
# Returns the out-of-fold predictions and the fitted models.
oof, models = fit_xgb(**fit_kwargs)

== fold 0 ==
[0]	validation_0-mae:5.26351
[500]	validation_0-mae:0.29186
[1000]	validation_0-mae:0.26967
[1500]	validation_0-mae:0.26426
[2000]	validation_0-mae:0.26151
[2500]	validation_0-mae:0.26014
[3000]	validation_0-mae:0.25873
[3500]	validation_0-mae:0.25801
[4000]	validation_0-mae:0.25749
[4500]	validation_0-mae:0.25701
[5000]	validation_0-mae:0.25641
[5500]	validation_0-mae:0.25590
[6000]	validation_0-mae:0.25560
[6500]	validation_0-mae:0.25541
[7000]	validation_0-mae:0.25517
[7500]	validation_0-mae:0.25499
[8000]	validation_0-mae:0.25473
[8500]	validation_0-mae:0.25462
[9000]	validation_0-mae:0.25411
[9500]	validation_0-mae:0.25393
[9999]	validation_0-mae:0.25383
== fold 1 ==
[0]	validation_0-mae:5.16328
[500]	validation_0-mae:0.30580
[1000]	validation_0-mae:0.28256
[1500]	validation_0-mae:0.27388
[2000]	validation_0-mae:0.27032
[2500]	validation_0-mae:0.26842
[3000]	validation_0-mae:0.26724
[3500]	validation_0-mae:0.26644
[4000]	validation_0-mae:0.26597
[4500]	validation_0-ma

In [13]:
# Store the out-of-fold predictions next to their targets.
oof_column = pl.Series("oof", oof)
train = train.with_columns(oof_column)

# Mean absolute error over all rows of the long-format table.
abs_error = np.abs(train["oof"] - train["target"])
mae = abs_error.mean()
print(f"MAE: {mae}")

MAE: 0.259294006694998


In [19]:
rev_dict = {v: "oof_" + k for k, v in CATEGORY_MAPPING["target_name"].items()}
oof_df = (
    train.select(["ID", "target_name", "oof"])
    .with_columns(pl.col("target_name").replace_strict(rev_dict))
    .pivot(index="ID", columns="target_name", values="oof")
)
oof_df.head()

ID,oof_x_0,oof_y_0,oof_z_0,oof_x_1,oof_y_1,oof_z_1,oof_x_2,oof_y_2,oof_z_2,oof_x_3,oof_y_3,oof_z_3,oof_x_4,oof_y_4,oof_z_4,oof_x_5,oof_y_5,oof_z_5
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""00066be8e20318869c38c66be46663…",2.762264,0.010862,0.013727,6.096496,-0.012793,0.019767,9.38352,-0.035619,0.045556,12.97892,-0.036059,0.099674,16.085865,-0.099191,0.120864,20.582144,-0.221634,0.122029
"""00066be8e20318869c38c66be46663…",4.908346,-0.042589,0.008057,10.523609,-0.14577,0.019354,16.052059,-0.368332,0.047004,21.243311,-0.522213,0.032412,26.953341,-0.61742,0.055536,31.226437,-1.040025,0.084744
"""00066be8e20318869c38c66be46663…",4.748171,0.013953,0.028808,9.974693,-0.015439,0.03645,15.481208,-0.073113,0.065858,20.798096,-0.127491,0.058521,26.01053,-0.200581,0.053548,32.267723,-0.195666,0.055094
"""000fb056f97572d384bae4f5fc1e0f…",2.765885,-0.001663,-0.031498,6.058972,0.116818,-0.008219,8.761226,0.333205,-0.058305,11.390756,0.617979,-0.036776,15.144131,0.981722,-0.067624,17.172825,1.368344,-0.057977
"""000fb056f97572d384bae4f5fc1e0f…",1.674614,-0.063675,0.008017,3.675838,-0.183655,0.04087,6.335231,-0.390689,0.047238,9.539189,-0.528087,0.079369,12.862332,-0.801395,0.155044,15.00661,-0.825621,0.121842


In [20]:
# Save the wide out-of-fold predictions into the experiment's output directory.
oof_path = os.path.join(CFG["output_dir"], "oof.csv")
oof_df.write_csv(oof_path)