In [1]:
# Delete everything directly under /kaggle/working (recursively), then make
# that directory the current working directory for the rest of the notebook.
!rm -r /kaggle/working/*
%cd /kaggle/working

/kaggle/working


In [2]:
import os
import sys

# Root of the project sources. Both the root itself and the Penguin-ML-Library
# sub-directory are appended to the import search path, in that order.
PACKAGE_DIR = "/kaggle/src"
for extra_path in (PACKAGE_DIR, os.path.join(PACKAGE_DIR, "Penguin-ML-Library")):
    sys.path.append(extra_path)

In [3]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

MODEL_NAME = "lightgbm"
CFG = yaml.safe_load(open(os.path.join(PACKAGE_DIR, "config.yaml"), "r"))
print(CFG[MODEL_NAME]["execution"]["exp_id"])
CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

init_logger(f"{ CFG[MODEL_NAME]['execution']['exp_id']}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-16 14:36:29.368347: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-16 14:36:29.396694: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_007


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import warnings

import numpy as np
import polars as pl
from IPython.display import clear_output, display
from penguinml.gbdt.lightgbm import fit_lgb, inference_lgb
from tqdm import tqdm

from const import TARGET_COLS
from feature_engineering import feature_engineer

# Silence all Python warnings and let polars print up to 20 table rows.
warnings.filterwarnings("ignore")
pl.Config.set_tbl_rows(20)

# Location of the training features inside the competition data directory.
train_csv_path = os.path.join(CFG["dataset"]["competition_dir"], "train_features.csv")

# Split "ID" once on "_" into a struct with the fields "sceneID" and "offset".
id_fields = pl.col("ID").str.split_exact("_", n=1).struct.rename_fields(["sceneID", "offset"]).alias("fields")

train = pl.read_csv(train_csv_path)
train = train.with_columns(id_fields).unnest("fields")
# "offset" comes out of the split as a string; store it as a 32-bit float.
train = train.with_columns(pl.col("offset").cast(pl.Float32))
print(train.shape)
train.head(1)

(43371, 32)


ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,sceneID,offset
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f32
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320.0


## 特徴量生成


In [5]:
# Run the project's feature engineering; `train` is rebound to the returned
# frame and `features` is later passed to fit_lgb as the feature list.
# NOTE(review): later cells read `target_name_original` and `target` from
# `train`, so the returned frame presumably has one row per (ID, target) —
# confirm in feature_engineering.py.
train, features = feature_engineer(train)
# Number of features that will be fed to the model.
print(len(features))

105


## CV Split


In [6]:
# Load the fold table and attach it to the training rows by scene.
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"])
train = train.join(train_folds, how="left", on="sceneID")

# A left join leaves nulls for unmatched scenes; none are allowed in "fold".
n_rows_without_fold = train.get_column("fold").null_count()
assert n_rows_without_fold == 0

## Training


In [7]:
dfs = []
scores = []
# Fit one set of fold models per target, visiting targets in descending order
# of the suffix after "_" (e.g. "x_5" before "x_0").
for c in tqdm(sorted(TARGET_COLS, key=lambda x: x.split("_")[1], reverse=True)):
    # Rows belonging to the current target only.
    this_df = train.filter(pl.col("target_name_original") == c)
    oof, models = fit_lgb(
        data=this_df,
        features=features,
        params=CFG[MODEL_NAME]["params"],
        target_col="target",
        fold_col="fold",
        target_type="regression",
        verbose=50000,
    )
    # Keep the out-of-fold predictions next to the rows they belong to.
    this_df = this_df.with_columns(pl.Series("oof", oof))
    dfs.append(this_df)

    # Absolute out-of-fold error per row, computed once and reused for both
    # the overall and the per-fold MAE.
    abs_err = (this_df["oof"] - this_df["target"]).abs()
    fold_ids = this_df["fold"]
    mae = abs_err.mean()
    score = {"target": c, "cv": mae}
    # Take the fold ids from the data rather than assuming exactly five folds,
    # so the summary stays correct if the fold file changes.
    for fold in fold_ids.unique().sort().to_list():
        score[f"fold{fold}"] = abs_err.filter(fold_ids == fold).mean()
    scores.append(score)

    # Replace the previously shown table with the refreshed running summary.
    clear_output()
    display(pl.DataFrame(scores).sort("target"))

# Stack the per-target frames (now carrying "oof") back into a single frame.
train = pl.concat(dfs)
clear_output()
display(pl.DataFrame(scores).sort("target"))

target,cv,fold0,fold1,fold2,fold3,fold4
str,f64,f64,f64,f64,f64,f64
"""x_0""",0.062228,0.061607,0.060437,0.064051,0.062018,0.063028
"""x_1""",0.133217,0.132209,0.131223,0.134886,0.132583,0.135185
"""x_2""",0.227647,0.22636,0.225528,0.228695,0.225839,0.231812
"""x_3""",0.353711,0.350247,0.356753,0.35488,0.349608,0.357069
"""x_4""",0.506266,0.500229,0.51357,0.504939,0.50326,0.509334
"""x_5""",0.681033,0.669134,0.695178,0.683182,0.672523,0.685152
"""y_0""",0.031854,0.031677,0.031167,0.032066,0.031526,0.032835
"""y_1""",0.07273,0.072389,0.070941,0.073313,0.072118,0.074888
"""y_2""",0.130772,0.12823,0.129308,0.132541,0.129663,0.134119
"""y_3""",0.216024,0.209624,0.215428,0.219016,0.214693,0.221362


In [8]:
# Mean absolute error of the out-of-fold predictions over all rows of `train`.
abs_error = np.abs(train["oof"] - train["target"])
mae = abs_error.mean()
print(f"MAE: {mae}")

MAE: 0.21232987001744488


In [9]:
# Reshape the long out-of-fold predictions into a wide table: one row per ID
# and one column per value of "target_name_original".
oof_df = train.select(["ID", "target_name_original", "oof"]).pivot(
    index="ID", columns="target_name_original", values="oof"
)
oof_df.head()

ID,x_5,y_5,z_5,x_4,y_4,z_4,x_3,y_3,z_3,x_2,y_2,z_2,x_1,y_1,z_1,x_0,y_0,z_0
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""00066be8e20318869c38c66be46663…",22.038067,-0.558047,0.165707,18.227562,-0.355071,0.090493,13.738854,-0.133294,0.06545,9.386853,-0.083747,0.040578,5.896475,-0.02493,0.017988,2.687078,0.005774,0.010114
"""00066be8e20318869c38c66be46663…",31.710907,-0.764475,0.161883,26.851383,-0.632574,0.113349,21.139854,-0.597363,0.125833,15.80733,-0.394979,0.079982,10.555211,-0.232158,0.039127,4.989698,-0.068797,0.019064
"""00066be8e20318869c38c66be46663…",32.007977,-0.346182,0.096317,26.674847,-0.325944,0.053631,21.30284,-0.226537,0.066925,15.554021,-0.112965,0.052491,10.008571,-0.069704,0.016693,4.713456,-0.018526,0.012189
"""000fb056f97572d384bae4f5fc1e0f…",14.176632,-0.837255,0.09058,11.271188,-0.646364,0.062475,8.824808,-0.374933,0.046369,6.080137,-0.208632,0.021687,3.688029,-0.123612,0.010476,1.569126,-0.048624,0.002188
"""000fb056f97572d384bae4f5fc1e0f…",17.894393,1.179346,0.008142,14.965732,0.795913,0.007926,11.874545,0.493164,0.008657,8.831929,0.292942,-0.009191,5.835719,0.114583,0.004302,2.763021,0.022862,0.004882


In [10]:
# Persist the wide out-of-fold prediction table as oof.csv in the experiment's output directory.
oof_df.write_csv(os.path.join(CFG["output_dir"], "oof.csv"))