In [1]:
# Remove every non-hidden entry under /kaggle/working, then make it the current directory,
# so files left by an earlier run are not picked up by this one.
# NOTE(review): `rm -r` reports an error when the directory is already empty; the error is harmless here.
!rm -r /kaggle/working/*
%cd /kaggle/working

/kaggle/working


In [2]:
import os
import sys

# Root directory of the project sources; the Penguin-ML-Library directory sits directly beneath it.
PACKAGE_DIR = "/kaggle/src"

# Make both locations importable (project root first, then the library directory).
sys.path.extend([PACKAGE_DIR, os.path.join(PACKAGE_DIR, "Penguin-ML-Library")])

In [3]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

import shutil

# Key of the model-specific section inside config.yaml.
MODEL_NAME = "lightgbm"

# Load the experiment configuration; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open(os.path.join(PACKAGE_DIR, "config.yaml"), "r") as config_file:
    CFG = yaml.safe_load(config_file)

exp_id = CFG[MODEL_NAME]["execution"]["exp_id"]
print(exp_id)

# Every artifact of this experiment is written below a directory named after the experiment id.
CFG["output_dir"] = f"/kaggle/output/{exp_id}"
# Start from an empty output directory. Pure Python avoids spawning a shell with an
# interpolated path, and the guard avoids an error on the very first run.
if os.path.isdir(CFG["output_dir"]):
    shutil.rmtree(CFG["output_dir"])
os.makedirs(CFG["output_dir"], exist_ok=True)

# Logging and seeding are delegated to the project helpers imported above.
init_logger(f"{exp_id}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-20 09:07:34.999859: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-20 09:07:35.027056: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_010


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import warnings

import numpy as np
import polars as pl
from IPython.display import clear_output, display
from penguinml.gbdt.lightgbm import fit_lgb, inference_lgb
from tqdm import tqdm

from const import TARGET_COLS
from feature_engineering import feature_engineer

warnings.filterwarnings("ignore")
pl.Config.set_tbl_rows(20)


def _load_split(file_name, is_submit):
    """Read one feature CSV from the competition directory and add the derived columns.

    Args:
        file_name: CSV file name inside ``CFG["dataset"]["competition_dir"]``.
        is_submit: Value stored in the boolean ``submit`` column of every row
            (``False`` for training rows, ``True`` for rows to be predicted).

    Returns:
        The frame with ``ID`` split at its first underscore into ``sceneID`` (kept as a string)
        and ``offset`` (cast to Float32), plus the constant ``submit`` column.
    """
    raw = pl.read_csv(os.path.join(CFG["dataset"]["competition_dir"], file_name))
    return (
        raw.with_columns(
            pl.col("ID").str.split_exact("_", n=1).struct.rename_fields(["sceneID", "offset"]).alias("fields")
        )
        .unnest("fields")
        .with_columns(
            pl.col("offset").cast(pl.Float32),
            pl.lit(is_submit).alias("submit"),
        )
    )


train = _load_split("train_features.csv", is_submit=False)
test = _load_split("test_features.csv", is_submit=True)
# From here on `train` holds BOTH splits; the `submit` flag tells them apart. The diagonal
# concat fills columns that exist in only one of the two frames with nulls.
train = pl.concat([train, test], how="diagonal")
print(train.shape)
train.head(1)

(45098, 33)


ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,sceneID,offset,submit
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f32,bool
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320.0,False


## 特徴量生成


In [5]:
# Run the project's feature engineering (defined in feature_engineering.py, not shown here).
# It returns the transformed frame and a `features` object that later cells extend with
# add_num_features() and query with all_features().
# NOTE(review): `train` is rebound to the transformed frame, so re-running this cell would feed
# already-engineered data back into feature_engineer — restart the kernel rather than re-run.
train, features = feature_engineer(train)
# presumably the number of registered features — confirm against the features class
print(len(features))

105


In [7]:
# YOLO detection results: one JSON file (a list of bounding boxes) per sample ID
import json
from glob import glob

YOLO_CLS = "car"  # only boxes of this class contribute to the count and the per-box features
YOLO_CENTER_X = 64  # x coordinate against which "most horizontally central" is measured
YOLO_BOX_STATS = ("center_x", "center_y", "width", "height", "bottom", "area", "aspect_ratio", "conf")


def _summarize_detections(sample_id, boxes):
    """Collapse the detections of one sample into a flat feature record.

    Args:
        sample_id: Value stored under ``"ID"``.
        boxes: List of dicts with the keys ``x1``, ``y1``, ``x2``, ``y2``, ``cls`` and ``conf``.

    Returns:
        A dict that always has the same keys in the same order: ``ID``, ``num_objects``
        (every listed box, whatever its class), the count of valid ``YOLO_CLS`` boxes, and the
        geometry/confidence of the ``YOLO_CLS`` box whose horizontal centre is closest to
        ``YOLO_CENTER_X``. The count and the box statistics are ``None`` when no valid
        ``YOLO_CLS`` box exists.
    """
    # Pre-populate every key so the schema of the resulting frame does not depend on
    # which sample happens to be processed first.
    record = {"ID": sample_id, "num_objects": len(boxes), YOLO_CLS: None}
    record.update({f"{stat}_{YOLO_CLS}": None for stat in YOLO_BOX_STATS})

    # Track the best distance explicitly. Seeding it from a sentinel centre of -1 (distance 65)
    # would skip any box lying 65 or more away from the centre and leak -1 as a feature value.
    best_dist = float("inf")
    for bbox in boxes:
        # Degenerate boxes (zero width or height) would make the aspect ratio undefined.
        if bbox["x1"] == bbox["x2"] or bbox["y1"] == bbox["y2"]:
            continue

        if bbox["cls"] != YOLO_CLS:
            continue

        # count
        record[YOLO_CLS] = (record[YOLO_CLS] or 0) + 1

        # Keep the statistics of the most horizontally central box; on ties the first one wins.
        width = bbox["x2"] - bbox["x1"]
        height = bbox["y2"] - bbox["y1"]
        center_x = (bbox["x1"] + bbox["x2"]) / 2
        dist = abs(center_x - YOLO_CENTER_X)
        if dist < best_dist:
            best_dist = dist
            record[f"center_x_{YOLO_CLS}"] = center_x
            record[f"center_y_{YOLO_CLS}"] = (bbox["y1"] + bbox["y2"]) / 2
            record[f"width_{YOLO_CLS}"] = width
            record[f"height_{YOLO_CLS}"] = height
            record[f"bottom_{YOLO_CLS}"] = bbox["y2"]
            record[f"area_{YOLO_CLS}"] = width * height
            record[f"aspect_ratio_{YOLO_CLS}"] = width / height
            record[f"conf_{YOLO_CLS}"] = bbox["conf"]
    return record


# Sorted so that the processing order is reproducible across file systems.
yolo_paths = sorted(glob("/kaggle/input/yolo-det/det/*.json"))
yolo_dfs = []
for path in tqdm(yolo_paths):
    # The file name up to its first dot is the sample ID.
    ID = os.path.basename(path).split(".")[0]
    with open(path, "r") as f:
        yolo_dfs.append(_summarize_detections(ID, json.load(f)))
# Scan every record for dtype inference: the leading records may hold only None for a column.
yolo_df = pl.DataFrame(yolo_dfs, infer_schema_length=None)

100%|██████████| 45098/45098 [00:00<00:00, 80060.81it/s]


In [8]:
# Every YOLO column except the join key becomes a numeric model feature.
yolo_feature_cols = [name for name in yolo_df.columns if name != "ID"]
# Left join: rows whose ID has no detection file keep nulls in the YOLO columns.
train = train.join(yolo_df, how="left", on="ID")
features.add_num_features(yolo_feature_cols)

In [9]:
# Size of `features` after the YOLO columns were registered in the previous cell.
len(features)

115

## CV Split


In [10]:
# Attach the pre-computed fold assignment to every row through its scene.
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"])
train = train.join(train_folds, on="sceneID", how="left")
# `train` also contains the submission rows, which presumably have no entry in the fold file,
# so the coverage check is restricted to the rows that are actually used for fitting.
assert train.filter(~pl.col("submit"))["fold"].null_count() == 0, "training rows without a CV fold"

## Training


In [11]:
N_FOLDS = 5  # NOTE(review): must equal the number of distinct values in the `fold` column — confirm


def _mae(df):
    """Return the mean absolute error between the `oof` and `target` columns of `df`."""
    return np.abs(df["oof"] - df["target"]).mean()


dfs = []
test_dfs = []
scores = []
# One set of fold models per target column, highest numeric suffix first. The suffix is
# compared as an integer so that ordering stays correct beyond single digits.
for c in tqdm(sorted(TARGET_COLS, key=lambda x: int(x.split("_")[1]), reverse=True)):
    this_df = train.filter(pl.col("target_name_original") == c)
    this_train_df = this_df.filter(~pl.col("submit"))
    this_test_df = this_df.filter(pl.col("submit"))

    # Cross-validated fit; returns the out-of-fold predictions and the fitted models.
    oof, models = fit_lgb(
        data=this_train_df,
        features=features,
        params=CFG[MODEL_NAME]["params"],
        target_col="target",
        fold_col="fold",
        target_type="regression",
        verbose=50000,
    )
    this_train_df = this_train_df.with_columns(pl.Series("oof", oof))
    dfs.append(this_train_df)

    # Overall and per-fold MAE of the out-of-fold predictions for this target.
    score = {"target": c, "cv": _mae(this_train_df)}
    for fold in range(N_FOLDS):
        score[f"fold{fold}"] = _mae(this_train_df.filter(pl.col("fold") == fold))
    scores.append(score)

    # inference
    preds = inference_lgb(
        feat_df=this_test_df.select(features.all_features()),
        models=models,
        agg_func=np.mean,  # TODO: the median might work better
    )
    this_test_df = this_test_df.with_columns(pl.Series("pred", preds))
    test_dfs.append(this_test_df)

    # Replace the previous table with the refreshed one instead of stacking outputs.
    clear_output()
    display(pl.DataFrame(scores).sort("target"))

# NOTE: `train` is rebound to the training rows only (now carrying `oof`) and `test` to the
# submission rows (carrying `pred`); re-running this cell therefore needs a fresh kernel.
train = pl.concat(dfs)
test = pl.concat(test_dfs)
clear_output()
display(pl.DataFrame(scores).sort("target"))

target,cv,fold0,fold1,fold2,fold3,fold4
str,f64,f64,f64,f64,f64,f64
"""x_0""",0.061642,0.060916,0.060064,0.063404,0.061368,0.06246
"""x_1""",0.131888,0.130869,0.130292,0.133656,0.130826,0.133797
"""x_2""",0.224992,0.22221,0.224633,0.226127,0.2232,0.228789
"""x_3""",0.351859,0.3472,0.353043,0.353461,0.349018,0.356574
"""x_4""",0.505184,0.502032,0.509341,0.503765,0.502003,0.508781
"""x_5""",0.679321,0.670433,0.687408,0.679472,0.676129,0.683163
"""y_0""",0.031866,0.031667,0.031266,0.032187,0.031475,0.032733
"""y_1""",0.072683,0.072262,0.071038,0.073153,0.072133,0.074826
"""y_2""",0.130384,0.128706,0.128929,0.131818,0.128974,0.133495
"""y_3""",0.214925,0.209272,0.215275,0.216409,0.213124,0.220543


In [12]:
# Out-of-fold MAE pooled over all rows of `train`, i.e. over every target at once.
overall_abs_error = np.abs(train["oof"] - train["target"])
mae = overall_abs_error.mean()
print(f"MAE: {mae}")

MAE: 0.2109593054477809


In [13]:
def _to_wide(long_df, value_col):
    """Pivot a long frame to one row per ID and one column per target name.

    Args:
        long_df: Frame with the columns ``ID``, ``target_name_original`` and ``value_col``.
        value_col: Name of the column whose values fill the pivoted cells.

    Returns:
        The pivoted frame, sorted by ``ID``.
    """
    return (
        long_df.select(["ID", "target_name_original", value_col])
        .pivot(index="ID", columns="target_name_original", values=value_col)
        .sort("ID")
    )


oof_df = _to_wide(train, "oof")
submission = _to_wide(test, "pred")

In [14]:
# Persist the out-of-fold predictions next to the other artifacts of this experiment.
oof_path = os.path.join(CFG["output_dir"], "oof.csv")
oof_df.write_csv(oof_path)
oof_df.head()

ID,x_5,y_5,z_5,x_4,y_4,z_4,x_3,y_3,z_3,x_2,y_2,z_2,x_1,y_1,z_1,x_0,y_0,z_0
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""00066be8e20318869c38c66be46663…",22.688056,-0.583073,0.166723,18.291352,-0.350866,0.128665,13.763765,-0.111687,0.080638,9.514796,-0.061318,0.058638,5.913339,-0.007898,0.025936,2.630653,-0.000919,0.010597
"""00066be8e20318869c38c66be46663…",31.61414,-1.264948,0.158804,26.528651,-0.893639,0.146638,21.220759,-0.646737,0.106837,15.88075,-0.490686,0.084539,10.53155,-0.237475,0.051871,5.030351,-0.073149,0.02123
"""00066be8e20318869c38c66be46663…",31.932585,-0.403885,0.088157,26.706545,-0.326399,0.077558,21.124439,-0.2575,0.053034,15.55707,-0.15827,0.054337,10.100031,-0.081149,0.025321,4.727233,-0.0233,0.011485
"""000fb056f97572d384bae4f5fc1e0f…",18.074964,1.113538,0.005801,15.249245,0.780063,-0.008914,11.856317,0.525219,-0.002196,8.86522,0.290896,0.006711,5.843844,0.122961,0.011625,2.782703,0.020068,0.00504
"""000fb056f97572d384bae4f5fc1e0f…",13.919083,-0.850258,0.087098,11.235675,-0.599973,0.059565,8.706441,-0.423881,0.041888,6.150948,-0.263046,0.025553,3.707369,-0.139703,0.00913,1.554488,-0.043868,0.003815


In [15]:
# Persist the test-set predictions in the wide, one-row-per-ID layout.
submission_path = os.path.join(CFG["output_dir"], "submission.csv")
submission.write_csv(submission_path)
submission.head()

ID,x_5,y_5,z_5,x_4,y_4,z_4,x_3,y_3,z_3,x_2,y_2,z_2,x_1,y_1,z_1,x_0,y_0,z_0
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""012baccc145d400c896cb82065a93d…",9.186043,-0.185926,0.028913,7.558025,-0.156769,0.022406,6.22061,-0.157573,0.016847,4.612872,-0.128117,0.012314,3.109224,-0.09446,0.006935,1.485909,-0.039148,0.002069
"""012baccc145d400c896cb82065a93d…",5.038744,4.78314,0.016615,4.156984,3.720656,0.013657,3.542033,2.81844,0.004138,2.903178,1.851287,-0.002926,2.045891,1.055126,-0.000955,1.005804,0.411867,-0.002919
"""012baccc145d400c896cb82065a93d…",8.783626,-0.008953,0.011074,7.18713,0.017534,0.01291,6.085115,0.016486,0.004613,4.720412,0.016866,0.006486,3.231803,0.008528,0.003771,1.602638,0.00474,0.001099
"""012baccc145d400c896cb82065a93d…",4.423906,1.914628,-0.031154,3.947476,1.231154,-0.022403,3.127608,0.745722,-0.01115,2.430591,0.396082,-0.012398,1.663854,0.193832,-0.007027,0.858036,0.061191,-0.003623
"""01d738e799d260a10f6324f78023b3…",2.21316,-0.041349,-0.056893,2.142568,-0.025726,-0.054206,2.044318,-0.01643,-0.032966,1.748449,-0.009183,-0.019585,1.467794,-0.000802,-0.009651,0.847401,-0.001399,-0.005109
