In [1]:
# Start from a clean slate: remove everything a previous run left in the working directory.
!rm -r /kaggle/working/*
# Make the working directory the current directory for the rest of the notebook.
%cd /kaggle/working

/kaggle/working


In [2]:
import os
import shutil
import sys

# Root of the project sources. Both the root itself and the bundled
# Penguin-ML-Library directory are appended to the import search path.
PACKAGE_DIR = "/kaggle/src"
sys.path.extend(
    [
        PACKAGE_DIR,
        os.path.join(PACKAGE_DIR, "Penguin-ML-Library"),
    ]
)

In [3]:
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

MODEL_NAME = "lightgbm"

# Load the experiment configuration. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open(os.path.join(PACKAGE_DIR, "config.yaml"), "r") as config_file:
    CFG = yaml.safe_load(config_file)

EXP_ID = CFG[MODEL_NAME]["execution"]["exp_id"]
print(EXP_ID)

# Every run starts from an empty, experiment-specific output directory.
# shutil.rmtree(..., ignore_errors=True) does not fail on the first run (when
# the directory does not exist yet) and avoids spawning a subshell.
CFG["output_dir"] = f"/kaggle/output/{EXP_ID}"
shutil.rmtree(CFG["output_dir"], ignore_errors=True)
os.makedirs(CFG["output_dir"], exist_ok=True)

# The log file is named after the experiment id.
init_logger(f"{EXP_ID}.log")
logger = get_logger("main")
# Seed taken from the config (presumably seeds the RNGs used downstream; see penguinml's seed_base).
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-20 11:00:07.226051: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-20 11:00:07.251738: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_015


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import warnings

import numpy as np
import polars as pl
from IPython.display import clear_output, display
from penguinml.gbdt.lightgbm import fit_lgb, inference_lgb
from tqdm import tqdm

from const import TARGET_COLS
from feature_engineering import feature_engineer

# NOTE(review): this silences *all* warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
pl.Config.set_tbl_rows(20)


def load_features(file_name: str, is_submit: bool) -> pl.DataFrame:
    """Read one competition feature file and add the derived key columns.

    Args:
        file_name: CSV file name inside ``CFG["dataset"]["competition_dir"]``.
        is_submit: Value written to the boolean ``submit`` column
            (``False`` for the training file, ``True`` for the test file).

    Returns:
        The file's columns plus ``sceneID`` and ``offset`` (the two parts of
        ``ID`` split at its first underscore, ``offset`` cast to Float32)
        and the constant ``submit`` flag.
    """
    return (
        pl.read_csv(os.path.join(CFG["dataset"]["competition_dir"], file_name))
        .with_columns(
            pl.col("ID").str.split_exact("_", n=1).struct.rename_fields(["sceneID", "offset"]).alias("fields")
        )
        .unnest("fields")
        .with_columns(
            pl.col("offset").cast(pl.Float32),
            pl.lit(is_submit).alias("submit"),
        )
    )


train = load_features("train_features.csv", is_submit=False)
test = load_features("test_features.csv", is_submit=True)

# From here on `train` holds train AND test rows (told apart by `submit`).
# A diagonal concat keeps the union of the columns and fills columns missing
# from one side with nulls.
train = pl.concat([train, test], how="diagonal")
print(train.shape)
train.head(1)

(45098, 33)


ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,sceneID,offset,submit
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f32,bool
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320.0,False


## 特徴量生成


In [5]:
# Run the project's feature-engineering step on the combined train/test frame.
# NOTE(review): `train` is overwritten here, so re-running this cell alone feeds the
# already engineered frame back in — re-run from the loading cell instead.
train, features = feature_engineer(train)
# Size of the returned feature container (presumably the number of registered features — confirm).
print(len(features))

105


In [6]:
# YOLO detection results: one JSON file (a list of bounding boxes) per sample ID.
import json
from glob import glob


def summarize_detections(ID, boxes, cls="car", center_x=64):
    """Condense the detections of one image into a flat feature record.

    Args:
        ID: Sample identifier the record is keyed by.
        boxes: List of detections; each is a dict with the keys
            ``x1``, ``y1``, ``x2``, ``y2``, ``cls`` and ``conf``.
        cls: Detection class turned into features; other classes are ignored.
        center_x: Horizontal reference position used to pick the "most central"
            box (presumably half the image width — confirm against the detector input size).

    Returns:
        A dict with ``ID`` and ``num_objects`` (count of ALL detections,
        including skipped ones). If at least one non-degenerate box of ``cls``
        exists it also holds the box count under the key ``cls`` and the
        geometry/confidence of the box whose horizontal center is closest to
        ``center_x`` (``center_x_*``, ``center_y_*``, ``width_*``, ``height_*``,
        ``bottom_*``, ``area_*``, ``aspect_ratio_*``, ``conf_*``).
    """
    record = {"ID": ID, "num_objects": len(boxes)}
    # Track the best distance explicitly. The previous implementation seeded
    # center_x with -1, which only worked while every box center was closer
    # than 65 to the reference; otherwise -1 leaked out as a feature value.
    best_dist = float("inf")
    for bbox in boxes:
        # Degenerate boxes carry no information and would divide by zero below.
        if bbox["x1"] == bbox["x2"] or bbox["y1"] == bbox["y2"]:
            continue
        if bbox["cls"] != cls:
            continue

        # Count of valid boxes of this class.
        record[cls] = record.get(cls, 0) + 1

        # Keep the details of the most horizontally central box only
        # (strict comparison: on ties the first box wins).
        box_center_x = (bbox["x1"] + bbox["x2"]) / 2
        dist = abs(box_center_x - center_x)
        if dist >= best_dist:
            continue
        best_dist = dist

        width = bbox["x2"] - bbox["x1"]
        height = bbox["y2"] - bbox["y1"]
        record[f"center_x_{cls}"] = box_center_x
        record[f"center_y_{cls}"] = (bbox["y1"] + bbox["y2"]) / 2
        record[f"width_{cls}"] = width
        record[f"height_{cls}"] = height
        record[f"bottom_{cls}"] = bbox["y2"]
        record[f"area_{cls}"] = width * height
        record[f"aspect_ratio_{cls}"] = width / height
        record[f"conf_{cls}"] = bbox["conf"]
    return record


# Sorted so the row order does not depend on the file-system listing order.
yolo_paths = sorted(glob("/kaggle/input/yolo-det/det/*.json"))
yolo_dfs = []
for path in tqdm(yolo_paths):
    # File name without extension is the sample ID.
    ID = os.path.basename(path).split(".")[0]
    with open(path, "r") as f:
        data = json.load(f)
    yolo_dfs.append(summarize_detections(ID, data))

# Records without a valid box lack the per-class keys. Scanning every record
# for the schema keeps those columns even if the first records have none.
yolo_df = pl.DataFrame(yolo_dfs, infer_schema_length=None)

100%|██████████| 45098/45098 [00:00<00:00, 81664.31it/s]


In [7]:
# Every YOLO column except the leading join key "ID" is registered through
# add_num_features; the left join keeps all rows of `train`.
yolo_feature_cols = yolo_df.columns[1:]
train = train.join(yolo_df, on="ID", how="left")
features.add_num_features(yolo_feature_cols)

In [8]:
# Size of the feature container at this point in the notebook (displayed as the cell output).
len(features)

115

## CV Split


In [9]:
# Attach the precomputed CV fold of each scene.
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"])
train = train.join(train_folds, on="sceneID", how="left")

# `train` also contains the test rows (submit == True), so the fold check is
# restricted to the labelled rows, which are the ones used for CV.
n_missing_fold = train.filter(~pl.col("submit"))["fold"].null_count()
assert n_missing_fold == 0, f"{n_missing_fold} training rows have no fold assignment"

## Training


In [10]:
dfs = []  # per-target training rows with their out-of-fold predictions
test_dfs = []  # per-target test rows with their predictions
scores = []  # one dict of CV metrics per target

# One set of fold models is trained per target column; targets are visited in
# descending order of the index after the underscore in their name.
for c in tqdm(sorted(TARGET_COLS, key=lambda x: x.split("_")[1], reverse=True)):
    this_df = train.filter(pl.col("target_name_original") == c)
    this_train_df = this_df.filter(~pl.col("submit"))
    this_test_df = this_df.filter(pl.col("submit"))

    oof, models = fit_lgb(
        data=this_train_df,
        features=features,
        params=CFG[MODEL_NAME]["params"],
        target_col="target",
        fold_col="fold",
        target_type="regression",
        verbose=50000,
    )
    this_train_df = this_train_df.with_columns(pl.Series("oof", oof))
    dfs.append(this_train_df)

    # Absolute OOF error, computed once and reused for the overall and the
    # per-fold MAE. Fold ids 0-4 are assumed, as in the fold file used so far.
    abs_err = (this_train_df["oof"] - this_train_df["target"]).abs()
    fold_ids = this_train_df["fold"]
    score = {"target": c, "cv": abs_err.mean()} | {
        f"fold{fold}": abs_err.filter(fold_ids == fold).mean() for fold in range(5)
    }
    scores.append(score)

    # inference
    preds = inference_lgb(
        feat_df=this_test_df.select(features.all_features()),
        models=models,
        agg_func=np.mean,  # TODO: the median might work better
    )
    this_test_df = this_test_df.with_columns(pl.Series("pred", preds))
    test_dfs.append(this_test_df)

    # Refresh the running score table in place.
    clear_output()
    display(pl.DataFrame(scores).sort("target"))

# From here on `train` / `test` are the long-format frames carrying `oof` / `pred`.
train = pl.concat(dfs)
test = pl.concat(test_dfs)
clear_output()
display(pl.DataFrame(scores).sort("target"))

target,cv,fold0,fold1,fold2,fold3,fold4
str,f64,f64,f64,f64,f64,f64
"""x_0""",0.061581,0.061073,0.060015,0.063305,0.06101,0.062505
"""x_1""",0.130709,0.130119,0.129176,0.132793,0.129463,0.131992
"""x_2""",0.223579,0.222053,0.223639,0.224486,0.220674,0.227041
"""x_3""",0.347487,0.344095,0.34955,0.349762,0.342804,0.351222
"""x_4""",0.496314,0.492074,0.497459,0.498866,0.492115,0.501058
"""x_5""",0.664857,0.657511,0.665983,0.668059,0.656885,0.675845
"""y_0""",0.032619,0.032543,0.032042,0.032734,0.032108,0.033669
"""y_1""",0.07307,0.073208,0.071566,0.073867,0.072185,0.074524
"""y_2""",0.130187,0.129474,0.128777,0.131261,0.129454,0.131968
"""y_3""",0.214295,0.21143,0.213703,0.215971,0.212941,0.217429


In [11]:
# Overall out-of-fold mean absolute error across all targets.
abs_errors = (train["oof"] - train["target"]).abs()
mae = abs_errors.mean()
print(f"MAE: {mae}")

MAE: 0.20950319991039687


In [12]:
def pivot_to_wide(long_df: pl.DataFrame, value_col: str) -> pl.DataFrame:
    """Turn the long per-target frame into one row per ID.

    Args:
        long_df: Frame with the columns ``ID``, ``target_name_original`` and
            ``value_col``.
        value_col: Column whose values fill the pivoted target columns.

    Returns:
        A frame sorted by ``ID`` with one column per distinct
        ``target_name_original`` value.
    """
    return (
        long_df.select(["ID", "target_name_original", value_col])
        .pivot(index="ID", columns="target_name_original", values=value_col)
        .sort("ID")
    )


oof_df = pivot_to_wide(train, "oof")
submission = pivot_to_wide(test, "pred")

In [13]:
# Persist the wide out-of-fold predictions in the experiment's output directory.
oof_path = os.path.join(CFG["output_dir"], "oof.csv")
oof_df.write_csv(oof_path)
oof_df.head()

ID,x_5,y_5,z_5,x_4,y_4,z_4,x_3,y_3,z_3,x_2,y_2,z_2,x_1,y_1,z_1,x_0,y_0,z_0
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""00066be8e20318869c38c66be46663…",21.404294,-0.548819,0.143728,17.272427,-0.373115,0.097867,13.30097,-0.25679,0.071258,9.453123,-0.077115,0.047243,5.900888,-0.026699,0.022536,2.632121,-0.005711,0.007567
"""00066be8e20318869c38c66be46663…",31.318925,-1.174317,0.190615,26.115334,-0.9014,0.152871,21.237533,-0.620643,0.113286,15.842749,-0.4343,0.079549,10.522006,-0.230986,0.037295,5.015785,-0.071778,0.015378
"""00066be8e20318869c38c66be46663…",32.168402,-0.575897,0.110945,26.553537,-0.519629,0.070444,20.871066,-0.381524,0.05382,15.465241,-0.178955,0.039162,10.097823,-0.084873,0.021224,4.743197,-0.021383,0.00826
"""000fb056f97572d384bae4f5fc1e0f…",18.199935,1.340657,0.013465,15.115479,0.961828,0.010586,11.84321,0.577857,0.00853,8.79012,0.284482,0.01477,5.839181,0.101221,0.008654,2.766731,0.017821,0.003586
"""000fb056f97572d384bae4f5fc1e0f…",15.070288,-0.626997,0.075161,11.859069,-0.431743,0.056891,8.986238,-0.312678,0.038545,6.228007,-0.266809,0.022853,3.659009,-0.136095,0.00897,1.577929,-0.043281,0.002934


In [14]:
# Persist the wide test predictions in the experiment's output directory.
submission_path = os.path.join(CFG["output_dir"], "submission.csv")
submission.write_csv(submission_path)
submission.head()

ID,x_5,y_5,z_5,x_4,y_4,z_4,x_3,y_3,z_3,x_2,y_2,z_2,x_1,y_1,z_1,x_0,y_0,z_0
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""012baccc145d400c896cb82065a93d…",8.894346,-0.211703,0.003102,7.463917,-0.241834,0.015837,6.055315,-0.181995,0.013442,4.528247,-0.160721,0.013288,3.009286,-0.089467,0.007055,1.473111,-0.045972,0.003985
"""012baccc145d400c896cb82065a93d…",4.869843,4.233012,0.003207,4.042703,3.371983,0.00587,3.395709,2.390243,-0.007166,2.727319,1.708474,-0.00101,1.821111,1.008175,-0.003333,0.957628,0.377173,-0.000307
"""012baccc145d400c896cb82065a93d…",9.093722,-0.051671,0.004188,7.436154,-0.018273,0.013927,6.156919,-0.044696,0.012126,4.715283,-0.066803,0.011577,3.169109,-0.051066,0.009505,1.562905,-0.006655,0.006301
"""012baccc145d400c896cb82065a93d…",4.8481,2.127355,-0.021652,3.908397,1.313031,-0.016852,3.180643,0.854329,-0.016125,2.456781,0.48514,-0.010247,1.68152,0.248885,-0.007206,0.849306,0.075631,-0.001467
"""01d738e799d260a10f6324f78023b3…",2.060174,0.009736,-0.043525,2.059221,-0.00878,-0.031389,2.041783,-0.013679,-0.028416,1.824939,-0.015205,-0.014188,1.45983,-0.017157,-0.002993,0.837501,-0.004008,-0.001507
