In [1]:
# Start from a clean slate: recursively delete everything under /kaggle/working,
# then make it the current directory for the rest of the notebook.
!rm -r /kaggle/working/*
%cd /kaggle/working

/kaggle/working


In [2]:
import os
import sys

# Make the project sources and the bundled Penguin-ML-Library importable.
PACKAGE_DIR = "/kaggle/src"
sys.path.extend(
    [
        PACKAGE_DIR,
        os.path.join(PACKAGE_DIR, "Penguin-ML-Library"),
    ]
)

In [3]:
import shutil

import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base

MODEL_NAME = "lightgbm"

# Load the experiment configuration; the context manager closes the file
# handle instead of leaving it open for the lifetime of the kernel.
with open(os.path.join(PACKAGE_DIR, "config.yaml"), "r") as config_file:
    CFG = yaml.safe_load(config_file)

EXP_ID = CFG[MODEL_NAME]["execution"]["exp_id"]
print(EXP_ID)
CFG["output_dir"] = f"/kaggle/output/{EXP_ID}"

# Recreate the output directory from scratch. shutil.rmtree avoids spawning a
# shell and does not complain when the directory does not exist yet.
shutil.rmtree(CFG["output_dir"], ignore_errors=True)
os.makedirs(CFG["output_dir"], exist_ok=True)

init_logger(f"{EXP_ID}.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

2024-11-20 11:41:29.579204: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-20 11:41:29.607161: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


exp_016


  pid, fd = os.forkpty()
set seed: 46


In [4]:
import glob
import os
import pickle
import warnings
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


In [5]:
class Config:
    """Notebook-level constants for cross-validation and LightGBM seeding."""

    # Number of CV folds; also the divisor used when averaging test predictions.
    N_FOLD = 5
    # Seed passed to LightGBM as its "seed" parameter.
    RANDOM_STATE = 42
    # Misspelled original name, kept as an alias because existing code reads
    # Config.RANDOM_SATE.
    RANDOM_SATE = RANDOM_STATE


NB = "exp1015"

In [6]:
# Root of the Kaggle-style directory layout and the competition dataset below it.
ROOT_DIR = Path("/kaggle")
DATA_DIR = ROOT_DIR.joinpath("input/atmaCup#18_dataset")

In [7]:
# Load the raw train / test feature tables from the dataset directory.
train_df, test_df = (
    pl.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train_features.csv", "test_features.csv")
)

## 特徴量生成


In [8]:
# Stack train_df and test_df into one frame so feature engineering is applied
# to both at once ("diagonal" fills columns missing on one side with nulls).
_all_df = pl.concat(
    [train_df, test_df],
    how="diagonal",
)

In [9]:
# Columns whose values are aggregated within each scene.
agg_cols = [
    "vEgo",
    "aEgo",
    "steeringAngleDeg",
    "steeringTorque",
    "gas",
]

# Per-scene feature expressions. They are evaluated further down, after the
# frame has been sorted by (scene, decisecond), so "previous"/"next" below
# refer to neighbouring time steps inside the same scene.
exprs = []

# Neighbouring values: shift(-1) pulls the NEXT row's value into the current
# row, shift(1) pulls the PREVIOUS row's value.
for offset in (-1, 1):
    exprs += [
        pl.col(agg_col).shift(offset).over("scene").alias(f"{agg_col}_shift{offset}")
        for agg_col in agg_cols
    ]

# Differences: diff(-1) is current minus NEXT row, diff(1) is current minus
# PREVIOUS row.
for offset in (-1, 1):
    exprs += [
        pl.col(agg_col).diff(offset).over("scene").alias(f"{agg_col}_diff{offset}")
        for agg_col in agg_cols
    ]

# Whole-scene summary statistics (mean, standard deviation, max, min).
for stat in ("mean", "std", "max", "min"):
    exprs += [
        getattr(pl.col(agg_col), stat)().over("scene").alias(f"{agg_col}_{stat}")
        for agg_col in agg_cols
    ]

_all_df = (
    _all_df.with_columns(
        # Split ID on "_" into the scene identifier and the decisecond offset.
        pl.col("ID").str.split("_").list.get(0).alias("scene"),
        pl.col("ID").str.split("_").list.get(1).cast(pl.Int32).alias("decisecond"),
    )
    .sort(
        # shift/diff need rows in time order, so sort by time within each scene.
        "scene",
        "decisecond",
    )
    .with_columns(exprs)
)

In [10]:
# Attach the precomputed CV fold of each scene. Scenes that are absent from
# the fold file (e.g. test scenes) end up with a null fold.
train_folds = pl.read_csv(CFG["dataset"]["train_fold_path"]).rename({"sceneID": "scene"})
_all_df = _all_df.join(train_folds, how="left", on="scene")

# Every training row must have a fold: rows with a null fold match neither
# `fold == k` nor `fold != k`, so they would silently drop out of both the
# training and the validation split.
n_unassigned = _all_df.join(train_df.select("ID"), on="ID", how="semi")["fold"].null_count()
assert n_unassigned == 0, f"{n_unassigned} training rows have no fold assignment"

## feature and target


In [11]:
# Columns to predict: x / y / z with suffixes 0 through 5.
targets = [
    "x_0", "y_0", "z_0",
    "x_1", "y_1", "z_1",
    "x_2", "y_2", "z_2",
    "x_3", "y_3", "z_3",
    "x_4", "y_4", "z_4",
    "x_5", "y_5", "z_5",
]

# Listing the columns to exclude is easier to iterate on than listing the
# columns to keep. Note that "fold" is not excluded here and therefore ends up
# in `features`.
del_columns = [*targets, "ID", "scene", "gearShifter"]

features = sorted(set(_all_df.columns).difference(del_columns))

print(features)

['aEgo', 'aEgo_diff-1', 'aEgo_diff1', 'aEgo_max', 'aEgo_mean', 'aEgo_min', 'aEgo_shift-1', 'aEgo_shift1', 'aEgo_std', 'brake', 'brakePressed', 'decisecond', 'fold', 'gas', 'gasPressed', 'gas_diff-1', 'gas_diff1', 'gas_max', 'gas_mean', 'gas_min', 'gas_shift-1', 'gas_shift1', 'gas_std', 'leftBlinker', 'rightBlinker', 'steeringAngleDeg', 'steeringAngleDeg_diff-1', 'steeringAngleDeg_diff1', 'steeringAngleDeg_max', 'steeringAngleDeg_mean', 'steeringAngleDeg_min', 'steeringAngleDeg_shift-1', 'steeringAngleDeg_shift1', 'steeringAngleDeg_std', 'steeringTorque', 'steeringTorque_diff-1', 'steeringTorque_diff1', 'steeringTorque_max', 'steeringTorque_mean', 'steeringTorque_min', 'steeringTorque_shift-1', 'steeringTorque_shift1', 'steeringTorque_std', 'vEgo', 'vEgo_diff-1', 'vEgo_diff1', 'vEgo_max', 'vEgo_mean', 'vEgo_min', 'vEgo_shift-1', 'vEgo_shift1', 'vEgo_std']


In [12]:
def evaluation(true_values, pred_values):
    """Return the mean absolute error between two equally shaped arrays.

    The element-wise absolute differences are flattened first, so for 2-D
    input the mean is taken over every element rather than per column.
    """
    flat_errors = abs(true_values - pred_values).reshape(-1)
    return np.mean(flat_errors)

### encoding


In [13]:
# label encdoding
categorical_columns = ["gearShifter"]

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    _all_df = _all_df.with_columns(pl.Series(le.fit_transform(_all_df[col])).alias(f"{col}_le"))
cate_features = [f"{col}_le" for c in categorical_columns]
features = list(set(features) | set(cate_features))

# count encoding
count_enc = ["gearShifter"]
_all_df = _all_df.with_columns([pl.col(c).count().over(c).alias(f"{c}_count") for c in count_enc])
count_features = [f"{c}_count" for c in count_enc]
features = list(set(features) | set(count_features))


train_df = train_df.join(_all_df, how="left", on="ID")
test_df = test_df.join(_all_df, how="left", on="ID")

In [14]:
def train_lgbm(target):
    """Train fold-wise LightGBM regressors for a single target column.

    Reads the notebook-level ``train_df``, ``test_df``, ``features``,
    ``cate_features`` and ``Config``. For every fold id in
    ``range(Config.N_FOLD)`` the rows of ``train_df`` whose ``fold`` column
    equals that id are held out for validation and the remaining rows are used
    for fitting. The ``fold`` column itself is dropped from the model inputs.

    Args:
        target: Name of the column in ``train_df`` to predict.

    Returns:
        A tuple ``(oof_pred, y_pred, models)`` where ``oof_pred`` has one
        out-of-fold prediction per row of ``train_df``, ``y_pred`` is the mean
        of the fold models' predictions for ``test_df``, and ``models`` is the
        list of trained boosters (one per fold).
    """
    params = {
        "boosting_type": "gbdt",
        "metric": "mae",  # the score reported below is MAE
        "objective": "regression",
        "n_jobs": -1,
        "seed": Config.RANDOM_SATE,
        "learning_rate": 0.01,
        # "device": "gpu"
        "verbosity": -1,
    }

    oof_pred = np.zeros(len(train_df))
    y_pred = np.zeros(len(test_df))
    models = []

    # Loop-invariant inputs: the fold id of every training row and the test
    # feature matrix (without the fold column).
    fold_ids = train_df["fold"].to_numpy()
    x_test = test_df.select(features).drop(["fold"]).to_pandas()

    # Iterate over Config.N_FOLD so the loop stays consistent with the
    # averaging divisor used for the test predictions.
    for fold in range(Config.N_FOLD):
        print(f"fold{fold}: ", end="")

        # Split into the fitting part and the held-out validation part.
        train_part = train_df.filter(pl.col("fold") != fold)
        val_part = train_df.filter(pl.col("fold") == fold)
        x_train = train_part.select(features).drop(["fold"])
        x_val = val_part.select(features).drop(["fold"])
        y_train = train_part.select(target)
        y_val = val_part.select(target)

        # Create the LightGBM datasets.
        train_set = lgb.Dataset(
            x_train.to_pandas(),
            y_train.to_pandas(),
            categorical_feature=cate_features,
            free_raw_data=False,
        )
        val_set = lgb.Dataset(
            x_val.to_pandas(),
            y_val.to_pandas(),
            categorical_feature=cate_features,
            free_raw_data=False,
        )

        # Train with early stopping; the very large log period effectively
        # silences per-iteration logging.
        model = lgb.train(
            params,
            train_set,
            valid_sets=[train_set, val_set],
            num_boost_round=10000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(500000),
            ],
        )

        models.append(model)

        fold_pred = model.predict(x_val.to_pandas())
        score = evaluation(y_val.to_numpy().reshape(-1), fold_pred)

        # Write the held-out predictions back into their original row positions.
        oof_pred[fold_ids == fold] = fold_pred

        # Accumulate the fold-averaged test prediction.
        y_pred += model.predict(x_test) / Config.N_FOLD

        print(f"{score}")

    oof_score = evaluation(train_df[target].to_numpy().reshape(-1), oof_pred)
    print(f"OOF score is {oof_score}")

    return oof_pred, y_pred, models

## 学習


In [15]:
# Train one set of fold models per target and collect, in `targets` order,
# the out-of-fold predictions, the test predictions and the models themselves.
models_dict, test_pred, oof_pred = {}, [], []
for target in targets:
    print("=" * 50, f"# {target}", "=" * 50, sep="\n")
    target_oof, target_test, target_models = train_lgbm(target)
    oof_pred.append(target_oof)
    test_pred.append(target_test)
    models_dict[target] = target_models

# x_0
fold0: 0.061814169501656906
fold1: 0.06044789503792708
fold2: 0.06332190411798463
fold3: 0.06172711954156965
fold4: 0.062372365740872474
OOF score is 0.06193668796304339
# y_0
fold0: 0.03245544016507444
fold1: 0.03185520490336907
fold2: 0.03263411762208743
fold3: 0.03205547841499206
fold4: 0.03335351742104342
OOF score is 0.03247075135227692
# z_0
fold0: 0.025887936299316603
fold1: 0.02540527160511098
fold2: 0.026368891925292744
fold3: 0.02639706753113625
fold4: 0.025721373015107796
OOF score is 0.025956106503364326
# x_1
fold0: 0.13117405566603507
fold1: 0.13086497168094022
fold2: 0.13358152683247207
fold3: 0.13092803577582302
fold4: 0.13391680054839308
OOF score is 0.13209305691094148
# y_1
fold0: 0.07391652032527428
fold1: 0.07235495449610779
fold2: 0.07499963463707451
fold3: 0.07314786954132887
fold4: 0.07485061458743036
OOF score is 0.07385392016084101
# z_1
fold0: 0.05400884903387175
fold1: 0.05267174383663384
fold2: 0.05463740210261288
fold3: 0.0552795651023596
fold4: 0.05

In [16]:
# Overall out-of-fold MAE across all targets: stack the per-target OOF vectors
# into a (rows, targets) matrix and compare it with the true target columns.
evaluation(
    true_values=train_df[targets].to_numpy(),
    pred_values=np.vstack(oof_pred).T,
)

0.21186945780540176

# submit ファイル作成


In [17]:
# Column names of the submission file, in the same order as the per-target
# predictions collected in `test_pred`.
sub_col_names = [
    "x_0", "y_0", "z_0",
    "x_1", "y_1", "z_1",
    "x_2", "y_2", "z_2",
    "x_3", "y_3", "z_3",
    "x_4", "y_4", "z_4",
    "x_5", "y_5", "z_5",
]
# One row per test sample, one column per target.
sub_df = pl.DataFrame(
    np.vstack(test_pred).T,
    schema=sub_col_names,
)
display(sub_df)
sub_df.write_csv("submission.csv")

x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.452,-0.051075,0.002424,3.027946,-0.116615,0.004675,4.54256,-0.196589,0.002783,5.820129,-0.206179,0.006065,7.303749,-0.120701,-0.004875,8.58239,-0.002588,-0.003567
0.943627,0.388967,-0.000333,1.743524,1.00377,-0.002847,2.384993,1.784852,-0.008241,2.763791,2.616801,-0.010621,3.510101,3.57755,-0.007681,4.336799,4.561301,0.00061
1.570422,0.015986,0.003826,3.261026,0.016607,0.007196,4.878119,0.045818,0.012156,6.334183,0.053155,0.012668,7.705174,0.133657,0.0156,8.983753,0.197791,0.021751
0.834648,0.064968,-0.00139,1.650741,0.221638,-0.00699,2.396598,0.551637,-0.012376,2.972891,0.862205,-0.019565,3.614984,1.525669,-0.020291,4.288629,2.244856,-0.016445
0.817229,0.004749,-0.001976,1.415719,0.003891,-0.004815,1.881919,-0.000778,-0.017307,2.279591,-0.000394,-0.037908,2.172891,-0.006109,-0.054205,1.701601,-0.021986,-0.069277
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
6.536803,0.0093,0.017823,13.803687,0.049009,0.039545,21.004637,0.132271,0.065576,28.269154,0.221015,0.074146,35.556963,0.329059,0.080569,43.086334,0.452818,0.070313
7.000525,0.001282,0.004378,14.891301,-0.009974,0.008282,22.943261,-0.027871,0.013072,31.100466,-0.065433,0.023533,39.291845,-0.101419,0.03098,47.540539,-0.134006,0.037108
7.416801,-0.000922,0.007246,15.665297,-0.021421,0.011577,23.896622,-0.053737,0.023101,32.061631,-0.118897,0.047292,40.141177,-0.191456,0.077176,48.077369,-0.26432,0.090351
6.524221,-0.000094,-0.002061,13.669287,-0.009227,-0.004767,20.767928,-0.008719,-0.008711,27.829623,-0.008356,-0.012927,34.828611,-0.019157,-0.015435,41.833203,0.015958,-0.015824
