In [46]:
from typing import *
from pathlib import Path
import numpy as np
import pandas as pd
import lightgbm as lgb
import pickle
from time import time
from functools import lru_cache
from sklearn.metrics import mean_squared_error as mse

# Dataset root directory; the fold CSVs and the "checkpoints" folder are read from here.
root = Path("/data/natsuki/dataset_atmaCup11")
# Checkpoint name prefixes whose extracted features are concatenated per sample.
# (A bare, unused string "atma11sortingdate_j4nofreeze" used to sit here as a
# no-op statement; it is kept only as this note.)
names = [
    "atma11simple_j4e5nofreeze",
    "atma11sortingdate_j4e5nofreeze",
    "atma11onehot_j4e5nofreeze",
    "atma11materialstechniques_j4e5nofreeze",
]
# Epoch number embedded in the feature pickle file names ("epoch{epoch}_...").
epoch = 10

# https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
# LightGBM regression settings passed to lgb.train in the training cell.
params = dict(
    boosting_type="gbdt",
    objective="regression",
    metric="l2",
    num_leaves=31,
    learning_rate=0.05,
    # Used to speed up training and to curb overfitting: only
    # feature_fraction * 100 % of the features are used.
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

# Binary-classification settings evaluated with AUC.
# NOTE(review): not referenced by any of the visible cells.
d_params = dict(
    objective="binary",
    metric="auc",
    seed=0,
    verbose=-1,
    learning_rate=0.1,
    num_leaves=100,
    min_data_in_leaf=1000,
)

_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    "seed": 0,
    "verbose": -1,
    "learning_rate": 0.1,
    "num_leaves": 100,
    "min_data_in_leaf": 1000,
}


def aug(arr: List[np.ndarray]) -> np.ndarray:
    """Average the arrays in ``arr`` element-wise and return the result as 1-D.

    The mean is taken over the first axis (i.e. across the list entries) and
    the averaged array is then flattened.
    """
    averaged = np.mean(arr, axis=0)
    flat = averaged.flatten()
    return flat
def enc(t: int):
    """Encode a target value; currently the identity, so ``t`` is returned unchanged."""
    return t
@lru_cache(maxsize=None)
def load_df(path: Path) -> pd.DataFrame:
    """Read the CSV at ``path`` and index it by its ``object_id`` column.

    ``object_id`` also stays available as a regular column (``drop=False``).
    Results are memoised per ``path``, so repeated calls hand back the same
    DataFrame object.
    """
    frame = pd.read_csv(path)
    indexed = frame.set_index("object_id", drop=False)
    return indexed
@lru_cache(maxsize=None)
def load_dict(path: Path) -> Dict[str, List[np.ndarray]]:
    """Unpickle and return the object stored at ``path`` (memoised per path).

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so this
    must only be pointed at files from a trusted source.
    """
    with open(path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

In [49]:
# Train one LightGBM regressor per CV fold on the concatenated features of all
# checkpoints in `names`, collect the per-fold RMSE, and print the mean RMSE
# once the final "all" pass is reached.
scores = list()
best_params = list()  # NOTE(review): never filled in this cell.
for suffix in ["3fold0", "3fold1", "3fold2", "all"]:
    is_cv_fold = suffix != "all"
    train_df = load_df(root/f"{suffix}_train.csv")
    test_df = load_df(root/f"{suffix}_test.csv")

    # Read the target column directly (row order matches train_df["object_id"])
    # instead of doing one `.loc` lookup per row, which is slow and returns a
    # frame rather than a scalar when an object_id occurs more than once.
    train_target = np.array([enc(t) for t in train_df["target"]]).reshape(-1, 1)
    if is_cv_fold:
        test_target = np.array([enc(t) for t in test_df["target"]]).reshape(-1, 1)

    # One feature matrix per checkpoint, in the row order of the data frames.
    train_blocks = list()
    test_blocks = list()
    for name in names:
        checkpoint_dir = root/"checkpoints"/f"{name}_{suffix}"
        train_dict = load_dict(checkpoint_dir/f"epoch{epoch}_{suffix}_train_features2.pkl")
        test_dict = load_dict(checkpoint_dir/f"epoch{epoch}_{suffix}_test_features2.pkl")
        train_blocks.append(np.array([aug(train_dict[object_id]) for object_id in train_df["object_id"]]))
        if is_cv_fold:
            test_blocks.append(np.array([aug(test_dict[object_id]) for object_id in test_df["object_id"]]))

    # Side-by-side concatenation: each checkpoint contributes its own columns.
    train_features = np.concatenate(train_blocks, axis=1)
    # lgb.Dataset documents 1-D labels, so the (n, 1) target is flattened here.
    train_dataset = lgb.Dataset(train_features, label=train_target.ravel())

    if is_cv_fold:
        test_features = np.concatenate(test_blocks, axis=1)
        test_dataset = lgb.Dataset(test_features, label=test_target.ravel())
        # NOTE(review): early stopping monitors the same held-out fold that is
        # scored below, so the reported RMSE is optimistic.
        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect
        # `callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)]` —
        # confirm against the installed version before upgrading.
        model = lgb.train(
            params,
            train_set=train_dataset,
            valid_sets=(train_dataset, test_dataset),
            num_boost_round=10000,
            early_stopping_rounds=10,
            verbose_eval=10
        )
        pred = model.predict(test_features)
        # Root of the mean squared error on the held-out fold.
        scores.append( mse(test_target.ravel(), pred)**.5 )
    else:
        # NOTE(review): no model is fitted on the "all" split in this cell;
        # `train_dataset` is built but only the CV summary is printed.
        # Format numerically (4 decimals) rather than slicing the string form,
        # which would mangle values printed in scientific notation.
        score = f"{np.mean(scores):.4f}"
        print(f"\x1b[31;1m{score=}\x1b[m")



You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 10 rounds
[10]	training's l2: 0.364928	valid_1's l2: 0.623319
[20]	training's l2: 0.153189	valid_1's l2: 0.549771
[30]	training's l2: 0.074018	valid_1's l2: 0.546577
Early stopping, best iteration is:
[26]	training's l2: 0.0966657	valid_1's l2: 0.544543




You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 10 rounds
[10]	training's l2: 0.364437	valid_1's l2: 0.627013
[20]	training's l2: 0.153969	valid_1's l2: 0.55545
[30]	training's l2: 0.0762862	valid_1's l2: 0.548834
Early stopping, best iteration is:
[28]	training's l2: 0.0864575	valid_1's l2: 0.547576




You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 10 rounds
[10]	training's l2: 0.365616	valid_1's l2: 0.619597
[20]	training's l2: 0.152112	valid_1's l2: 0.539377
[30]	training's l2: 0.0727451	valid_1's l2: 0.536046
Early stopping, best iteration is:
[25]	training's l2: 0.102783	valid_1's l2: 0.532702
[31;1mscore='0.7359'[m


Training until validation scores don't improve for 10 rounds
[10]	training's l2: 0.366158	valid_1's l2: 0.647864
[20]	training's l2: 0.256479	valid_1's l2: 0.640012
Early stopping, best iteration is:
[15]	training's l2: 0.293257	valid_1's l2: 0.636124




In [48]:
# Show the best iteration recorded on `model`, i.e. the booster most recently
# assigned by the training loop cell (that cell has to be run first).
model.best_iteration

25

In [26]:
# Hand-computed mean squared error on a small four-element example.
y_true = np.array([3, -0.5, 2, 7])
y_pred = np.array([2.5, 0.0, 2, 8])
np.sum(np.square(y_true - y_pred)) / 4

0.375

2.23606797749979