In [11]:
from typing import *
from pathlib import Path
import numpy as np
import pandas as pd
import lightgbm as lgb
import pickle
from time import time
from functools import lru_cache
from sklearn.metrics import mean_squared_error as mse
import logging
# Fetch the root logger (getLogger() without a name) and mark it disabled so
# that log records reaching it are not emitted into the notebook output.
logger = logging.getLogger()
logger.disabled = True


# Dataset root: the per-split CSV files and the "checkpoints" directory are
# resolved relative to this path.
root = Path("/data/natsuki/dataset_atmaCup11")

# Run names of the feature extractors; for every name one train and one test
# feature pickle is loaded per split and the features are concatenated.
names = [
    "atma11simple_j4e5nofreeze",
    "atma11sortingdate_j4e5nofreeze",
    "atma11onehot_j4e5nofreeze",
    "atma11materialstechniques_j4e5nofreeze",
    "atma11sortingdate_j4nofreeze",
]

# Epoch number embedded in the feature pickle file names.
epoch = 10

# LightGBM regression settings used for both the CV folds and the final fit.
# Reference: https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    # Used to speed up training and curb overfitting: only
    # feature_fraction * 100 % of the features are used.
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'force_col_wise': True,
    # Suppress warnings. This is the only 'verbose' entry on purpose: a
    # duplicated key in a dict literal silently keeps just the last value.
    'verbose': -1,
}

# Alternative LightGBM settings: binary objective evaluated with AUC.
# Not referenced by the cells visible in this notebook export.
d_params = dict(
    objective="binary",
    metric="auc",
    seed=0,
    verbose=-1,
    learning_rate=0.1,
    num_leaves=100,
    min_data_in_leaf=1000,
)

_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    "seed": 0,
    "verbose": -1,
    "learning_rate": 0.1,
    "num_leaves": 100,
    "min_data_in_leaf": 1000,
}


def aug(
    arr: List[np.ndarray],
    funcs="mean",
    ) -> np.ndarray:
    """Reduce a stack of arrays with NumPy reductions and concatenate the results.

    Args:
        arr: Sequence of arrays that NumPy can stack; every reduction is
            applied along axis 0, i.e. across the entries of the sequence.
        funcs: Comma-separated names of ``numpy`` functions that accept an
            ``axis`` argument, e.g. ``"mean"`` or ``"mean,std,max"``.
            Whitespace around a name and empty entries (such as a trailing
            comma) are ignored.

    Returns:
        A 1-D array holding the flattened output of each reduction, in the
        order the names were given.

    Raises:
        ValueError: If ``funcs`` contains no function name at all.
        AttributeError: If a name is not an attribute of ``numpy``.
    """
    # Tolerate "mean, std" and "mean," instead of failing inside getattr().
    reducer_names = [name.strip() for name in funcs.split(",") if name.strip()]
    if not reducer_names:
        raise ValueError(f"funcs must name at least one numpy reduction, got {funcs!r}")
    reduced = [
        getattr(np, name)(arr, axis=0).flatten()
        for name in reducer_names
    ]
    return np.concatenate(reduced)
def enc(t: int) -> int:
    """Encode a raw target value before it is used as a training label.

    This is currently the identity mapping: the value is handed back
    untouched.
    """
    encoded = t
    return encoded
@lru_cache(maxsize=None)
def load_df(path: Path) -> pd.DataFrame:
    """Read a CSV file and index it by its ``object_id`` column.

    ``object_id`` stays available as a regular column as well
    (``drop=False``). Results are memoised per ``path`` by ``lru_cache``, so
    repeated calls with the same path return the very same DataFrame object.
    """
    frame = pd.read_csv(path)
    indexed = frame.set_index("object_id", drop=False)
    return indexed
@lru_cache(maxsize=None)
def load_dict(path: Path) -> Dict[str, List[np.ndarray]]:
    """Unpickle the object stored at ``path`` and return it.

    Results are memoised per ``path`` by ``lru_cache``, so the same object is
    returned on repeated calls with the same path.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only point this at files you produced or otherwise trust.
    """
    with open(path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded
def post_process(pred: np.ndarray) -> np.ndarray:
    """Clamp every prediction to the closed interval [0, 3]."""
    lower, upper = 0, 3
    clipped = pred.clip(lower, upper)
    return clipped
    

In [13]:
# --- Run configuration --------------------------------------------------------
# `process`, `funcs`, `coef` and `comment` are all written into the submission
# file name so that a CSV can be traced back to the settings that produced it.
process = "clip"    # tag for the post-processing step, used in the file name
funcs = "mean"      # reductions forwarded to aug() when building the features
coef = 1            # multiplier applied to the mean CV stopping round for the final fit
num_boost_round=50  # NOTE(review): not read anywhere in this cell
comment="add_sortdatej4"

# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments
# of lgb.train() in LightGBM < 4.0; newer releases expect the callbacks
# lgb.early_stopping() / lgb.log_evaluation() instead. Confirm the installed
# version before upgrading the environment.

scores = list()  # RMSE on the held-out split, one entry per CV fold
stops = list()   # best iteration picked by early stopping, one entry per CV fold
for suffix in ["3fold0", "3fold1", "3fold2", "all"]:
    # The three "3fold*" suffixes are evaluated; "all" produces the submission.
    is_cv_fold = suffix != "all"

    train_df = load_df(root/f"{suffix}_train.csv")
    test_df = load_df(root/f"{suffix}_test.csv")

    # Read the targets straight from the column (row order) instead of doing a
    # per-row .loc lookup by object_id, which is slow and misbehaves when an
    # object_id occurs more than once.
    train_target = np.array([enc(t) for t in train_df["target"]]).reshape(-1, 1)
    if is_cv_fold:
        test_target = np.array([enc(t) for t in test_df["target"]]).reshape(-1, 1)

    # One feature block per extractor run, concatenated column-wise below.
    train_blocks = list()
    test_blocks = list()
    for name in names:
        checkpoint_dir = root/"checkpoints"/f"{name}_{suffix}"
        train_dict = load_dict(checkpoint_dir/f"epoch{epoch}_{suffix}_train_features2.pkl")
        test_dict = load_dict(checkpoint_dir/f"epoch{epoch}_{suffix}_test_features2.pkl")
        # Pass `funcs` explicitly so the features really match the value that
        # is recorded in the submission file name.
        train_blocks.append( np.array([aug(train_dict[object_id], funcs) for object_id in train_df["object_id"]]) )
        test_blocks.append( np.array([aug(test_dict[object_id], funcs) for object_id in test_df["object_id"]]) )
    train_features = np.concatenate(train_blocks, axis=1)
    test_features = np.concatenate(test_blocks, axis=1)

    train_dataset = lgb.Dataset(train_features, label=train_target)
    if is_cv_fold:
        print(f" Start CV {suffix} ".center(140, "#"))
        test_dataset = lgb.Dataset(test_features, label=test_target)
        # Train with a generous round budget and let early stopping pick the
        # iteration count.
        model = lgb.train(
            params,
            train_set=train_dataset,
            valid_sets=(train_dataset, test_dataset),
            num_boost_round=10000,
            early_stopping_rounds=10,
            verbose_eval=1
        )
        pred = model.predict(test_features)
        pred = post_process(pred)
        scores.append( mse(test_target, pred)**.5 )  # RMSE
        stops.append( model.best_iteration )
        print(f" Finished CV {suffix} {model.best_iteration=} ".center(140, "#"))
    else:
        print(f" Start Pred {suffix} ".center(140, "#"))
        # Mean CV RMSE, truncated to seven characters for the file name.
        score = str(np.mean(scores))[:7]
        # No hold-out exists here, so train for a fixed number of rounds
        # derived from the folds' early-stopping points.
        stop = int(np.mean(stops)*coef)
        model = lgb.train(
            params,
            train_set=train_dataset,
            valid_sets=(train_dataset,),
            num_boost_round=stop,
            early_stopping_rounds=None,
            verbose_eval=1
        )
        pred = model.predict(test_features)
        pred = post_process(pred)
        pred_df = pd.DataFrame(pred, columns=["target"])
        # File name: last five digits of the Unix time + run settings + CV score,
        # with every "." removed before the ".csv" extension is appended.
        fn = f"{str(int(time()))[-5:]}_{comment}_coef{coef}_stop{stop}_{funcs}_{process}_{score}".replace(".", "")+".csv"
        # Create the output directory on first use instead of failing in to_csv().
        submission_dir = Path("./submissions")
        submission_dir.mkdir(parents=True, exist_ok=True)
        pred_df.to_csv(submission_dir/fn, index=False)
        print(f" Finished {score=} {fn} ".center(140, "#"))

atma11simple_j4e5nofreeze
atma11sortingdate_j4e5nofreeze
atma11onehot_j4e5nofreeze
atma11materialstechniques_j4e5nofreeze
atma11sortingdate_j4nofreeze
############################################################# Start CV 3fold0 ##############################################################
[1]	training's l2: 0.850247	valid_1's l2: 0.886157
Training until validation scores don't improve for 10 rounds
[2]	training's l2: 0.771788	valid_1's l2: 0.839623
[3]	training's l2: 0.701399	valid_1's l2: 0.798679
[4]	training's l2: 0.637274	valid_1's l2: 0.762543
[5]	training's l2: 0.579431	valid_1's l2: 0.73108
[6]	training's l2: 0.527213	valid_1's l2: 0.703947
[7]	training's l2: 0.480055	valid_1's l2: 0.680481
[8]	training's l2: 0.437388	valid_1's l2: 0.660004
[9]	training's l2: 0.398865	valid_1's l2: 0.641006
[10]	training's l2: 0.365841	valid_1's l2: 0.624316
[11]	training's l2: 0.333795	valid_1's l2: 0.610099
[12]	training's l2: 0.305088	valid_1's l2: 0.598483
[13]	training's l2: 0.279204	vali

In [100]:
# Scratch check: show `best_iteration` of whichever `model` is currently bound in the kernel.
model.best_iteration

25

In [80]:
# Scratch check: last five digits of the current Unix time in whole seconds, as a string.
str(int(time()))[-5:]

'07688'

In [73]:
# Scratch arithmetic: 12 * 60 * 60 (e.g. twelve hours expressed in seconds).
12*60*60

43200

In [107]:
# Scratch check: centre " hoge " in a 140-character line padded with "#" on both sides.
f" hoge ".center(140, "#")

'################################################################### hoge ###################################################################'