In [28]:
from typing import *
from pathlib import Path
import numpy as np
import pandas as pd
import lightgbm as lgb
import pickle
from time import time
from functools import lru_cache
from sklearn.metrics import mean_squared_error as mse
import logging
# Disable the root logger (getLogger() without a name returns it).
# NOTE(review): the flag is set on the root logger object only — confirm this
# silences the library messages it was meant to hide.
logger = logging.getLogger()
logger.disabled = True


# Base directory for the dataset CSVs and the checkpoint feature pickles
# loaded by the training loop.
root = Path("/data/natsuki/dataset_atmaCup11")

# Model names whose extracted features are concatenated column-wise;
# each name maps to a checkpoint directory "<name>_<suffix>".
names = [
    "atma11simple_j4e5nofreeze",
    "atma11sortingdate_j4e5nofreeze",
    "atma11onehot_j4e5nofreeze",
    "atma11materialstechniques_j4e5nofreeze",
]

# Epoch number that appears in the feature pickle file names.
epoch = 10

# LightGBM hyper-parameters passed to lgb.train by the stacking loop.
# Starting point taken from:
# https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    # Speeds up training and curbs overfitting: only feature_fraction * 100 %
    # of the features are used.
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # The literal used to define 'verbose' twice (0, then -1); the later value
    # silently won, so only the effective one is kept.
    'verbose': -1,  # suppress warning
    'force_col_wise': True,
}

# Binary-classification LightGBM settings evaluated with AUC.
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
d_params = dict(
    objective="binary",
    metric="auc",
    seed=0,
    verbose=-1,
    learning_rate=0.1,
    num_leaves=100,
    min_data_in_leaf=1000,
)

_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    "seed": 0,
    "verbose": -1,
    "learning_rate": 0.1,
    "num_leaves": 100,
    "min_data_in_leaf": 1000,
}


def aug(
    arr: List[np.ndarray],
    funcs: str = "mean",
) -> np.ndarray:
    """Collapse a list of feature arrays into one flat feature vector.

    Args:
        arr: Arrays that can be stacked along a new leading axis.
        funcs: Comma-separated names of NumPy reductions, e.g.
            ``"mean,max,min,var"``. Each name is called as
            ``np.<name>(arr, axis=0)``. A leading ``all`` token prepends the
            flattened raw arrays instead of a reduction. Whitespace around
            tokens and empty tokens are ignored.

    Returns:
        A 1-D array: the raw values (if requested) followed by each
        flattened reduction, in the order given.

    Raises:
        ValueError: If ``funcs`` contains no token at all.
        AttributeError: If a token is not an attribute of ``numpy``.
    """
    tokens = [token.strip() for token in funcs.split(",")]
    tokens = [token for token in tokens if token]
    if not tokens:
        raise ValueError("funcs must name at least one reduction or 'all'")

    pieces = []
    # Only the *leading* token is the raw-feature marker. Matching the token
    # rather than the "all," prefix means a bare "all" no longer falls through
    # to np.all, which is a boolean reduction.
    if tokens[0] == "all":
        pieces.append(np.array(arr).flatten())
        tokens = tokens[1:]
    pieces.extend(getattr(np, name)(arr, axis=0).flatten() for name in tokens)
    return np.concatenate(pieces)
def enc(t: int):
    """Encode a target value; currently the identity (returns ``t`` unchanged)."""
    return t
@lru_cache(maxsize=None)
def load_df(path: Path) -> pd.DataFrame:
    """Read the CSV at ``path`` and index it by its ``object_id`` column.

    The ``object_id`` column is kept as a regular column as well. Results are
    memoised per ``path`` argument, so repeated calls return the *same*
    DataFrame object — callers should not mutate it in place.
    """
    table = pd.read_csv(path)
    indexed = table.set_index("object_id", drop=False)
    return indexed
@lru_cache(maxsize=None)
def load_dict(path: Path) -> Dict[str, List[np.ndarray]]:
    """Unpickle and return the object stored at ``path``.

    Results are memoised per ``path`` argument, so repeated calls return the
    same object without re-reading the file.
    """
    # NOTE: pickle.load can execute arbitrary code; only point this at
    # feature files produced by trusted runs.
    with open(path, "rb") as stream:
        return pickle.load(stream)
def post_process(pred: np.ndarray, lower: float = 0, upper: float = 3) -> np.ndarray:
    """Clip predictions into the closed interval ``[lower, upper]``.

    Args:
        pred: Raw model predictions.
        lower: Smallest value allowed in the output (default 0).
        upper: Largest value allowed in the output (default 3).

    Returns:
        A new array with every value limited to the interval; ``pred`` itself
        is not modified.
    """
    return pred.clip(lower, upper)
    

In [33]:
# Stacking run: three CV folds estimate the score and the number of boosting
# rounds, then a final model is fitted on the "all" split and written out as a
# submission CSV.
process = "clip"  # NOTE(review): not read anywhere in this cell
funcs = "mean,max,min,var"  # reductions handed to aug() for every object's feature list
coef = 1.5  # the final model runs for mean(best CV iteration) * coef boosting rounds
submission_dir = Path("./submissions")


scores = list()  # RMSE of each CV fold on its held-out split
stops = list()   # best iteration picked by early stopping in each CV fold
for suffix in ["3fold0", "3fold1", "3fold2", "all"]:
    is_cv_fold = suffix != "all"
    train_df = load_df(root/f"{suffix}_train.csv")
    test_df = load_df(root/f"{suffix}_test.csv")
    # load_df indexes each frame by object_id, so reading the column directly
    # gives the targets in row order without one `.loc` lookup (and one
    # temporary Series) per row. Assumes object_id is unique within a file.
    train_target = np.array([enc(t) for t in train_df["target"]]).reshape(-1, 1)
    if is_cv_fold:
        # Only the CV splits read a target column for the held-out rows.
        test_target = np.array([enc(t) for t in test_df["target"]]).reshape(-1, 1)

    # One feature matrix per model, rows ordered like the corresponding frame.
    train_features = list()
    test_features = list()
    for name in names:
        checkpoint_dir = root/"checkpoints"/f"{name}_{suffix}"
        train_dict = load_dict(checkpoint_dir/f"epoch{epoch}_{suffix}_train_features2.pkl")
        test_dict = load_dict(checkpoint_dir/f"epoch{epoch}_{suffix}_test_features2.pkl")
        train_features.append( np.array([aug(train_dict[object_id], funcs) for object_id in train_df["object_id"]]) )
        test_features.append( np.array([aug(test_dict[object_id], funcs) for object_id in test_df["object_id"]]) )
    # Stack the per-model matrices side by side.
    train_features = np.concatenate(train_features, axis=1)
    test_features = np.concatenate(test_features, axis=1)
    train_dataset = lgb.Dataset(train_features, label=train_target)

    # NOTE: `early_stopping_rounds` and `verbose_eval` are keyword arguments of
    # lgb.train in LightGBM 3.x; LightGBM 4.0 removed them in favour of the
    # `callbacks` argument, so this cell needs updating before upgrading.
    if is_cv_fold:
        print(f" Start CV {suffix} ".center(50, "#"))
        test_dataset = lgb.Dataset(test_features, label=test_target)
        model = lgb.train(
            params,
            train_set=train_dataset,
            valid_sets=(train_dataset, test_dataset),
            num_boost_round=10000,
            early_stopping_rounds=10,
            verbose_eval=1
        )
        pred = model.predict(test_features)
        pred = post_process(pred)
        scores.append( mse(test_target, pred)**.5 )  # RMSE
        stops.append( model.best_iteration )
        print(f" Finished CV {suffix} {model.best_iteration=} ".center(50, "#"))
    else:
        print(f" Start Pred {suffix} ".center(50, "#"))
        # The mean CV score only labels the output file; the mean best
        # iteration (scaled by coef) fixes the number of rounds because there
        # is no held-out split to early-stop on.
        score = str(np.mean(scores))[:6]
        stop = int(np.mean(stops)*coef)
        model = lgb.train(
            params,
            train_set=train_dataset,
            valid_sets=(train_dataset,),
            num_boost_round=stop,
            early_stopping_rounds=None,
            verbose_eval=1
        )
        pred = model.predict(test_features)
        pred = post_process(pred)
        pred_df = pd.DataFrame(pred, columns=["target"])
        fn = f"{str(int(time()))[-5:]}_{score}".replace(".", "")+".csv"
        # Create the output directory first so the write cannot fail on a
        # fresh checkout.
        submission_dir.mkdir(parents=True, exist_ok=True)
        pred_df.to_csv(submission_dir/fn, index=False)
        print(f" Finished {score=} {fn} ".center(50, "#"))

################ Start CV 3fold0 #################
[1]	training's l2: 0.850057	valid_1's l2: 0.885158
Training until validation scores don't improve for 10 rounds
[2]	training's l2: 0.771922	valid_1's l2: 0.839391
[3]	training's l2: 0.700955	valid_1's l2: 0.798599
[4]	training's l2: 0.636757	valid_1's l2: 0.760997
[5]	training's l2: 0.578775	valid_1's l2: 0.72855
[6]	training's l2: 0.526446	valid_1's l2: 0.700777
[7]	training's l2: 0.479038	valid_1's l2: 0.674628
[8]	training's l2: 0.436536	valid_1's l2: 0.652802
[9]	training's l2: 0.397939	valid_1's l2: 0.634596
[10]	training's l2: 0.362547	valid_1's l2: 0.617081
[11]	training's l2: 0.330677	valid_1's l2: 0.603331
[12]	training's l2: 0.301904	valid_1's l2: 0.59151
[13]	training's l2: 0.275889	valid_1's l2: 0.581339
[14]	training's l2: 0.252611	valid_1's l2: 0.571907
[15]	training's l2: 0.231174	valid_1's l2: 0.56449
[16]	training's l2: 0.211828	valid_1's l2: 0.558924
[17]	training's l2: 0.194282	valid_1's l2: 0.55413
[18]	training's l

25

In [80]:
# Scratch check: last five digits of the current Unix timestamp, the prefix
# used for submission file names.
str(int(time()))[-5:]

'07688'

In [73]:
# Scratch arithmetic: number of seconds in 12 hours.
12*60*60

43200

In [107]:
# Scratch check of str.center: pad the label to 140 characters with '#'.
f" hoge ".center(140, "#")

'################################################################### hoge ###################################################################'

In [24]:
# Scratch check: stacking two length-2 arrays and flattening yields one 1-D array.
np.array([np.array([1,2]), np.array([3,4])]).flatten()

array([1, 2, 3, 4])

In [27]:
# Scratch check: slicing off the first four characters drops the "all," prefix.
"all,mean"[4:]

'mean'

In [34]:
import pandas as pd
import numpy as np
# Turn one model's saved test predictions into a submission frame.
path = "/data/natsuki/dataset_atmaCup11/checkpoints/atma11simple_j4e5nofreeze_all/epoch5_all_test_features.csv"
df = pd.read_csv(path)
sub = (
    df.drop(columns="object_id")             # keep everything except the id column
    .rename(columns=lambda _name: "target")  # every remaining column is renamed to "target"
    .clip(lower=0, upper=3)                  # limit values to [0, 3]
)

In [35]:
# Display the clipped submission frame.
sub

Unnamed: 0,target
0,1.476527
1,2.046889
2,2.239879
3,2.203802
4,0.263124
...,...
5914,2.943335
5915,1.080175
5916,1.852636
5917,1.994541
