In [87]:
from typing import *
from pathlib import Path
import numpy as np
import pandas as pd
import lightgbm as lgb
import pickle
from time import time
from functools import lru_cache
from sklearn.metrics import mean_squared_error as mse
import logging
import scipy
from scipy.special import softmax
from IPython.display import clear_output


# Base directory under which the fold CSVs and the "checkpoints" folder are looked up.
root = Path("/data/natsuki/dataset_atmaCup11")

# Model names used to build the checkpoint directory names of the feature pickles.
names = [
    "atma11simple_j4e5nofreeze",
    "atma11sortingdate_j4e5nofreeze",
    "atma11onehot_j4e5nofreeze",
    "atma11materialstechniques_j4e5nofreeze",
]
# Epoch number embedded in the feature pickle file names.
epoch = 10

# LightGBM training parameters. Reference:
# https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "l2",
    "num_leaves": 31,
    "learning_rate": 0.05,
    # Used to speed up training and to curb overfitting: only
    # feature_fraction * 100 % of the features are used.
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "force_col_wise": True,
    "verbose": -1,  # suppress warning
}

def aug(
    arr: List[np.ndarray],
    funcs="mean",
    object_id=None,
    ) -> np.ndarray:
    """Aggregate a list of arrays into one flat feature vector.

    Args:
        arr: Arrays to aggregate; they are reduced along axis 0 (i.e. across
            the list), so they must be stackable by NumPy.
        funcs: Comma-separated names of NumPy reduction functions (for example
            ``"mean,max,min,var"``). Each name is looked up on the ``numpy``
            module and called as ``np.<name>(arr, axis=0)``. If the *first*
            token is ``all``, the raw flattened contents of ``arr`` are
            prepended instead of calling ``np.all``. Surrounding whitespace
            and empty tokens are ignored.
        object_id: Unused; accepted so existing call sites keep working.

    Returns:
        The 1-D concatenation of every requested piece, in the order given.

    Raises:
        ValueError: If ``funcs`` contains no usable token.
        AttributeError: If a token is not an attribute of ``numpy``.
    """
    # Tolerate "mean, max" and stray commas instead of failing in getattr.
    tokens = [tok.strip() for tok in funcs.split(",")]
    tokens = [tok for tok in tokens if tok]

    parts = []
    # Only a leading "all" is special; checking the token (not the "all,"
    # prefix) makes a bare "all" behave like "all,<nothing else>" rather than
    # silently falling through to the boolean reduction np.all.
    if tokens and tokens[0] == "all":
        parts.append(np.array(arr).flatten())
        tokens = tokens[1:]

    for tok in tokens:
        parts.append(getattr(np, tok)(arr, axis=0).flatten())

    if not parts:
        raise ValueError(f"funcs must name at least one aggregation, got {funcs!r}")
    return np.concatenate(parts)
def enc(t: int):
    """Encode a raw target value; currently the identity mapping.

    Args:
        t: The target value.

    Returns:
        ``t`` unchanged.
    """
    encoded = t
    return encoded
@lru_cache(maxsize=None)
def load_df(path: Path) -> pd.DataFrame:
    """Read a CSV file and index it by its ``object_id`` column.

    The result is memoised per ``path`` with no size limit, so repeated calls
    return the *same* DataFrame object; mutating it affects every caller.

    Args:
        path: Location of the CSV file; must contain an ``object_id`` column.

    Returns:
        The frame indexed by ``object_id``, with ``object_id`` also kept as a
        regular column.
    """
    frame = pd.read_csv(path)
    indexed = frame.set_index("object_id", drop=False)
    return indexed
@lru_cache(maxsize=None)
def load_dict(path: Path) -> Dict[str, List[np.ndarray]]:
    """Unpickle the object stored at ``path`` and memoise it per path.

    Repeated calls with the same ``path`` return the same object without
    re-reading the file (unbounded cache).

    Args:
        path: Location of the pickle file.

    Returns:
        Whatever object the pickle contains; annotated as a mapping from
        string keys to lists of arrays.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # this must only be pointed at trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)
def post_process(
    pred: np.ndarray,
    suffix=None,
    test_df=None,
    thr=None,
    ratio=None,
    width=None,
    ) -> np.ndarray:
    """Blend, override and snap predictions, then clip them to ``[0, 3]``.

    Steps, in order:

    1. Blend ``pred`` with the per-object mean of the "simple" model's test
       features: ``ratio * pred + (1 - ratio) * simple``.
    2. For each class ``i`` in ``0..3``, overwrite the prediction with ``i``
       wherever the per-object mean of the "onehot" model's features for
       class ``i`` exceeds ``thr``. Classes are applied in ascending order, so
       the highest qualifying class wins.
    3. Snap each value to the integer ``j`` in ``0..3`` when it lies strictly
       inside ``(j - width, j + width)``; the lowest matching ``j`` wins.
    4. Clip to ``[0, 3]``.

    Feature pickles are located via the module-level ``root`` and ``epoch``
    settings and loaded through ``load_dict``.

    Args:
        pred: 1-D predictions, one per row of ``test_df`` in the same order.
        suffix: Fold identifier used in checkpoint directory and file names.
        test_df: Frame with an ``object_id`` column.
        thr: Threshold for the onehot override in step 2.
        ratio: Weight of ``pred`` in the blend of step 1.
        width: Half-width of the snapping window in step 3.

    Returns:
        A new array with the processed predictions; the input is not modified.
    """
    checkpoints = root / "checkpoints"
    object_ids = test_df["object_id"]

    # Use the configured epoch rather than a hard-coded "epoch10" so these
    # features always come from the same checkpoint as the stacker's inputs.
    simple_dict = load_dict(checkpoints/f"atma11simple_j4e5nofreeze_{suffix}"/f"epoch{epoch}_{suffix}_test_features2.pkl")
    # flatten() assumes each per-object mean has exactly one element -- TODO confirm.
    simple_np = np.array([ np.mean(simple_dict[object_id], axis=0) for object_id in object_ids ]).flatten()
    pred = ratio*pred + (1-ratio)*simple_np  # new array; caller's `pred` is left intact

    onehot_dict = load_dict(checkpoints/f"atma11onehot_j4e5nofreeze_{suffix}"/f"epoch{epoch}_{suffix}_test_features2.pkl")
    # The trailing reshape keeps the array 2-D even when test_df has no rows.
    onehot_np = np.array([ np.mean(onehot_dict[object_id], axis=0).reshape(4) for object_id in object_ids ]).reshape(-1, 4)
    for i in range(4):
        pred[onehot_np[:, i] > thr] = i

    # Snap against a snapshot so every window is tested on the pre-snap value;
    # iterating downwards makes the lowest matching integer win on overlap.
    before_snap = pred.copy()
    for j in reversed(range(4)):
        pred[(j-width < before_snap) & (before_snap < j+width)] = j
    return pred.clip(0, 3)
    

In [88]:
# ---- Run configuration -------------------------------------------------------
process = "clip"  # not referenced in the visible cells
funcs = "mean,max,min,var"  # aggregations handed to aug() for every model's features
coef = 1  # multiplier applied to the mean CV best iteration for the final fit
thr = 0.94  # post_process: onehot value above which the class index overrides the prediction
ratio = 0.5  # post_process: weight of the LightGBM prediction in the blend
width = 0 # Going above 0.5 would be nonsensical; in practice this had no effect...

# NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword arguments
# used below were removed from lgb.train in LightGBM 4.0; when upgrading, pass
# callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)] instead.

# ---- Cross-validate on the three folds, then fit on "all" and write a submission ----
scores = list()  # per-fold RMSE after post-processing
stops = list()   # per-fold best iteration found by early stopping
for suffix in ["3fold0", "3fold1", "3fold2", "all"]:
    train_df = load_df(root/f"{suffix}_train.csv")
    test_df = load_df(root/f"{suffix}_test.csv")
    # Read the target column directly: rows are already in object_id order, and
    # this avoids one .loc lookup per row (which also misbehaves on duplicate ids).
    train_target = np.array([enc(t) for t in train_df["target"]]).reshape(-1, 1)
    if suffix != "all":
        test_target = np.array([enc(t) for t in test_df["target"]]).reshape(-1, 1)

    # One feature matrix per upstream model, concatenated column-wise.
    test_features = list()
    train_features = list()
    for name in names:
        train_dict = load_dict(root/"checkpoints"/f"{name}_{suffix}"/f"epoch{epoch}_{suffix}_train_features2.pkl")
        test_dict = load_dict(root/"checkpoints"/f"{name}_{suffix}"/f"epoch{epoch}_{suffix}_test_features2.pkl")
        train_features.append( np.array([aug(train_dict[object_id], funcs, object_id) for object_id in train_df["object_id"]]) )
        test_features.append( np.array([aug(test_dict[object_id], funcs, object_id) for object_id in test_df["object_id"]]))
    train_features = np.concatenate(train_features, axis=1)
    test_features = np.concatenate(test_features, axis=1)
    train_dataset = lgb.Dataset(train_features, label=train_target)

    if suffix != "all":
        # CV fold: early-stop on the held-out split and record score / best iteration.
        print(f" Start CV {suffix} ".center(50, "#"))
        test_dataset = lgb.Dataset(test_features, label=test_target)
        model = lgb.train(
            params,
            train_set=train_dataset,
            valid_sets=(train_dataset, test_dataset),
            num_boost_round=10000,
            early_stopping_rounds=10,
            verbose_eval=1
        )
        pred = model.predict(test_features)
        pred = post_process(pred, suffix=suffix, test_df=test_df, thr=thr, ratio=ratio, width=width) # XXX
        scores.append( mse(test_target, pred)**.5 )
        stops.append( model.best_iteration )
        print(f" Finished CV {suffix} {model.best_iteration=} ".center(50, "#"))
    else:
        # Final fit on all data: no validation split, so train for the
        # (scaled) average number of rounds the CV folds needed.
        print(f" Start Pred {suffix} ".center(50, "#"))
        score = np.mean(scores)
        stop = int(np.mean(stops)*coef)
        model = lgb.train(
            params,
            train_set=train_dataset,
            valid_sets=(train_dataset,),
            num_boost_round=stop,
            early_stopping_rounds=None,
            verbose_eval=1
        )
        pred = model.predict(test_features)
        pred = post_process(pred, suffix=suffix, test_df=test_df, thr=thr, ratio=ratio, width=width) #XXX
        pred_df = pd.DataFrame(pred, columns=["target"])
        # File name: last five digits of the Unix time + the CV score without the dot.
        fn = f"{str(int(time()))[-5:]}_{str(score)[:6]}".replace(".", "")+".csv"
        # Create the output folder up front so a missing directory cannot
        # throw away the finished training run.
        submission_dir = Path("./submissions")
        submission_dir.mkdir(parents=True, exist_ok=True)
        pred_df.to_csv(submission_dir/fn, index=False)
        clear_output()
        print(f" Finished {fn} ".center(50, "#"))
old_score = score

######## Finished 0.7174 31007_07174.csv #########
