# Nishika 金融時系列予測

---

## セットアップ

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

In [None]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## 前処理

In [None]:
# Focal loss implementation for LightGBM
# https://maxhalford.github.io/blog/lightgbm-focal-loss/

import numpy as np
from scipy import optimize
from scipy import special

class FocalLoss:
    """Binary focal loss plus the derivatives needed for a custom LightGBM objective.

    For a label ``y`` and predicted probability ``p`` the per-sample loss is
    ``-at * (1 - pt) ** gamma * log(pt)``, where ``pt`` is the probability
    assigned to the true class and ``at`` is an optional class weight.

    Args:
        gamma: Focusing exponent applied to ``(1 - pt)``; ``gamma=0`` reduces
            the loss to (optionally weighted) log loss.
        alpha: Weight given to positive samples (negatives get ``1 - alpha``).
            ``None`` disables class weighting.
    """

    def __init__(self, gamma, alpha=None):
        self.gamma = gamma
        self.alpha = alpha

    def at(self, y):
        """Return the per-sample class weight for labels ``y``."""
        if self.alpha is not None:
            return np.where(y, self.alpha, 1 - self.alpha)
        # No class weighting requested: every sample gets weight one.
        return np.ones_like(y)

    def pt(self, y, p):
        """Return the probability assigned to the true class of each sample.

        ``p`` is clipped away from 0 and 1 so that ``log(pt)`` stays finite.
        """
        clipped = np.clip(p, 1e-15, 1 - 1e-15)
        return np.where(y, clipped, 1 - clipped)

    def __call__(self, y_true, y_pred):
        """Return the element-wise focal loss of probabilities ``y_pred``."""
        weight = self.at(y_true)
        prob_true = self.pt(y_true, y_pred)
        modulating = (1 - prob_true) ** self.gamma
        return -weight * modulating * np.log(prob_true)

    def grad(self, y_true, y_pred):
        """Return the first derivative of the loss w.r.t. the pre-sigmoid score.

        ``y_pred`` is the probability (already passed through the sigmoid).
        """
        sign = 2 * y_true - 1  # map labels {0, 1} to {-1, +1}
        weight = self.at(y_true)
        prob_true = self.pt(y_true, y_pred)
        gamma = self.gamma

        outer = weight * sign * (1 - prob_true) ** gamma
        inner = gamma * prob_true * np.log(prob_true) + prob_true - 1
        return outer * inner

    def hess(self, y_true, y_pred):
        """Return the second derivative of the loss w.r.t. the pre-sigmoid score.

        The gradient is the product ``outer * inner``; it is differentiated with
        the product rule in ``pt`` and then multiplied by ``d pt / d score``.
        """
        sign = 2 * y_true - 1  # map labels {0, 1} to {-1, +1}
        weight = self.at(y_true)
        prob_true = self.pt(y_true, y_pred)
        gamma = self.gamma

        # The two gradient factors and their derivatives with respect to pt.
        outer = weight * sign * (1 - prob_true) ** gamma
        d_outer = -weight * sign * gamma * (1 - prob_true) ** (gamma - 1)
        inner = gamma * prob_true * np.log(prob_true) + prob_true - 1
        d_inner = gamma * np.log(prob_true) + gamma + 1

        product_rule = d_outer * inner + outer * d_inner
        # sign * pt * (1 - pt) is the derivative of pt with respect to the score.
        sigmoid_slope = prob_true * (1 - prob_true)
        return product_rule * sign * sigmoid_slope

    def init_score(self, y_true):
        """Return the log-odds of the constant probability minimising the total loss."""

        def total_loss(p):
            return self(y_true, p).sum()

        result = optimize.minimize_scalar(
            total_loss,
            bounds=(0, 1),
            method='bounded'
        )
        best_p = result.x
        return np.log(best_p / (1 - best_p))

    def lgb_obj(self, preds, train_data):
        """Custom objective callback: return ``(grad, hess)`` for raw scores ``preds``.

        ``train_data`` must provide ``get_label()``.
        """
        labels = train_data.get_label()
        probs = special.expit(preds)
        gradient = self.grad(labels, probs)
        hessian = self.hess(labels, probs)
        return gradient, hessian

    def lgb_eval(self, preds, train_data):
        """Custom metric callback: return ``(name, mean focal loss, is_higher_better)``."""
        labels = train_data.get_label()
        probs = special.expit(preds)
        mean_loss = self(labels, probs).mean()
        # Lower focal loss is better, hence the trailing False.
        return 'focal_loss', mean_loss, False

In [None]:
def estimate_loss_importance(train, test, x_columns):
    """Estimate a covariate-shift importance weight for every row of ``train``.

    A LightGBM classifier with a focal-loss objective is trained, under 5-fold
    stratified cross-validation, to tell test rows (``is_test == 1``) from
    training rows (``is_test == 0``) using ``x_columns``. The out-of-fold
    probability of being a test row is then turned into the weight
    ``(p_test / n_test) / (p_train / n_train)``.

    Args:
        train: Training DataFrame; must contain every column in ``x_columns``.
        test: Test DataFrame; must contain every column in ``x_columns``.
        x_columns: Names of the feature columns given to the classifier.

    Returns:
        A 1-D NumPy array with one weight per row of ``train``, in the same
        row order as ``train``.
    """
    n_train = train.shape[0]
    n_test = test.shape[0]

    # Training rows come first and the index is reset, so positions
    # 0 .. n_train - 1 of ``traintest`` are exactly the rows of ``train``.
    traintest = pd.concat([
        train.assign(is_test=0),
        test.assign(is_test=1)
    ], ignore_index=True)
    traintest['p_test'] = 0.0

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    fl = FocalLoss(alpha=None, gamma=3)
    for train_index, test_index in skf.split(traintest, traintest['is_test']):
        # Constant starting score, fitted once per fold on the training labels
        # only and reused for training, validation and prediction so all three
        # share the same offset.
        base_score = fl.init_score(traintest.loc[train_index, 'is_test'])

        # NOTE(review): the `fobj` keyword is understood to exist only in
        # LightGBM < 4.0 (later releases take the callable through
        # params['objective']) - confirm against the installed version.
        model = lgb.train(
            params={
                'metric': 'auc',
                'learning_rate': 0.05,
                'feature_fraction': 0.8,
                'verbose': -1
            },
            fobj=fl.lgb_obj,
            num_boost_round=200,
            train_set=lgb.Dataset(
                data=traintest.loc[train_index, x_columns],
                label=traintest.loc[train_index, 'is_test'],
                init_score=np.full(len(train_index), base_score, dtype=float)
            ),
            valid_sets=[
                lgb.Dataset(
                    data=traintest.loc[test_index, x_columns],
                    label=traintest.loc[test_index, 'is_test'],
                    init_score=np.full(len(test_index), base_score, dtype=float)
                )
            ],
            callbacks=[
                lgb.log_evaluation(10)
            ]
        )
        # predict() is combined with the starting score here, then mapped to a
        # probability with the sigmoid.
        traintest.loc[test_index, 'p_test'] = special.expit(
            base_score +
            model.predict(traintest.loc[test_index, x_columns])
        )

    traintest['p_train'] = 1 - traintest['p_test']
    traintest['importance'] = (traintest['p_test'] / n_test) / (traintest['p_train'] / n_train)

    # Select the training rows by position instead of merging on `id`: a merge
    # against the combined frame duplicates rows (or picks up a test row's
    # weight) whenever an id is repeated or shared between train and test.
    return traintest['importance'].to_numpy()[:n_train]

In [None]:
# Bucket rows into groups of 10000 consecutive ids.
train['group'] = train['id'].floordiv(10000)

# Every column except the identifier, the label and the group key is a feature.
non_feature_columns = ['id', 'target', 'group']
x_columns = train.columns.drop(non_feature_columns).tolist()

# Per-row training weight computed from the train and test tables.
train['importance'] = estimate_loss_importance(train, test, x_columns)

## 学習

In [None]:
# Fit ten regression models, each on a different 80% random sample of the
# training rows, weighting every row by its importance.
regression_params = {
    'objective': 'regression',
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'verbose': -1,
}

models = []
for seed in range(10):
    print(seed)
    train_sample = train.sample(frac=0.8, random_state=seed)
    sample_set = lgb.Dataset(
        data=train_sample[x_columns],
        label=train_sample['target'],
        # Exponent 1.0 leaves the weights unchanged.
        weight=train_sample['importance'] ** 1.0,
    )
    # A fresh dict per call, as when the literal is written inline.
    model = lgb.train(
        params=dict(regression_params),
        num_boost_round=400,
        train_set=sample_set,
    )
    models.append(model)

## 予測

In [None]:
# Average the predictions of all trained models for every test row.
test_features = test[x_columns]
per_model_preds = [fitted.predict(test_features) for fitted in models]
test_y_pred = np.asarray(per_model_preds).mean(axis=0)

## 提出用ファイルの作成

In [None]:
# Write the test ids next to their averaged predictions as the submission file.
submission = pd.DataFrame({'id': test['id'], 'target': test_y_pred})
submission.to_csv('submission.csv', index=False, header=True)