# #15 atmaCup 正則化の強さを変えたときの精度 (行列分解なし) 提出用

比較対象：

- SVD (正則化なし)
- SVD (正則化あり)
- NMF (正則化なし)
- NMF (正則化あり)
- 行列分解なし  ← 本notebookはこれを、学習率0.01で学習させなおしたもの

---

## セットアップ

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb

In [None]:
# Load the competition tables; paths are relative to the working directory.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)
# In the anime table the literal string 'Unknown' is parsed as a missing value.
anime = pd.read_csv('data/anime.csv', na_values=['Unknown'])

## 前処理

### 共通の前処理

In [None]:
# Attach the anime table's columns to every train / test row. A left join is
# used, so rows whose anime_id has no match in `anime` are kept (with NaNs).
train_anime = pd.merge(train, anime, how='left', on='anime_id')
test_anime = pd.merge(test, anime, how='left', on='anime_id')
# Stack train rows followed by test rows under a fresh 0..n-1 index.
traintest_anime = pd.concat(objs=(train_anime, test_anime), ignore_index=True)

### ユーザーとアニメの分散表現の獲得

In [None]:
# Every (user, anime) pair that occurs in train or test, tagged with a
# constant 1 so that it can be pivoted into 0/1 indicator matrices.
interactions = traintest_anime.loc[:, ['user_id', 'anime_id']].assign(one=1)

# Rows = users, columns = anime: 1 where the pair occurs, 0 otherwise.
user_anime = interactions.pivot(
    index='user_id', columns='anime_id', values='one'
).fillna(0)
# Rows = anime, columns = users: the same indicator in the other orientation.
anime_user = interactions.pivot(
    index='anime_id', columns='user_id', values='one'
).fillna(0)

# Target and grouping key, taken straight from the raw training table.
y_train, user_id_train = train['score'], train['user_id']

# Training features: for each rating row, the user's indicator row followed by
# the anime's indicator row; the ids and the target are dropped afterwards.
# The merges are nested (not bound to names) so the large intermediate frames
# are released as soon as they are no longer needed.
x_train = pd.merge(
    pd.merge(train, user_anime, how='left', on='user_id'),
    anime_user,
    how='left',
    on='anime_id',
).drop(columns=['score', 'user_id', 'anime_id'])

## 学習

In [None]:
from sklearn.model_selection import KFold, GroupKFold

class UnknownUserKFold:
    """K-fold splitter whose validation folds also contain held-out groups.

    Each fold combines two independent splits of the same data:

    * a shuffled ``KFold`` with ``n_splits_cv`` folds, and
    * a ``GroupKFold`` with ``n_splits_uu`` folds over ``groups``.

    For fold *i*, the rows of the *i*-th ``GroupKFold`` test fold are removed
    from the ``KFold`` training indices and added to the ``KFold`` test
    indices, so every row of those groups ends up on the validation side.

    Per the original author's note, the goal is to mimic the test data, where
    ratings by users absent from the training data make up roughly 23% of the
    rows (the notebook uses ``n_splits_cv=5, n_splits_uu=18`` for this).

    Parameters
    ----------
    n_splits_cv : int
        Number of folds of the shuffled ``KFold``; also the number of folds
        this splitter yields.
    n_splits_uu : int
        Number of folds of the ``GroupKFold``. Must be at least
        ``n_splits_cv`` because one group fold is consumed per yielded fold.
    random_state : int, default 0
        Seed passed to the shuffled ``KFold``.
    """

    def __init__(self, n_splits_cv, n_splits_uu, random_state=0):
        self.n_splits_cv = n_splits_cv
        self.n_splits_uu = n_splits_uu
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds produced by :meth:`split`."""
        return self.n_splits_cv

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` integer arrays for each fold.

        Parameters
        ----------
        X : array-like
            Data to split; only passed through to the underlying splitters.
        y : ignored
            Present for signature compatibility only.
        groups : array-like
            Group label of every row (the user id in this notebook).

        Raises
        ------
        ValueError
            If ``groups`` is ``None`` or ``n_splits_uu < n_splits_cv``.
        """
        # Validate up front: otherwise running out of group folds would
        # surface as an opaque "generator raised StopIteration" RuntimeError.
        if self.n_splits_uu < self.n_splits_cv:
            raise ValueError(
                f'n_splits_uu ({self.n_splits_uu}) must be >= '
                f'n_splits_cv ({self.n_splits_cv})'
            )
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")

        splits_cv = KFold(
            n_splits=self.n_splits_cv,
            shuffle=True,
            random_state=self.random_state,
        ).split(X)
        splits_uu = GroupKFold(n_splits=self.n_splits_uu).split(X, groups=groups)

        # zip stops once the n_splits_cv KFold folds are exhausted; surplus
        # group folds are simply never used.
        for (train_index, test_index), (_, uu_index) in zip(splits_cv, splits_uu):
            yield (
                np.setdiff1d(train_index, uu_index),
                np.union1d(test_index, uu_index),
            )

In [None]:
# Hyper-parameters shared by every fold.
lgb_params = {
    'objective': 'regression',
    'verbose': -1,
    'metric': 'rmse',
    'learning_rate': 0.01,  # 0.05 -> 0.01
    'num_leaves': 100,
    'feature_fraction': 0.7,
    'seed': 127,
}

# One booster and one validation RMSE are collected per fold.
models, scores = [], []
kf = UnknownUserKFold(n_splits_cv=5, n_splits_uu=18)
for fold, (train_index, test_index) in enumerate(kf.split(x_train, groups=user_id_train)):
    cv_x_train, cv_y_train = x_train.iloc[train_index, :], y_train.iloc[train_index]
    cv_x_test, cv_y_test = x_train.iloc[test_index, :], y_train.iloc[test_index]

    fit_data = lgb.Dataset(cv_x_train, label=cv_y_train)
    holdout_data = lgb.Dataset(cv_x_test, label=cv_y_test)

    # The round budget is deliberately huge; training is ended by early
    # stopping on the validation set instead.
    model = lgb.train(
        params=dict(lgb_params),
        train_set=fit_data,
        valid_sets=[holdout_data],
        num_boost_round=100000,
        callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=True)],  # 100 -> 500
    )

    models.append(model)
    scores.append(model.best_score['valid_0']['rmse'])
    print('')

# Mean and standard deviation of the per-fold validation RMSE.
print(f'cv: {np.mean(scores):.4f} ± {np.std(scores):.4f}')

## 予測

In [None]:
from pathlib import Path

# Build the test features the same way as the training features: the user's
# indicator row followed by the anime's indicator row, ids dropped.
x_test = (
    test
    .merge(user_anime, how='left', on='user_id')
    .merge(anime_user, how='left', on='anime_id')
    .drop(columns=['user_id', 'anime_id'])
)
# Ensemble: element-wise mean of the predictions of all fold models.
y_test_pred = np.mean([model.predict(x_test) for model in models], axis=0)

sub = pd.DataFrame(data={'score': y_test_pred})
# Clamp predictions into [1, 10] (presumably the valid score range — confirm
# against the competition rules).
sub['score'] = sub['score'].clip(lower=1, upper=10)

# Create the output directory first so that a missing folder cannot make the
# write fail after the long training run.
submission_path = Path('submission/submission_reg_raw_best_lr001.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(submission_path, index=False, header=True)