# #15 atmaCup 正則化の強さを変えたときの精度 (NMF)

比較対象：

- SVD (正則化なし)
- SVD (正則化あり)
- NMF (正則化なし)  ← 本notebookはこれ
- NMF (正則化あり)  ← 本notebookはこれ
- 行列分解なし

---

## セットアップ

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

In [None]:
# Load the rating tables and the anime metadata.
# In the anime metadata the literal string 'Unknown' is parsed as a missing value.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))
anime = pd.read_csv('data/anime.csv', na_values=['Unknown'])

## 前処理

### 共通の前処理

In [None]:
# Attach the anime metadata to every rating row (left join keeps all ratings),
# then stack train and test so later steps can see every (user, anime) pair.
train_anime = pd.merge(train, anime, on='anime_id', how='left')
test_anime = pd.merge(test, anime, on='anime_id', how='left')
traintest_anime = pd.concat((train_anime, test_anime), ignore_index=True)

### ユーザーとアニメの分散表現の獲得

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

class UserAnimeNMFTransformer(BaseEstimator, TransformerMixin):
    """Turn (user_id, anime_id) rows into NMF user and anime embeddings.

    A sparse user x anime matrix is built from ``traintest_anime`` (every row
    contributes a count of 1) and factorised with NMF. The user factor and the
    transposed component matrix are stored as lookup tables that ``transform``
    joins onto the input rows.

    Parameters
    ----------
    traintest_anime : pandas.DataFrame
        Interaction table with ``user_id`` and ``anime_id`` columns.
    n_components : int
        Number of NMF components, i.e. the width of each embedding.
    alpha_W : float
        Forwarded to ``NMF(alpha_W=...)``.
    """

    def __init__(self, traintest_anime, n_components, alpha_W):
        self.traintest_anime = traintest_anime
        self.n_components = n_components
        self.alpha_W = alpha_W

    def fit(self, X, y=None):
        """Factorise the interaction matrix and store both embedding tables.

        ``X`` and ``y`` are not used; the factorisation is computed from
        ``self.traintest_anime`` only. Returns ``self``.
        """
        interactions = self.traintest_anime

        # Map raw ids to contiguous row / column positions of the matrix.
        user_encoder = LabelEncoder()
        item_encoder = LabelEncoder()
        user_codes = user_encoder.fit_transform(interactions['user_id'])
        item_codes = item_encoder.fit_transform(interactions['anime_id'])

        # One unit of weight per interaction row.
        ones = np.ones(len(interactions), dtype=np.int64)
        user_item_matrix = csr_matrix((ones, (user_codes, item_codes)))

        factoriser = NMF(n_components=self.n_components, alpha_W=self.alpha_W, max_iter=1000, random_state=0)
        user_factors = factoriser.fit_transform(user_item_matrix)
        # components_ is (n_components, n_items); transpose to one row per anime.
        item_factors = factoriser.components_.T

        component_ids = range(self.n_components)
        self.user_embedding = pd.DataFrame(
            user_factors,
            index=pd.Index(user_encoder.classes_, name='user_id'),
            columns=[f'user_nmf_{i:02d}' for i in component_ids],
        )
        self.item_embedding = pd.DataFrame(
            item_factors,
            index=pd.Index(item_encoder.classes_, name='anime_id'),
            columns=[f'item_nmf_{i:02d}' for i in component_ids],
        )

        return self

    def transform(self, X):
        """Return the embeddings for each row of ``X`` as a NumPy array.

        Columns are the user embedding followed by the anime embedding. Both
        lookups are left joins, so ids without a stored embedding yield NaN.
        """
        key_columns = ['user_id', 'anime_id']
        keys = X[key_columns]
        with_user = keys.merge(self.user_embedding, how='left', on='user_id')
        with_both = with_user.merge(self.item_embedding, how='left', on='anime_id')
        features = with_both.drop(columns=key_columns)

        return features.to_numpy()

    def get_feature_names_out(self, input_features=None):
        """Return output column names: user embedding names, then anime ones."""
        user_names = self.user_embedding.columns.to_numpy()
        item_names = self.item_embedding.columns.to_numpy()

        return np.concatenate([user_names, item_names])

## 交差検証

In [None]:
from sklearn.model_selection import KFold, GroupKFold

class UnknownUserKFold:
    """K-fold splitter whose validation folds also contain unseen users.

    The splits are built so that, as in the test data, ratings from users
    absent from the training part make up roughly 23% of each validation fold
    (the original author's figure; it depends on the chosen split counts).

    Each fold combines one shuffled ``KFold`` split with one ``GroupKFold``
    split over ``groups``: the rows of the group-held-out users are removed
    from the training indices and added to the validation indices.

    Parameters
    ----------
    n_splits_cv : int
        Number of folds produced (``KFold`` splits).
    n_splits_uu : int
        Number of ``GroupKFold`` splits used to pick the held-out users.
        Must be at least ``n_splits_cv`` because one is consumed per fold.
    """

    def __init__(self, n_splits_cv, n_splits_uu):
        self.n_splits_cv = n_splits_cv
        self.n_splits_uu = n_splits_uu

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds that ``split`` yields."""
        return self.n_splits_cv

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` arrays for each fold.

        ``groups`` identifies the user of each row and is forwarded to
        ``GroupKFold``.

        Raises
        ------
        ValueError
            If ``n_splits_uu`` is smaller than ``n_splits_cv``.
        """
        # Without this check the group iterator would run dry mid-way and the
        # generator would fail with an opaque RuntimeError (PEP 479).
        if self.n_splits_uu < self.n_splits_cv:
            raise ValueError(
                f'n_splits_uu ({self.n_splits_uu}) must be >= '
                f'n_splits_cv ({self.n_splits_cv})'
            )

        splits_cv = KFold(n_splits=self.n_splits_cv, shuffle=True, random_state=0).split(X)
        splits_uu = GroupKFold(n_splits=self.n_splits_uu).split(X, groups=groups)
        # zip stops after the n_splits_cv KFold splits; surplus group splits are unused.
        for (train_index, test_index), (_, uu_index) in zip(splits_cv, splits_uu):
            # Move every row of the held-out users from training to validation.
            yield np.setdiff1d(train_index, uu_index), np.union1d(test_index, uu_index)

In [None]:
import os

# Grid search over the NMF embedding size and regularisation strength.
# For every (n_components, alpha_W) pair the embeddings are recomputed and a
# LightGBM regressor is evaluated on each UnknownUserKFold fold.

# The checkpoint and the final CSV both live in temp/; create it up front so
# the first write cannot fail after an expensive fit.
os.makedirs('temp', exist_ok=True)

# Targets, CV groups, the splitter and the model parameters do not depend on
# the hyper-parameters being searched, so build them once.
y_train = train['score']
user_id_train = train['user_id']
kf = UnknownUserKFold(n_splits_cv=5, n_splits_uu=18)
lgb_params = {
    'objective': 'regression',
    'verbose': -1,
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 100,
    'feature_fraction': 0.7,
    'seed': 127
}

grid_search_results = []
for n_components in [25, 50, 100, 200]:
    for alpha_W in [0, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003]:
        transformer = UserAnimeNMFTransformer(
            traintest_anime=traintest_anime,
            n_components=n_components,
            alpha_W=alpha_W
        )
        # Return a DataFrame so the folds can be selected with .iloc below.
        transformer = transformer.set_output(transform='pandas')

        x_train = transformer.fit_transform(train_anime)

        for fold, (train_index, test_index) in enumerate(kf.split(x_train, groups=user_id_train)):
            cv_x_train = x_train.iloc[train_index, :]
            cv_y_train = y_train.iloc[train_index]
            cv_x_test = x_train.iloc[test_index, :]
            cv_y_test = y_train.iloc[test_index]

            model = lgb.train(
                # Pass a copy so every run starts from the same parameter dict.
                params=dict(lgb_params),
                train_set=lgb.Dataset(cv_x_train, label=cv_y_train),
                valid_sets=[lgb.Dataset(cv_x_test, label=cv_y_test)],
                num_boost_round=20000,
                callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
            )

            grid_search_result = {
                'n_components': n_components,
                'alpha_W': alpha_W,
                'fold': fold,
                'rmse': model.best_score['valid_0']['rmse'],
            }
            grid_search_results.append(grid_search_result)
            print(f"n_components: {n_components}, alpha_W: {alpha_W}, fold: {fold}, RMSE: {grid_search_result['rmse']}")

            # Checkpoint after every fold so an interrupted run keeps its partial results.
            pd.DataFrame(grid_search_results).to_csv('temp/grid_search_results_nmf_temp.csv', index=False, header=True)

grid_search_results = pd.DataFrame(grid_search_results)
# Persist the completed grid under the name the analysis cell reads back.
grid_search_results.to_csv('temp/grid_search_results_nmf.csv', index=False, header=True)

In [2]:
# Reload the saved grid-search results and show the mean RMSE over folds as a
# table with one row per n_components and one column per alpha_W.
grid_search_results = pd.read_csv('temp/grid_search_results_nmf.csv')

mean_rmse = grid_search_results.groupby(['n_components', 'alpha_W'])['rmse'].mean()
mean_rmse_table = mean_rmse.unstack()
# Clamp the colour scale to 1.18-1.22 for the background gradient.
mean_rmse_table.style.background_gradient(vmin=1.18, vmax=1.22)

alpha_W,0.000000,0.000300,0.001000,0.003000,0.010000,0.030000,0.100000
n_components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25,1.209895,1.209583,1.210081,1.206764,1.204748,1.232027,1.451748
50,1.202833,1.206084,1.207553,1.208667,1.198688,1.232849,1.426319
100,1.209216,1.202897,1.203727,1.199313,1.20012,1.225843,1.374419
200,1.212248,1.206905,1.202992,1.204649,1.195074,1.223843,1.357148
