# #15 atmaCup 正則化の強さを変えたときの精度 (iALS)

比較対象：

- SVD (正則化なし)
- SVD (正則化あり)  ← 本notebookはこれ
- NMF (正則化なし)
- NMF (正則化あり)
- 行列分解なし

---

## セットアップ

In [1]:
import numpy as np
import pandas as pd
import pathlib
import pickle
import lightgbm as lgb

In [None]:
# Load the raw competition tables. For the anime metadata, the literal string
# 'Unknown' is additionally treated as a missing value when parsing.
anime = pd.read_csv('data/anime.csv', na_values=['Unknown'])
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')

## 前処理

### 共通の前処理

In [None]:
# Attach the anime metadata to every rating row (left join on anime_id, so no
# rating row is dropped), then stack train and test into a single frame with a
# freshly numbered index.
train_anime = pd.merge(train, anime, on='anime_id', how='left')
test_anime = pd.merge(test, anime, on='anime_id', how='left')
traintest_anime = pd.concat((train_anime, test_anime), ignore_index=True)

### ユーザーとアニメの分散表現の獲得

ここでは、iALSのハイパーパラメーター調整の重要性を説いた以下の論文の[再現コード](https://github.com/google-research/google-research/blob/master/ials/ncf_benchmarks/ials.py)からiALSの実装を抜粋して用いる。  
[Rendle, Steffen, et al. "Revisiting the performance of ials on item recommendation benchmarks." RecSys 2022.](https://dl.acm.org/doi/10.1145/3523227.3548486)

In [None]:
# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import concurrent.futures
import numpy as np
# Seed NumPy's global RNG; IALS.__init__ draws its initial user/item embeddings
# from it via np.random.normal.
np.random.seed(0)

class IALSDataSet:
    """Interaction data for IALS training, grouped by user and by item.

    Both groupings are kept as given and are additionally partitioned into
    `num_batches` batches, which IALS.train hands to the solver.
    """

    def __init__(self, train_by_user, train_by_item, num_batches):
        """Store both groupings, their sizes and their batch partitions.

        Args:
            train_by_user: sized iterable with one entry per user.
            train_by_item: sized iterable with one entry per item.
            num_batches: number of batches each grouping is split into.
        """
        self.train_by_user, self.train_by_item = train_by_user, train_by_item
        self.num_users, self.num_items = len(train_by_user), len(train_by_item)
        self.user_batches = self._batch(train_by_user, num_batches)
        self.item_batches = self._batch(train_by_item, num_batches)

    def _batch(self, xs, num_batches):
        """Deal the entries of `xs` round-robin into `num_batches` lists."""
        buckets = [[] for _ in range(num_batches)]
        for position, entry in enumerate(xs):
            # Entry k lands in bucket k mod num_batches.
            buckets[position % num_batches].append(entry)
        return buckets

class IALS:
    """Implicit alternating-least-squares model with user and item embeddings.

    The model keeps one embedding matrix per side plus the Gramian
    (``E.T @ E``) of each, which the per-row solver consumes.
    """

    def __init__(self, num_users, num_items, embedding_dim, reg, unobserved_weight, stddev):
        """Initialise both embedding matrices from N(0, stddev) and their Gramians.

        Args:
            num_users: number of rows of the user embedding matrix.
            num_items: number of rows of the item embedding matrix.
            embedding_dim: number of columns of both matrices.
            reg: global regularisation strength.
            unobserved_weight: weight applied to the Gramian term in the solver.
            stddev: standard deviation of the random initialisation.
        """
        self.embedding_dim = embedding_dim
        self.reg = reg
        self.unobserved_weight = unobserved_weight
        # Users are drawn before items; keep this order so a fixed global seed
        # yields the same initial matrices.
        self.user_embedding = np.random.normal(0, stddev, (num_users, embedding_dim))
        self.item_embedding = np.random.normal(0, stddev, (num_items, embedding_dim))
        self._update_user_gramian()
        self._update_item_gramian()

    def _update_user_gramian(self):
        """Recompute the Gramian of the current user embeddings."""
        users = self.user_embedding
        self.user_gramian = users.T @ users

    def _update_item_gramian(self):
        """Recompute the Gramian of the current item embeddings."""
        items = self.item_embedding
        self.item_gramian = items.T @ items

    def score(self, user_history):
        """Project a history of item indices to an embedding and score all items.

        Note that the global ``self.reg`` is passed to ``project`` unchanged here.
        """
        projected = project(
            user_history,
            self.item_embedding,
            self.item_gramian,
            self.reg,
            self.unobserved_weight,
        )
        return np.dot(projected, self.item_embedding.T)

    def train(self, ds):
        """Run one sweep: re-solve all users, then all items.

        Each Gramian is refreshed right after its side is updated, so the item
        solve already sees the new user Gramian.
        """
        self._solve(ds.user_batches, is_user=True)
        self._update_user_gramian()
        self._solve(ds.item_batches, is_user=False)
        self._update_item_gramian()

    def _solve(self, batches, is_user):
        """Solve one side against the other (fixed) side and write rows in place."""
        if is_user:
            target = self.user_embedding
            fixed, fixed_gramian = self.item_embedding, self.item_gramian
        else:
            target = self.item_embedding
            fixed, fixed_gramian = self.user_embedding, self.user_gramian
        per_batch = map_parallel(
            solve, batches, fixed, fixed_gramian, self.reg, self.unobserved_weight
        )
        # `target` is the model's own array, so these row writes update the model.
        for solved in per_batch:
            for row, vector in solved.items():
                target[row, :] = vector

def map_parallel(fn, xs, *args):
    """Apply ``fn(x, *args)`` to every ``x`` in ``xs``; return results in input order.

    A single-element ``xs`` is evaluated directly in the calling process.
    Otherwise one worker process per element is started, so ``fn``, the
    elements and ``args`` must be picklable.

    Args:
        fn: callable invoked as ``fn(x, *args)``.
        xs: sized, indexable collection of work items.
        *args: extra positional arguments forwarded to every call.

    Returns:
        List with one result per element of ``xs``, in the same order.

    Raises:
        ValueError: if ``xs`` is empty (ProcessPoolExecutor rejects zero workers).
        Exception: whatever ``fn`` raised in a worker is re-raised here.
    """
    if len(xs) == 1:
        return [fn(xs[0], *args)]
    # The context manager shuts the pool down on exit. Previously a new pool
    # was created on every call and never shut down, leaking worker processes.
    with concurrent.futures.ProcessPoolExecutor(len(xs)) as executor:
        futures = [executor.submit(fn, x, *args) for x in xs]
        # result() blocks until the future completes, so no separate wait() is needed.
        return [future.result() for future in futures]

def solve(data_by_user, item_embedding, item_gramian, global_reg, unobserved_weight):
    """Compute a new embedding for every entry of one batch.

    Args:
        data_by_user: iterable of ``(row_id, history)`` pairs, where ``history``
            is the list of indices into ``item_embedding`` for that row.
        item_embedding: embedding matrix of the opposite (fixed) side.
        item_gramian: Gramian of ``item_embedding``.
        global_reg: global regularisation strength.
        unobserved_weight: weight of the Gramian term.

    Returns:
        Dict mapping each ``row_id`` to its solved embedding vector.
    """
    # The regulariser is scaled per row by
    # len(history) + unobserved_weight * (number of rows of item_embedding).
    return {
        user: project(
            items,
            item_embedding,
            item_gramian,
            global_reg * (len(items) + unobserved_weight * item_embedding.shape[0]),
            unobserved_weight,
        )
        for user, items in data_by_user
    }

def project(user_history, item_embedding, item_gramian, reg, unobserved_weight):
    """Solve the regularised linear system for one embedding vector.

    Builds ``A = sum_i e_i e_i^T + unobserved_weight * item_gramian + reg * I``
    and ``b = sum_i e_i`` over the rows ``e_i`` of ``item_embedding`` selected
    by ``user_history`` (repeated indices count repeatedly), then returns the
    solution of ``A x = b``.

    Raises:
        ValueError: if ``user_history`` is empty.
    """
    if not user_history:
        raise ValueError("empty user history in projection")

    dim = np.shape(item_embedding)[1]
    gram = np.zeros((dim, dim))
    target = np.zeros(dim)
    # Accumulate in history order; the Gramian and ridge terms are added afterwards.
    for item in user_history:
        vec = item_embedding[item]
        gram += np.outer(vec, vec)
        target += vec
    gram += unobserved_weight * item_gramian
    system = gram + reg * np.identity(dim)
    return np.linalg.solve(system, target)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm as tqdm

class UserAnimeIALSTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer mapping (user_id, anime_id) rows to iALS embeddings.

    ``fit`` trains an IALS model on the interactions in ``traintest_anime`` and
    stores one embedding table per side; ``transform`` looks up and concatenates
    the user and item embeddings for each input row.
    """

    def __init__(self, traintest_anime, embedding_dim, reg, unobserved_weight, stddev, num_epochs, num_batches):
        """Store the hyper-parameters; no work is done until ``fit``.

        Args:
            traintest_anime: DataFrame with ``user_id`` and ``anime_id`` columns
                holding the interactions the embeddings are learned from.
            embedding_dim: embedding size passed to IALS.
            reg: regularisation strength passed to IALS.
            unobserved_weight: unobserved weight passed to IALS.
            stddev: initialisation standard deviation passed to IALS.
            num_epochs: number of ``IALS.train`` sweeps.
            num_batches: number of batches passed to IALSDataSet.
        """
        self.traintest_anime = traintest_anime
        self.embedding_dim = embedding_dim
        self.reg = reg
        self.unobserved_weight = unobserved_weight
        self.stddev = stddev
        self.num_epochs = num_epochs
        self.num_batches = num_batches

    @staticmethod
    def _group_as_pairs(encoded, key, value):
        """Return ``[(key_code, [value_code, ...]), ...]`` for each distinct ``key``."""
        grouped = encoded.groupby(key)[value].apply(list)
        return list(zip(grouped.index, grouped))

    def fit(self, X, y=None):
        """Train iALS on ``self.traintest_anime`` and store the embedding tables.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility but are
        not read; the interactions always come from ``self.traintest_anime``.

        Returns:
            self
        """
        interactions = self.traintest_anime
        le_user = LabelEncoder().fit(interactions['user_id'])
        le_item = LabelEncoder().fit(interactions['anime_id'])

        # Encode the ids once and reuse the result for both groupings
        # (previously the whole frame was label-encoded twice).
        encoded = pd.DataFrame({
            'user_id': le_user.transform(interactions['user_id']),
            'anime_id': le_item.transform(interactions['anime_id']),
        })
        train_by_user = self._group_as_pairs(encoded, 'user_id', 'anime_id')
        train_by_item = self._group_as_pairs(encoded, 'anime_id', 'user_id')

        train_ds = IALSDataSet(train_by_user, train_by_item, self.num_batches)
        self.ials = IALS(
            num_users=train_ds.num_users,
            num_items=train_ds.num_items,
            embedding_dim=self.embedding_dim,
            reg=self.reg,
            unobserved_weight=self.unobserved_weight,
            stddev=self.stddev
        )

        for _ in tqdm(range(self.num_epochs)):
            self.ials.train(train_ds)

        # Row k of each embedding matrix belongs to the k-th encoder class, so
        # the original ids can be used directly as the table index.
        self.user_embedding = pd.DataFrame(
            data=self.ials.user_embedding,
            index=pd.Index(le_user.classes_, name='user_id'),
            columns=[f'user_svd_{i:04d}' for i in range(self.embedding_dim)]
        )
        self.item_embedding = pd.DataFrame(
            data=self.ials.item_embedding,
            index=pd.Index(le_item.classes_, name='anime_id'),
            columns=[f'item_svd_{i:04d}' for i in range(self.embedding_dim)]
        )

        return self

    def transform(self, X):
        """Return the user and item embeddings for each row of ``X`` as an array.

        Only the ``user_id`` and ``anime_id`` columns of ``X`` are read. The
        lookups are left joins, so ids without a learned embedding yield NaN.
        """
        target_columns = ['user_id', 'anime_id']
        X_new = (
            X[target_columns]
            .merge(self.user_embedding, how='left', on='user_id')
            .merge(self.item_embedding, how='left', on='anime_id')
            .drop(columns=target_columns)
            .to_numpy()
        )

        return X_new

    def get_feature_names_out(self, input_features=None):
        """Return the output column names: user columns followed by item columns."""
        names = np.concatenate([
            self.user_embedding.columns.to_numpy(),
            self.item_embedding.columns.to_numpy()
        ])

        return names

## 交差検証

In [None]:
from sklearn.model_selection import KFold, GroupKFold

class UnknownUserKFold:
    """K-fold splitter whose validation folds also contain held-out users.

    Each fold combines a shuffled KFold split with one GroupKFold split over
    ``groups``: the rows of the GroupKFold test part are removed from the
    training indices and added to the validation indices. The original
    author's note states the goal: as in the test data, ratings from users
    that do not appear in the training data should make up about 23% of the
    evaluation data.
    """

    def __init__(self, n_splits_cv, n_splits_uu):
        """
        Args:
            n_splits_cv: number of folds yielded (KFold ``n_splits``).
            n_splits_uu: GroupKFold ``n_splits`` used to pick the held-out
                groups; must be at least ``n_splits_cv``.
        """
        self.n_splits_cv = n_splits_cv
        self.n_splits_uu = n_splits_uu

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` arrays, ``n_splits_cv`` times.

        Raises:
            ValueError: if ``n_splits_uu < n_splits_cv``; there would be fewer
                group splits than folds. (Raised on first iteration, since
                this is a generator.)
        """
        if self.n_splits_uu < self.n_splits_cv:
            # Previously this surfaced mid-iteration as an opaque RuntimeError
            # caused by StopIteration escaping from next() inside the generator.
            raise ValueError(
                f"n_splits_uu ({self.n_splits_uu}) must be >= "
                f"n_splits_cv ({self.n_splits_cv})"
            )
        splits_cv = KFold(n_splits=self.n_splits_cv, shuffle=True, random_state=0).split(X)
        splits_uu = GroupKFold(n_splits=self.n_splits_uu).split(X, groups=groups)
        # zip stops once the n_splits_cv KFold splits are exhausted.
        for (train_index, test_index), (_, uu_index) in zip(splits_cv, splits_uu):
            # Held-out groups leave the training set and join the validation set.
            yield np.setdiff1d(train_index, uu_index), np.union1d(test_index, uu_index)

In [None]:
# Grid search over the iALS hyper-parameters. For every combination the fitted
# transformer is cached under temp/, a LightGBM regressor is trained on the
# resulting embeddings for each cross-validation fold, and the per-fold RMSE is
# appended to a results table that is re-written to CSV after every fold.
grid_search_results = []

# Create the cache directory up front: otherwise a long transformer fit is lost
# when the pickle (or the results CSV) cannot be written.
pathlib.Path('temp').mkdir(parents=True, exist_ok=True)

# These do not depend on the hyper-parameters, so build them once.
y_train = train['score']
user_id_train = train['user_id']
kf = UnknownUserKFold(n_splits_cv=5, n_splits_uu=18)

for embedding_dim in [25, 50, 100, 200]:
    for reg in [0.1, 0.03, 0.01, 0.003, 0.001, 0.0003]:
        for unobserved_weight in [1, 0.3, 0.1, 0.03, 0.01, 0.003]:
            transformer_path = pathlib.Path(f'temp/transformer_emb{embedding_dim:03d}_reg{reg:.4f}_uw{unobserved_weight:.3f}_ep08.pkl')
            if transformer_path.exists():
                # NOTE: pickle.load can execute arbitrary code. This is only
                # acceptable because the cache files are written by this notebook;
                # never point it at files from an untrusted source.
                with open(transformer_path, 'rb') as f:
                    transformer = pickle.load(f)
            else:
                transformer = UserAnimeIALSTransformer(
                    traintest_anime,
                    embedding_dim=embedding_dim,
                    reg=reg,
                    unobserved_weight=unobserved_weight,
                    stddev = 0.1 / np.sqrt(embedding_dim),
                    num_epochs=8,
                    num_batches=1
                )
                # DataFrame output is required because the folds below index with .iloc.
                transformer = transformer.set_output(transform='pandas')
                transformer = transformer.fit(train_anime)
                with open(transformer_path, 'wb') as f:
                    pickle.dump(transformer, f)

            x_train = transformer.transform(train_anime)

            for fold, (train_index, test_index) in enumerate(kf.split(x_train, groups=user_id_train)):
                cv_x_train = x_train.iloc[train_index, :]
                cv_y_train = y_train.iloc[train_index]
                cv_x_test = x_train.iloc[test_index, :]
                cv_y_test = y_train.iloc[test_index]

                # The validation fold drives early stopping and is also the
                # source of the reported RMSE.
                model = lgb.train(
                    params={
                        'objective': 'regression',
                        'verbose': -1,
                        'metric': 'rmse',
                        'learning_rate': 0.05,
                        'num_leaves': 100,
                        'feature_fraction': 0.7,
                        'seed': 127
                    },
                    train_set=lgb.Dataset(cv_x_train, label=cv_y_train),
                    valid_sets=[lgb.Dataset(cv_x_test, label=cv_y_test)],
                    num_boost_round=20000,
                    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
                )

                grid_search_result = {
                    'embedding_dim': embedding_dim,
                    'reg': reg,
                    'unobserved_weight': unobserved_weight,
                    'fold': fold,
                    'rmse': model.best_score['valid_0']['rmse'],
                }
                grid_search_results.append(grid_search_result)
                print(f"embedding_dim: {embedding_dim}, reg: {reg}, unobserved_weight: {unobserved_weight}, fold: {fold}, RMSE: {grid_search_result['rmse']}")

                # Checkpoint after every fold so partial results survive an interruption.
                pd.DataFrame(grid_search_results).to_csv('temp/grid_search_results_ials.csv', index=False, header=True)

grid_search_results = pd.DataFrame(grid_search_results)

In [2]:
# Reload the per-fold results written by the grid search.
grid_search_results = pd.read_csv('temp/grid_search_results_ials.csv')

# Mean RMSE over folds: rows are (embedding_dim, reg), columns are unobserved_weight.
mean_rmse_by_setting = (
    grid_search_results
    .groupby(['embedding_dim', 'reg', 'unobserved_weight'])['rmse']
    .mean()
    .unstack()
)

# Last expression of the cell, so the notebook renders the styled table.
mean_rmse_by_setting.style.background_gradient(vmin=1.18, vmax=1.22)

Unnamed: 0_level_0,unobserved_weight,0.003000,0.010000,0.030000,0.100000,0.300000,1.000000
embedding_dim,reg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25,0.0003,1.22564,1.215758,1.204021,1.200756,1.19764,1.204123
25,0.001,1.225073,1.21258,1.204235,1.200167,1.196042,1.201374
25,0.003,1.218592,1.214389,1.207886,1.201907,1.198833,1.201022
25,0.01,1.210282,1.208359,1.201208,1.198672,1.196886,1.202452
25,0.03,1.216207,1.204781,1.199634,1.202038,1.210001,1.253001
25,0.1,1.263551,1.244577,1.231482,1.244686,1.27704,1.413997
50,0.0003,1.219732,1.210798,1.205805,1.199107,1.192917,1.193108
50,0.001,1.217781,1.211925,1.20107,1.195596,1.196014,1.193147
50,0.003,1.208363,1.207722,1.196979,1.1964,1.191708,1.193002
50,0.01,1.204562,1.200411,1.195655,1.191127,1.189812,1.198186
