# #15 atmaCup 正則化の強さを変えたときの精度 (iALS) 提出用

比較対象：

- SVD (正則化なし)
- SVD (正則化あり)  ← 本notebookはこれを、最も良かったハイパーパラメーターを使って学習率0.01で学習させなおしたもの
- NMF (正則化なし)
- NMF (正則化あり)
- 行列分解なし

---

## セットアップ

In [None]:
import numpy as np
import pandas as pd
import pathlib
import pickle
import lightgbm as lgb

In [None]:
# Read the competition tables. In the anime table the literal string
# 'Unknown' is parsed as a missing value.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))
anime = pd.read_csv('data/anime.csv', na_values=['Unknown'])

## 前処理

### 共通の前処理

In [None]:
# Attach the anime table's columns to every train/test row (a left join keeps
# every row of the left table), then stack both results into one frame with a
# fresh 0..n-1 index.
train_anime, test_anime = (
    ratings.merge(anime, how='left', on='anime_id') for ratings in (train, test)
)
traintest_anime = pd.concat([train_anime, test_anime], ignore_index=True)

### ユーザーとアニメの分散表現の獲得

ここでは、iALSのハイパーパラメーター調整の重要性を説いた以下の論文の[再現コード](https://github.com/google-research/google-research/blob/master/ials/ncf_benchmarks/ials.py)からiALSの実装を抜粋して用いる。  
[Rendle, Steffen, et al. "Revisiting the performance of ials on item recommendation benchmarks." RecSys 2022.](https://dl.acm.org/doi/10.1145/3523227.3548486)

In [None]:
# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import concurrent.futures
import numpy as np
# Seed NumPy's global RNG. IALS.__init__ draws its initial embeddings with
# np.random.normal, so seeding here makes that initialisation repeatable.
np.random.seed(0)

class IALSDataSet():
    """Interaction data for iALS, grouped per user and per item and pre-batched.

    Attributes:
        train_by_user: The per-user sequence passed in by the caller. Elsewhere
            in this file its entries are ``(user_index, [item_index, ...])``.
        train_by_item: The per-item counterpart of ``train_by_user``.
        num_users: ``len(train_by_user)``.
        num_items: ``len(train_by_item)``.
        user_batches: ``train_by_user`` dealt round-robin into ``num_batches``
            lists.
        item_batches: ``train_by_item`` dealt round-robin into ``num_batches``
            lists.
    """

    def __init__(self, train_by_user, train_by_item, num_batches):
        """Store both groupings and split each into ``num_batches`` batches.

        Raises:
            ValueError: If ``num_batches`` is smaller than 1.
        """
        self.train_by_user = train_by_user
        self.train_by_item = train_by_item
        self.num_users = len(train_by_user)
        self.num_items = len(train_by_item)
        self.user_batches = self._batch(train_by_user, num_batches)
        self.item_batches = self._batch(train_by_item, num_batches)

    def _batch(self, xs, num_batches):
        """Deal the entries of ``xs`` round-robin into ``num_batches`` lists.

        Entry ``i`` lands in batch ``i % num_batches``; a batch stays empty when
        there are fewer entries than batches.

        Raises:
            ValueError: If ``num_batches`` is smaller than 1. Without this
                check a zero or negative count would only surface as a
                ZeroDivisionError/IndexError, or as an empty batch list.
        """
        if num_batches < 1:
            raise ValueError(f"num_batches must be >= 1, got {num_batches}")
        entries = list(xs)
        # Stepping by num_batches from each offset picks out i % num_batches == offset.
        return [entries[offset::num_batches] for offset in range(num_batches)]

class IALS():
    """iALS model: user and item embedding matrices plus their Gramians.

    Attributes:
        embedding_dim: Number of columns of both embedding matrices.
        reg: Global regularisation strength handed to the solver.
        unobserved_weight: Weight applied to the Gramian term in the solver.
        user_embedding: Array of shape ``(num_users, embedding_dim)``.
        item_embedding: Array of shape ``(num_items, embedding_dim)``.
        user_gramian: ``user_embedding.T @ user_embedding``.
        item_gramian: ``item_embedding.T @ item_embedding``.
    """

    def __init__(self, num_users, num_items, embedding_dim, reg, unobserved_weight, stddev):
        """Initialise both embeddings from N(0, stddev) and compute the Gramians."""
        self.embedding_dim = embedding_dim
        self.reg = reg
        self.unobserved_weight = unobserved_weight
        # Users are drawn before items from NumPy's global RNG; keeping this
        # order keeps results repeatable under a fixed seed.
        user_shape = (num_users, embedding_dim)
        item_shape = (num_items, embedding_dim)
        self.user_embedding = np.random.normal(0, stddev, user_shape)
        self.item_embedding = np.random.normal(0, stddev, item_shape)
        self._update_user_gramian()
        self._update_item_gramian()

    def _update_user_gramian(self):
        """Recompute ``user_gramian`` from the current user embeddings."""
        users = self.user_embedding
        self.user_gramian = users.T @ users

    def _update_item_gramian(self):
        """Recompute ``item_gramian`` from the current item embeddings."""
        items = self.item_embedding
        self.item_gramian = items.T @ items

    def score(self, user_history):
        """Return one score per item for a user described by ``user_history``.

        The history is projected into the embedding space with ``project`` and
        the resulting vector is multiplied with every item embedding.
        NOTE(review): ``self.reg`` is passed to ``project`` unscaled here, while
        ``solve`` scales it by the history length - confirm this is intended.
        """
        projected = project(
            user_history,
            self.item_embedding,
            self.item_gramian,
            self.reg,
            self.unobserved_weight,
        )
        return np.dot(projected, self.item_embedding.T)

    def train(self, ds):
        """Run one alternating pass: solve users, then items, refreshing Gramians."""
        self._solve(ds.user_batches, is_user=True)
        self._update_user_gramian()
        self._solve(ds.item_batches, is_user=False)
        self._update_item_gramian()

    def _solve(self, batches, is_user):
        """Re-solve one side's embeddings in place while the other side is fixed.

        Args:
            batches: Batches of ``(row_index, history)`` entries for the side
                being updated.
            is_user: True to update user embeddings against the items, False
                to update item embeddings against the users.
        """
        if is_user:
            target = self.user_embedding
            fixed, fixed_gramian = self.item_embedding, self.item_gramian
        else:
            target = self.item_embedding
            fixed, fixed_gramian = self.user_embedding, self.user_gramian
        solved_batches = map_parallel(
            solve, batches, fixed, fixed_gramian, self.reg, self.unobserved_weight
        )
        # Every batch result maps a row index to its freshly solved vector.
        for solved in solved_batches:
            for row, vector in solved.items():
                target[row, :] = vector

def map_parallel(fn, xs, *args):
    """Apply ``fn(x, *args)`` to every ``x`` in ``xs`` and return the results in order.

    A single input is processed inline in the current process. Several inputs
    are fanned out to a process pool with one worker per input.

    Args:
        fn: Callable invoked as ``fn(x, *args)``. For the multi-process path it
            and its arguments must be picklable.
        xs: Sized, indexable collection of inputs (one task per element).
        *args: Extra positional arguments forwarded to every call.

    Returns:
        A list with one result per element of ``xs``, in input order. Empty
        when ``xs`` is empty.

    Raises:
        Whatever ``fn`` raises; worker exceptions are re-raised by
        ``future.result()``.
    """
    num_workers = len(xs)
    if num_workers == 0:
        # A pool with zero workers is rejected by ProcessPoolExecutor.
        return []
    if num_workers == 1:
        return [fn(xs[0], *args)]
    # The context manager shuts the pool down on exit, so worker processes are
    # not left behind after each call.
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(fn, x, *args) for x in xs]
        # result() blocks until the task is done, so no separate wait() is needed.
        return [future.result() for future in futures]

def solve(data_by_user, item_embedding, item_gramian, global_reg, unobserved_weight):
    """Solve the embedding of every entry in one batch.

    Args:
        data_by_user: Iterable of ``(index, history)`` pairs; ``history`` is the
            collection of row indices into ``item_embedding`` observed for
            that index.
        item_embedding: Embedding matrix of the side that is held fixed.
        item_gramian: Gramian of ``item_embedding``.
        global_reg: Base regularisation strength.
        unobserved_weight: Weight of the Gramian term.

    Returns:
        Dict mapping each index to the vector returned by ``project``.
    """
    # The effective regulariser grows with the history length plus
    # unobserved_weight times the number of rows in item_embedding.
    return {
        user: project(
            items,
            item_embedding,
            item_gramian,
            global_reg * (len(items) + unobserved_weight * item_embedding.shape[0]),
            unobserved_weight,
        )
        for user, items in data_by_user
    }

def project(user_history, item_embedding, item_gramian, reg, unobserved_weight):
    """Solve the regularised least-squares embedding for one interaction history.

    With ``E`` the rows of ``item_embedding`` selected by ``user_history``
    (repeated indices count repeatedly), this solves::

        (E.T @ E + unobserved_weight * item_gramian + reg * I) x = E.sum(axis=0)

    Args:
        user_history: Iterable of integer row indices into ``item_embedding``
            (list, tuple, NumPy array, ...).
        item_embedding: 2-D array-like with one embedding per row.
        item_gramian: Square matrix added to the left-hand side, scaled by
            ``unobserved_weight``.
        reg: Coefficient of the identity (ridge) term.
        unobserved_weight: Scale applied to ``item_gramian``.

    Returns:
        1-D array of length ``item_embedding.shape[1]``.

    Raises:
        ValueError: If ``user_history`` is empty or None.
    """
    # Materialising the indices makes the emptiness check valid for NumPy
    # arrays too, where plain truthiness is ambiguous.
    history = np.fromiter(() if user_history is None else user_history, dtype=np.intp)
    if history.size == 0:
        raise ValueError("empty user history in projection")
    item_embedding = np.asarray(item_embedding, dtype=float)
    emb_dim = np.shape(item_embedding)[1]
    observed = item_embedding[history]
    # One matrix product replaces the per-item accumulation of outer products.
    lhs = observed.T @ observed
    lhs = lhs + unobserved_weight * item_gramian + np.identity(emb_dim) * reg
    rhs = observed.sum(axis=0)
    return np.linalg.solve(lhs, rhs)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm as tqdm

class UserAnimeIALSTransformer(BaseEstimator, TransformerMixin):
    """Transformer that maps (user_id, anime_id) rows to their iALS embeddings.

    The model is trained on the interaction table given to the constructor,
    not on the ``X`` passed to ``fit``. ``transform`` then looks up the learnt
    user and item vectors for each input row.
    """

    def __init__(self, traintest_anime, embedding_dim, reg, unobserved_weight, stddev, num_epochs, num_batches):
        """Store the interaction table and the iALS hyperparameters unchanged."""
        self.traintest_anime = traintest_anime
        self.embedding_dim = embedding_dim
        self.reg = reg
        self.unobserved_weight = unobserved_weight
        self.stddev = stddev
        self.num_epochs = num_epochs
        self.num_batches = num_batches

    def fit(self, X, y=None):
        """Train iALS on ``self.traintest_anime`` and keep the embeddings.

        Args:
            X: Unused; present for scikit-learn API compatibility.
            y: Unused.

        Returns:
            self, with ``ials``, ``user_embedding`` and ``item_embedding`` set.
        """
        interactions = self.traintest_anime
        le_user = LabelEncoder().fit(interactions['user_id'])
        le_item = LabelEncoder().fit(interactions['anime_id'])

        # Encode both id columns once to contiguous integer indices; both
        # groupings below are derived from the same encoded pairs.
        encoded = pd.DataFrame({
            'user_id': le_user.transform(interactions['user_id']),
            'anime_id': le_item.transform(interactions['anime_id']),
        })
        items_per_user = encoded.groupby('user_id')['anime_id'].apply(list)
        users_per_item = encoded.groupby('anime_id')['user_id'].apply(list)
        train_by_user = list(items_per_user.items())
        train_by_item = list(users_per_item.items())

        train_ds = IALSDataSet(train_by_user, train_by_item, self.num_batches)
        self.ials = IALS(
            num_users=train_ds.num_users,
            num_items=train_ds.num_items,
            embedding_dim=self.embedding_dim,
            reg=self.reg,
            unobserved_weight=self.unobserved_weight,
            stddev=self.stddev
        )
        for _ in tqdm(range(self.num_epochs)):
            self.ials.train(train_ds)

        # Row i of each embedding matrix belongs to the i-th encoder class, so
        # the original ids become the index of the lookup tables.
        dims = range(self.embedding_dim)
        self.user_embedding = pd.DataFrame(
            data=self.ials.user_embedding,
            index=pd.Index(le_user.classes_, name='user_id'),
            columns=[f'user_svd_{i:04d}' for i in dims]
        )
        self.item_embedding = pd.DataFrame(
            data=self.ials.item_embedding,
            index=pd.Index(le_item.classes_, name='anime_id'),
            columns=[f'item_svd_{i:04d}' for i in dims]
        )
        return self

    def transform(self, X):
        """Return the user and item embedding columns for every row of ``X``.

        Args:
            X: DataFrame with ``user_id`` and ``anime_id`` columns.

        Returns:
            2-D NumPy array: user embedding columns followed by item embedding
            columns. Ids missing from the lookup tables yield NaN (left join).
        """
        key_columns = ['user_id', 'anime_id']
        with_user = X[key_columns].merge(self.user_embedding, how='left', on='user_id')
        with_both = with_user.merge(self.item_embedding, how='left', on='anime_id')
        return with_both.drop(columns=key_columns).to_numpy()

    def get_feature_names_out(self, input_features=None):
        """Return the output column names: user columns, then item columns."""
        user_names = self.user_embedding.columns.to_numpy()
        item_names = self.item_embedding.columns.to_numpy()
        return np.concatenate([user_names, item_names])

## 学習

In [None]:
from sklearn.model_selection import KFold, GroupKFold

# Split the data so that, as in the test data, ratings from users absent from
# the training part make up about 23% of each validation part.
class UnknownUserKFold:
    """K-fold splitter whose validation folds also contain fully held-out groups.

    Each fold combines a shuffled ``KFold`` split with one ``GroupKFold`` split:
    the rows of the held-out groups are removed from the training indices and
    added to the validation indices.
    """

    def __init__(self, n_splits_cv, n_splits_uu):
        """Store the fold counts.

        Args:
            n_splits_cv: Number of folds to yield (``KFold`` split count).
            n_splits_uu: ``GroupKFold`` split count; each fold holds out one
                of these group partitions.
        """
        self.n_splits_cv = n_splits_cv
        self.n_splits_uu = n_splits_uu

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` arrays for ``n_splits_cv`` folds.

        Args:
            X: Data to split; only its length/indexability is used.
            y: Unused.
            groups: Group label per row (passed to ``GroupKFold``).

        Raises:
            ValueError: If ``n_splits_uu < n_splits_cv``. Every fold consumes
                one group split, so fewer group splits than folds would
                exhaust the group iterator mid-way and surface as an opaque
                RuntimeError from inside this generator.
        """
        if self.n_splits_uu < self.n_splits_cv:
            raise ValueError(
                f"n_splits_uu ({self.n_splits_uu}) must be >= "
                f"n_splits_cv ({self.n_splits_cv})"
            )
        splits_cv = KFold(n_splits=self.n_splits_cv, shuffle=True, random_state=0).split(X)
        splits_uu = GroupKFold(n_splits=self.n_splits_uu).split(X, groups=groups)
        # zip stops after the n_splits_cv KFold folds; surplus group splits are unused.
        for (train_index, test_index), (_, uu_index) in zip(splits_cv, splits_uu):
            train_index = np.setdiff1d(train_index, uu_index)
            test_index = np.union1d(test_index, uu_index)

            yield train_index, test_index

In [None]:
embedding_dim = 50
reg = 0.01
unobserved_weight = 0.3
num_epochs = 16  # 8 -> 16

# The fitted transformer is cached on disk; the file name encodes the
# hyperparameters above so a different setting gets its own cache file.
transformer_path = pathlib.Path(
    f'temp/transformer_emb{embedding_dim:03d}_reg{reg:.4f}_uw{unobserved_weight:.3f}_ep{num_epochs}.pkl'
)
if transformer_path.exists():
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(transformer_path, 'rb') as f:
        transformer = pickle.load(f)
else:
    # Create the cache directory before the long fit, so a missing directory
    # cannot make pickle.dump fail after training has finished.
    transformer_path.parent.mkdir(parents=True, exist_ok=True)
    transformer = UserAnimeIALSTransformer(
        traintest_anime,
        embedding_dim=embedding_dim,
        reg=reg,
        unobserved_weight=unobserved_weight,
        stddev=0.1 / np.sqrt(embedding_dim),
        num_epochs=num_epochs,
        num_batches=1
    )
    transformer = transformer.set_output(transform='pandas')
    transformer = transformer.fit(train_anime)
    with open(transformer_path, 'wb') as f:
        pickle.dump(transformer, f)

x_train = transformer.transform(train_anime)
y_train = train['score']
user_id_train = train['user_id']

# Train one LightGBM model per fold and record its validation RMSE.
models = []
scores = []
kf = UnknownUserKFold(n_splits_cv=5, n_splits_uu=18)
for fold, (train_index, test_index) in enumerate(kf.split(x_train, groups=user_id_train)):
    cv_x_train = x_train.iloc[train_index, :]
    cv_y_train = y_train.iloc[train_index]
    cv_x_test = x_train.iloc[test_index, :]
    cv_y_test = y_train.iloc[test_index]

    model = lgb.train(
        params={
            'objective': 'regression',
            'verbose': -1,
            'metric': 'rmse',
            'learning_rate': 0.01,  # 0.05 -> 0.01
            'num_leaves': 100,
            'feature_fraction': 0.7,
            'seed': 127
        },
        train_set=lgb.Dataset(cv_x_train, label=cv_y_train),
        valid_sets=[lgb.Dataset(cv_x_test, label=cv_y_test)],
        # Upper bound only; early stopping on the validation fold ends training.
        num_boost_round=100000,
        callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=True)]  # 100 -> 500
    )
    models.append(model)
    scores.append(model.best_score['valid_0']['rmse'])
    print('')

print(f'cv: {np.mean(scores):.4f} ± {np.std(scores):.4f}')

## 予測

In [None]:
x_test = transformer.transform(test_anime)
# Average the predictions of the per-fold models.
y_test_pred = np.mean([model.predict(x_test) for model in models], axis=0)

# Clamp the predictions into [1, 10] before writing them out.
sub = pd.DataFrame(data={'score': np.clip(y_test_pred, 1, 10)})

# Make sure the output directory exists so the write cannot fail on a fresh checkout.
submission_path = pathlib.Path('submission/submission_reg_ials_best_lr001.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(submission_path, index=False, header=True)