user_idとanime_idのみを特徴量として使用するサンプル

In [9]:
import os

# Create the output directories used by the helpers below:
# "feature" receives the CSV dumps and "cache" the pickle files.
# exist_ok=True makes re-running this cell harmless.
os.makedirs("feature", exist_ok=True)
os.makedirs("cache", exist_ok=True)


In [10]:
# Size of the encoded user id range: StaticFeature builds its per-user table
# from range(USER_COUNT).
# NOTE(review): presumably the number of distinct user_id values in
# train+test — confirm against the dataset.
USER_COUNT = 1998
# Size of the encoded anime id range: StaticFeature builds its per-anime table
# from range(ANIME_COUNT).
# NOTE(review): presumably the number of rows in anime.csv — confirm.
ANIME_COUNT = 2000


In [11]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd


def load():
    """Read the competition CSV files and label-encode their id columns.

    The user id encoder is fitted on the user ids of train and test combined;
    the anime id encoder is fitted on the anime ids of ``anime.csv``. The id
    columns of all three frames are replaced in place by their encoded values.

    Returns:
        A tuple ``(train_df, test_df, anime_df, user_id_encoder,
        anime_id_encoder)``.
    """
    train_df = pd.read_csv("atmaCup15_dataset/train.csv")
    test_df = pd.read_csv("atmaCup15_dataset/test.csv")
    anime_df = pd.read_csv("atmaCup15_dataset/anime.csv")

    # Fit the encoders (LabelEncoder.fit returns the fitted encoder itself).
    all_user_ids = pd.concat([train_df, test_df])["user_id"]
    user_id_encoder = LabelEncoder().fit(all_user_ids)
    anime_id_encoder = LabelEncoder().fit(anime_df["anime_id"])

    # Replace the raw ids by their encoded integer values.
    for frame in (train_df, test_df):
        frame["user_id"] = user_id_encoder.transform(frame["user_id"])
        frame["anime_id"] = anime_id_encoder.transform(frame["anime_id"])
    anime_df["anime_id"] = anime_id_encoder.transform(anime_df["anime_id"])

    return train_df, test_df, anime_df, user_id_encoder, anime_id_encoder


In [12]:
import pandas as pd
import pickle
import os


def to_csv(df: pd.DataFrame, path: str):
    """Write *df* (including its index) to *path* as UTF-8 with a BOM."""
    df.to_csv(path_or_buf=path, encoding="utf-8-sig")


def left_join(left_df: pd.DataFrame, right_df: pd.DataFrame, on: str):
    """Return *left_df* left-joined with *right_df* on the column *on*."""
    joined = pd.merge(left_df, right_df, how="left", on=on)
    return joined


def make_cache(name: str, make_func, use_cache: bool = True):
    """Return the result of *make_func*, cached as a pickle under ``cache/``.

    Args:
        name: Cache key; the pickle is stored at ``cache/{name}.pkl``.
        make_func: Zero-argument callable that builds the value.
        use_cache: When True and the pickle exists, load and return it
            instead of calling *make_func*. When False, a notice is printed,
            the value is rebuilt and the pickle is overwritten.

    Returns:
        The cached or freshly built value.
    """
    if not use_cache:
        print(f"**** not use cache {name} ****")

    path = f"cache/{name}.pkl"
    if use_cache and os.path.exists(path):
        # NOTE: pickle must only be used on files this project wrote itself.
        with open(path, "rb") as cache_file:
            return pickle.load(cache_file)

    ret = make_func()
    # Make sure the cache directory exists so the write cannot fail on a
    # fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as cache_file:
        pickle.dump(ret, cache_file)
    return ret


def make_cache_and_csv(name: str, make_func, use_cache: bool = True):
    """Build (or load from cache) a value and also dump it as a CSV file.

    The value is obtained through ``make_cache`` and then written to
    ``feature/{name}.csv`` on every call, whether or not the cache was hit.

    Returns:
        The value returned by ``make_cache``.
    """
    feature_df = make_cache(name, make_func, use_cache=use_cache)
    csv_path = f"feature/{name}.csv"
    to_csv(feature_df, csv_path)
    return feature_df


# 特徴量

In [13]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import datetime
import time


# One-hot encode, per user, the anime that user appears with
def encode_anime_id(train_test_df: pd.DataFrame):
    """Return one row per user with one count column per anime id.

    Each ``anime_id`` value is one-hot encoded per input row, the rows are
    summed per ``user_id``, and the resulting columns are named
    ``ohe_anime_id_<anime_id>``. ``user_id`` is returned as a regular column.
    """
    user_ids = train_test_df["user_id"]
    anime_dummies = pd.get_dummies(train_test_df["anime_id"])

    per_row = pd.concat([user_ids, anime_dummies], axis=1)
    per_user = per_row.groupby("user_id").sum()
    per_user = per_user.add_prefix("ohe_anime_id_")
    return per_user.reset_index()


# One-hot encode, per anime, the users that anime appears with
def encode_user_id(train_test_df: pd.DataFrame):
    """Return one row per anime with one count column per user id.

    Each ``user_id`` value is one-hot encoded per input row, the rows are
    summed per ``anime_id``, and the resulting columns are named
    ``ohe_user_id_<user_id>``. ``anime_id`` is returned as a regular column.
    """
    anime_ids = train_test_df["anime_id"]
    user_dummies = pd.get_dummies(train_test_df["user_id"])

    per_row = pd.concat([anime_ids, user_dummies], axis=1)
    per_anime = per_row.groupby("anime_id").sum()
    per_anime = per_anime.add_prefix("ohe_user_id_")
    return per_anime.reset_index()


class StaticFeature:
    """Pre-computed per-user and per-anime feature tables.

    On construction the one-hot id encodings are built (or loaded from the
    pickle cache) from train and test combined, then joined onto a table of
    all user ids and a table of all anime ids. ``apply`` merges both tables
    onto any frame that has ``user_id`` and ``anime_id`` columns.

    ``anime_df`` and ``use_user_rating`` are accepted but not used by this
    sample.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        anime_df: pd.DataFrame,
        use_user_rating: bool,
    ):
        # No categorical columns are registered in this sample.
        self.category_columns = []

        train_test_df = pd.concat([train_df, test_df])

        # id encodings (cached as pickle and dumped as CSV under these names)
        self.encoded_anime_id = make_cache_and_csv(
            name="encoded_anime_id",
            make_func=lambda: encode_anime_id(train_test_df=train_test_df),
            use_cache=True,
        )
        self.encoded_user_id = make_cache_and_csv(
            name="encoded_user_id",
            make_func=lambda: encode_user_id(train_test_df=train_test_df),
            use_cache=True,
        )

        # Base tables holding every id, one row each.
        all_users_df = pd.DataFrame(list(range(USER_COUNT)), columns=["user_id"])
        all_animes_df = pd.DataFrame(list(range(ANIME_COUNT)), columns=["anime_id"])

        # Users get the "which anime" encoding, anime get the "which user" one.
        self.user_feature_df = left_join(all_users_df, self.encoded_anime_id, on="user_id")
        self.anime_feature_df = left_join(all_animes_df, self.encoded_user_id, on="anime_id")

    def apply(self, train_df: pd.DataFrame):
        """Return *train_df* with the user and anime feature columns joined on."""
        with_user_features = left_join(train_df, self.user_feature_df, on="user_id")
        return left_join(with_user_features, self.anime_feature_df, on="anime_id")


In [14]:
from sklearn.metrics import mean_squared_error


def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5


# モデル

In [15]:
import pandas as pd
import numpy as np
import lightgbm as lgb


class Model:
    """LightGBM regressor trained on the features supplied by a StaticFeature.

    An instance wraps one trained booster (one CV fold). The static helpers
    ``fit``, ``predicts`` and ``get_importance`` operate on a list of such
    instances and are meant to be called on the class, e.g. ``Model.fit(...)``.
    """

    def __init__(self, use_user_rating: bool):
        # Trained LightGBM booster; set by train().
        self.lgb_model = None
        # Feature column names of the training matrix; set by train().
        self.train_columns = None
        # Stored for reference only; not read by the methods of this class.
        self.use_user_rating = use_user_rating

    def __add_feature(self, df: pd.DataFrame, sf: StaticFeature):
        """Join the static features onto *df* and drop the raw id columns."""
        df = sf.apply(df)

        # Drop the ids themselves; only the joined feature columns remain.
        df = df.drop("user_id", axis=1)
        df = df.drop("anime_id", axis=1)

        return df

    def train(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        anime_df: pd.DataFrame,
        sf: StaticFeature,
        fold: int,
    ):
        """Train the booster on *train_df*, early-stopping on *test_df*.

        Args:
            train_df: Training rows with ``user_id``, ``anime_id``, ``score``.
            test_df: Validation rows with the same columns.
            anime_df: Not used by this sample.
            sf: Static features joined onto both frames.
            fold: Fold number; not used by this sample.

        Side effects:
            Sets ``self.lgb_model`` and ``self.train_columns`` and prints the
            parameters and LightGBM's evaluation log.
        """
        X_train_df = train_df[["user_id", "anime_id"]]
        X_test_df = test_df[["user_id", "anime_id"]]
        y_train_df = train_df["score"]
        y_test_df = test_df["score"]

        # Add the features
        X_train_df = self.__add_feature(
            X_train_df,
            sf=sf,
        )
        X_test_df = self.__add_feature(
            X_test_df,
            sf=sf,
        )

        category_columns = sf.category_columns

        # Parameters
        params = {
            "objective": "regression",
            "metric": "rmse",
            "extra_trees": True,
            "learning_rate": 0.01,
            "num_leaves": 63,
            "feature_fraction": 0.8,
            "bagging_freq": 1,
            "bagging_fraction": 0.8,
            "random_state": 0,
            # "verbose": -1,
        }
        print(f"params: {params}")

        lgb_train = lgb.Dataset(X_train_df, y_train_df)
        lgb_test = lgb.Dataset(X_test_df, y_test_df)

        # Stop once the validation metric has not improved for this many
        # rounds, and log the metric every 100 rounds.
        stopping_rounds = 500
        callbacks = [
            lgb.early_stopping(
                stopping_rounds=stopping_rounds,
                verbose=True,
            ),
            lgb.log_evaluation(100),
        ]

        # The round limit is set very high; early stopping is what ends training.
        lgb_model = lgb.train(
            params,
            lgb_train,
            categorical_feature=category_columns,
            valid_sets=lgb_test,
            num_boost_round=1000000,
            callbacks=callbacks,
        )

        self.lgb_model = lgb_model
        self.train_columns = X_train_df.columns

    def predict(
        self,
        test_df: pd.DataFrame,
        anime_df: pd.DataFrame,
        sf: StaticFeature,
    ):
        """Return the booster's predictions for the rows of *test_df*.

        Only the ``user_id`` and ``anime_id`` columns of *test_df* are read;
        *anime_df* is not used by this sample.
        """
        X_test_df = test_df[["user_id", "anime_id"]]
        X_test_df = self.__add_feature(X_test_df, sf=sf)
        return self.lgb_model.predict(X_test_df)

    @staticmethod
    def fit(
        train_df: pd.DataFrame,
        anime_df: pd.DataFrame,
        sf: StaticFeature,
        use_user_rating: bool,
        cv: list[tuple[list[int], list[int]]],
    ):
        """Train one model per CV fold and collect out-of-fold predictions.

        Args:
            train_df: All training rows, including ``score``.
            anime_df: Passed through to ``train`` / ``predict``.
            sf: Static features shared by all folds.
            use_user_rating: Passed to each ``Model`` constructor.
            cv: ``(train_positions, valid_positions)`` pairs, used with
                ``DataFrame.iloc``.

        Returns:
            ``(oof_pred, models, rmses)`` where ``oof_pred`` is a float32
            array aligned with *train_df*, ``models`` holds one trained
            ``Model`` per fold, and ``rmses`` holds the per-fold RMSEs
            followed by the RMSE over all out-of-fold predictions.
        """
        models = []
        n_records = len(train_df)
        oof_pred = np.zeros((n_records,), dtype=np.float32)
        rmses = []

        for i, (idx_train, idx_valid) in enumerate(cv):
            div_train_df = train_df.iloc[idx_train].reset_index(drop=True)
            div_test_df = train_df.iloc[idx_valid].reset_index(drop=True)

            model = Model(use_user_rating=use_user_rating)

            model.train(
                train_df=div_train_df,
                test_df=div_test_df,
                anime_df=anime_df,
                sf=sf,
                fold=i,
            )

            pred_i = model.predict(
                div_test_df.drop("score", axis=1),
                anime_df=anime_df,
                sf=sf,
            )
            oof_pred[idx_valid] = pred_i
            models.append(model)
            score = root_mean_squared_error(div_test_df["score"], pred_i)
            print(f" - fold{i} - {score:.4f}")
            rmses.append(score)

        score = root_mean_squared_error(train_df["score"], oof_pred)

        print("=" * 50)
        print(f"FINISHI: Whole Score: {score:.4f}")
        # The overall score is appended last; readers treat rmses[-1] as total.
        rmses.append(score)

        return oof_pred, models, rmses

    @staticmethod
    def predicts(
        models,
        test_df: pd.DataFrame,
        anime_df: pd.DataFrame,
        sf: StaticFeature,
    ):
        """Return the fold-averaged prediction for *test_df*, clipped to [1, 10]."""
        pred = np.array(
            [
                model.predict(
                    test_df,
                    anime_df=anime_df,
                    sf=sf,
                )
                for model in models
            ]
        )
        pred = np.mean(pred, axis=0)
        pred = np.clip(pred, a_min=1, a_max=10)
        return pred

    @staticmethod
    def get_importance(models):
        """Return a frame of gain importances per fold plus their total.

        Columns: ``column`` (feature name taken from the first model),
        ``column_copy`` (the name quoted and followed by a comma, handy for
        pasting into a list), ``fold_<i>`` for each model, and ``total``.
        """
        df = pd.DataFrame()
        df["column"] = models[0].train_columns
        df["column_copy"] = [f'"{x}",' for x in models[0].train_columns]
        total = None
        for i, model in enumerate(models):
            column = f"fold_{i}"
            df[column] = model.lgb_model.feature_importance(importance_type="gain")

            if i == 0:
                # Copy so the running total does not alias the fold_0 column.
                total = df[column].copy()
            else:
                total += df[column]
        df["total"] = total
        return df


# seenユーザー用の学習

In [16]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import StratifiedKFold


def ready_data():
    """Load the data, build the features and the CV split for seen users.

    The split is a 5-fold ``StratifiedKFold`` (shuffled, ``random_state=510``)
    stratified on ``score``.

    Returns:
        ``(train_df, test_df, anime_df, sf, cv)`` where ``cv`` is a list of
        ``(train_positions, valid_positions)`` pairs.
    """
    # Data (the fitted encoders are not needed here)
    train_df, test_df, anime_df, _user_id_encoder, _anime_id_encoder = load()

    # Features
    sf = StaticFeature(train_df=train_df, test_df=test_df, anime_df=anime_df, use_user_rating=True)

    # cv
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=510)
    feature_part = train_df.drop("score", axis=1)
    cv = list(splitter.split(feature_part, train_df["score"]))

    return train_df, test_df, anime_df, sf, cv


def do_train(
    train_df: pd.DataFrame,
    anime_df: pd.DataFrame,
    sf: StaticFeature,
    cv: list[tuple[list[int], list[int]]],
):
    """Run ``Model.fit`` with ``use_user_rating=True`` and return its result.

    Returns:
        ``(train_predict_scores, models, rmses)`` exactly as produced by
        ``Model.fit``.
    """
    fit_result = Model.fit(
        train_df=train_df,
        anime_df=anime_df,
        sf=sf,
        use_user_rating=True,
        cv=cv,
    )
    train_predict_scores, models, rmses = fit_result
    return train_predict_scores, models, rmses


def train():
    """Train the seen-user models and persist the results.

    Writes ``cache/rating_user_model.pkl`` containing
    ``(train_predict_scores, models, rmses)`` and
    ``rating_importance.csv`` with the per-fold feature importances.
    """
    train_df, test_df, anime_df, sf, cv = ready_data()
    train_predict_scores, models, rmses = do_train(train_df=train_df, anime_df=anime_df, sf=sf, cv=cv)

    # Use a context manager so the file is flushed and closed even on error.
    with open("cache/rating_user_model.pkl", "wb") as model_file:
        pickle.dump((train_predict_scores, models, rmses), model_file)

    # Feature importance
    importance_df = Model.get_importance(models)
    importance_df.to_csv("rating_importance.csv", encoding="utf-8-sig")


# Run the seen-user training when this code is executed as the main module.
if __name__ == "__main__":
    train()


params: {'objective': 'regression', 'metric': 'rmse', 'extra_trees': True, 'learning_rate': 0.01, 'num_leaves': 63, 'feature_fraction': 0.8, 'bagging_freq': 1, 'bagging_fraction': 0.8, 'random_state': 0}
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7906
[LightGBM] [Info] Number of data points in the train set: 109120, number of used features: 3953
[LightGBM] [Info] Start training from score 7.768759
Training until validation scores don't improve for 500 rounds
[100]	valid_0's rmse: 1.41193
[200]	valid_0's rmse: 1.34415
[300]	valid_0's rmse: 1.30062
[400]	valid_0's rmse: 1.26828
[500]	valid_0's rmse: 1.24525
[600]	valid_0's rmse: 1.2296
[700]	valid_0's rmse: 1.21775
[800]	valid_0's rmse: 1.20905
[900]	valid_0's rmse: 1.20264
[1000]	valid_0's rmse: 1.19802
[1100]	valid_0's rmse: 1.19411
[1200]	valid_0's rmse: 1.19098
[1300]	valid_0's rmse: 1.188
[1400]	valid_0's rmse: 1.18577
[1500]	valid_0's rmse: 1.18366
[1600]	valid_0's rmse: 1.18159
[1700]	va

# unseenユーザー用の学習

In [17]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold


def ready_data():
    """Load the data, build the features and the CV split for unseen users.

    The split is a 5-fold ``StratifiedGroupKFold`` (shuffled,
    ``random_state=8765``) stratified on ``score`` and grouped by
    ``user_id``, so a user's rows never fall on both sides of a fold.

    Returns:
        ``(train_df, test_df, anime_df, sf, cv)`` where ``cv`` is a list of
        ``(train_positions, valid_positions)`` pairs.
    """
    # Data (the fitted encoders are not needed here)
    train_df, test_df, anime_df, _user_id_encoder, _anime_id_encoder = load()

    # Features
    sf = StaticFeature(train_df=train_df, test_df=test_df, anime_df=anime_df, use_user_rating=False)

    # cv
    splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=8765)
    fold_iter = splitter.split(train_df, train_df["score"], train_df["user_id"])
    cv = list(fold_iter)

    return train_df, test_df, anime_df, sf, cv


def do_train(
    train_df: pd.DataFrame,
    anime_df: pd.DataFrame,
    sf: StaticFeature,
    cv: list[tuple[list[int], list[int]]],
):
    """Run ``Model.fit`` with ``use_user_rating=False`` and return its result.

    Returns:
        ``(train_predict_scores, models, rmses)`` exactly as produced by
        ``Model.fit``.
    """
    fit_result = Model.fit(
        train_df=train_df,
        anime_df=anime_df,
        sf=sf,
        use_user_rating=False,
        cv=cv,
    )
    train_predict_scores, models, rmses = fit_result
    return train_predict_scores, models, rmses


def train():
    """Train the unseen-user models and persist the results.

    Writes ``cache/unknown_user_model.pkl`` containing
    ``(train_predict_scores, models, rmses)`` and
    ``unknown_importance.csv`` with the per-fold feature importances.
    """
    train_df, test_df, anime_df, sf, cv = ready_data()
    train_predict_scores, models, rmses = do_train(train_df=train_df, anime_df=anime_df, sf=sf, cv=cv)

    # Use a context manager so the file is flushed and closed even on error.
    with open("cache/unknown_user_model.pkl", "wb") as model_file:
        pickle.dump((train_predict_scores, models, rmses), model_file)

    # Feature importance
    importance_df = Model.get_importance(models)
    importance_df.to_csv("unknown_importance.csv", encoding="utf-8-sig")


# Run the unseen-user training when this code is executed as the main module.
if __name__ == "__main__":
    train()


params: {'objective': 'regression', 'metric': 'rmse', 'extra_trees': True, 'learning_rate': 0.01, 'num_leaves': 63, 'feature_fraction': 0.8, 'bagging_freq': 1, 'bagging_fraction': 0.8, 'random_state': 0}
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7904
[LightGBM] [Info] Number of data points in the train set: 107816, number of used features: 3952
[LightGBM] [Info] Start training from score 7.775182
Training until validation scores don't improve for 500 rounds
[100]	valid_0's rmse: 1.46401
[200]	valid_0's rmse: 1.42344
[300]	valid_0's rmse: 1.40453
[400]	valid_0's rmse: 1.3939
[500]	valid_0's rmse: 1.38686
[600]	valid_0's rmse: 1.38249
[700]	valid_0's rmse: 1.3799
[800]	valid_0's rmse: 1.37822
[900]	valid_0's rmse: 1.3768
[1000]	valid_0's rmse: 1.37577
[1100]	valid_0's rmse: 1.3754
[1200]	valid_0's rmse: 1.37465
[1300]	valid_0's rmse: 1.37422
[1400]	valid_0's rmse: 1.37378
[1500]	v

# 2つの学習結果をまとめてsubmission.csvを作る

In [18]:
import pandas as pd
import math
import pickle


def make():
    """Combine the seen-user and unseen-user models into ``submission.csv``.

    Loads both pickled model sets, prints their CV RMSEs and a weighted
    estimate of the final RMSE, predicts the test set with both model sets,
    and for every test row keeps the seen-user prediction when the row's
    user appears in the training data and the unseen-user prediction
    otherwise. The chosen scores are written to ``submission.csv`` and the
    number of rows taken from each model set is printed.
    """

    def print_rmses(title, rmses):
        # The last entry is the overall score, the others are per-fold scores.
        print(title)
        for i, rmse in enumerate(rmses):
            if i == len(rmses) - 1:
                print(f"Total: {rmse}")
            else:
                print(f"Fold {i}: {rmse}")

    # Data
    train_df, test_df, anime_df, user_id_encoder, anime_id_encoder = load()

    # Features
    sf_rating = StaticFeature(train_df=train_df, test_df=test_df, anime_df=anime_df, use_user_rating=True)
    sf_unknown = StaticFeature(train_df=train_df, test_df=test_df, anime_df=anime_df, use_user_rating=False)

    # Load the models. The out-of-fold training predictions stored alongside
    # them are not needed here.
    # NOTE: pickle must only be used on files this project wrote itself.
    with open("cache/rating_user_model.pkl", "rb") as model_file:
        _oof_rating, rating_user_models, rating_user_rmses = pickle.load(model_file)
    with open("cache/unknown_user_model.pkl", "rb") as model_file:
        _oof_unknown, unknown_user_models, unknown_user_rmses = pickle.load(model_file)

    print_rmses("rating_user_rmses", rating_user_rmses)
    print_rmses("unknown_user_rmses", unknown_user_rmses)

    # Weight of the unseen-user error in the combined estimate.
    # NOTE(review): hard-coded; it appears to be the share of test rows that
    # end up in the "unknown" branch below — confirm if the data changes.
    c_unseen = 0.22735307114449846
    c_seen = 1 - c_unseen
    final_rmse = math.sqrt((rating_user_rmses[-1] ** 2) * c_seen + (unknown_user_rmses[-1] ** 2) * c_unseen)
    print(f"final_rmse: {final_rmse}")

    def predict(models, sf):
        return Model.predicts(
            models,
            test_df,
            anime_df=anime_df,
            sf=sf,
        )

    predict_rating_user_scores = predict(rating_user_models, sf_rating)
    predict_unknown_user_scores = predict(unknown_user_models, sf_unknown)

    # Users that have at least one rating in the training data
    seen_user_ids = set(train_df["user_id"])

    # Per test row, pick the prediction that matches whether the user is seen.
    # Iterating the user_id column directly (instead of unpacking itertuples)
    # keeps this independent of how many columns test_df has.
    predict_scores = []
    unknown_count = 0
    rating_count = 0
    for user_id, rating_score, unknown_score in zip(
        test_df["user_id"],
        predict_rating_user_scores,
        predict_unknown_user_scores,
    ):
        if user_id in seen_user_ids:
            predict_scores.append(rating_score)
            rating_count += 1
        else:
            predict_scores.append(unknown_score)
            unknown_count += 1

    submission_df = pd.DataFrame()
    submission_df["score"] = predict_scores
    submission_df.to_csv("submission.csv", index=False)

    total_count = rating_count + unknown_count
    print(f"rating  : {rating_count} / {total_count} {rating_count / total_count * 100}%")
    print(f"unknown : {unknown_count} / {total_count} {unknown_count / total_count * 100}%")


# Build submission.csv when this code is executed as the main module.
if __name__ == "__main__":
    make()


rating_user_rmses
Fold 0: 1.1100638680528725
Fold 1: 1.1097252114105214
Fold 2: 1.1049176192253898
Fold 3: 1.1129052620129625
Fold 4: 1.1122746037711877
Total: 1.10998087512464
unknown_user_rmses
Fold 0: 1.36922363674894
Fold 1: 1.3651781289734106
Fold 2: 1.389827162363497
Fold 3: 1.3744537957895366
Fold 4: 1.4176099144026255
Total: 1.3829468015701127
final_rmse: 1.1776110315083412
rating  : 90922 / 117676 77.26469288555016%
unknown : 26754 / 117676 22.735307114449846%
