In [1]:
import os
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from src.config import cfg
from src.data import anime_id_label_encoding, load_data, user_id_label_encoding
from src.dir import create_dir
from src.seed import seed_everything

# Widen polars' display limits so wide/long frames are shown in full in cell outputs.
pl.Config.set_tbl_cols(100)
pl.Config.set_tbl_rows(50)
pl.Config.set_fmt_str_lengths(100)

# Seed once, up front, with the seed stored in the project config.
seed_everything(cfg.seed)


## 使用する最低限のデータを準備


In [29]:
# Load the train / test interaction frames and the anime metadata frame.
train_df, test_df, anime_df = load_data()
# Encode user_id across train and test together.
# NOTE(review): presumably maps raw ids to integer labels shared by both frames - confirm in src.data.
train_df, test_df = user_id_label_encoding(train_df, test_df)
# Encode anime_id across all three frames (same assumption as above - TODO confirm).
train_df, test_df, anime_df = anime_id_label_encoding(train_df, test_df, anime_df)


In [None]:
# Split the test set into "seen" rows (user also present in train) and "unseen" rows (user absent from train).
train_user_list = train_df["user_id"].unique().to_list()

# Build the membership predicate once and reuse it (and its negation) for both halves.
is_seen_user = pl.col("user_id").is_in(train_user_list)
seen_test_df = test_df.filter(is_seen_user)
unseen_test_df = test_df.filter(~is_seen_user)


## seen CV

In [21]:
# Cross-validation for "seen" users: folds are stratified on user_id, so each user's
# ratings are spread across the training and validation sides of the folds.
skf = StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed)

# Collect the fold assignment in a NumPy buffer and attach it in a single step,
# instead of mutating the polars frame in place once per fold.
fold_ids = np.full(train_df.height, -1, dtype=np.int32)
for fold, (_, val_index) in enumerate(skf.split(train_df, train_df["user_id"])):
    fold_ids[val_index] = fold
train_df = train_df.with_columns(pl.Series("fold", fold_ids))

# Loop-invariant settings. `features` and `seen_model_dir_path` are also read by the
# prediction cell further down, so they are defined once here rather than inside the loop.
features = test_df.columns
target = "score"

# NOTE(review): the recorded run logs an early-stopping patience of 500 rounds, which is
# larger than this cap, so early stopping can never trigger and every fold simply trains
# for the full 100 rounds. Raise the cap (or lower the patience) if that is unintended.
num_boost_round = 100

seen_model_dir_path = os.path.join(cfg.data.model_path, "lgb", "seen")
create_dir(seen_model_dir_path)

scores_lgb = []
models_lgb = []
feature_importances = []

for fold in range(cfg.n_splits):
    print(f"Training for fold: {fold}...")

    train_data = train_df.filter(pl.col("fold") != fold)
    val_data = train_df.filter(pl.col("fold") == fold)

    # Convert to pandas once per fold; the validation frame is reused for prediction below.
    X_train = train_data[features].to_pandas()
    y_train = train_data[target].to_pandas()
    X_val = val_data[features].to_pandas()
    y_val = val_data[target].to_pandas()

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)

    callbacks = [
        lgb.early_stopping(stopping_rounds=cfg.lgb.early_stopping_rounds),
        lgb.log_evaluation(cfg.lgb.log_evaluation_period),
    ]
    model_lgb = lgb.train(
        dict(cfg.lgb.params),
        lgb_train,
        valid_sets=[lgb_val],
        callbacks=callbacks,
        num_boost_round=num_boost_round,
    )
    models_lgb.append(model_lgb)

    # Persist the fold model so the prediction cell can reload it from disk.
    with open(f"{seen_model_dir_path}/model_lgb_{fold}.pkl", "wb") as f:
        pickle.dump(model_lgb, f)

    # Score the held-out fold at the best iteration found during training.
    val_pred_lgb = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
    score_lgb = np.sqrt(mean_squared_error(y_val, val_pred_lgb))
    scores_lgb.append(score_lgb)

    print(f"RMSE for fold {fold}: {score_lgb}")

    # Gain-based importance, averaged across folds after the loop.
    feature_importances.append(model_lgb.feature_importance(importance_type="gain"))

# Mean of the per-fold RMSEs.
seen_average_score_lgb = np.mean(scores_lgb)

print(f"Average RMSE: {seen_average_score_lgb}")

# Average the per-fold importances and rank features from most to least important.
average_feature_importance = np.mean(feature_importances, axis=0)
feature_importance_df = pd.DataFrame({"feature": features, "importance": average_feature_importance}).sort_values(
    by="importance", ascending=False
)




Training for fold: 0...
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.53748
Directory already exists: ../model/lgb/seen
RMSE for fold 0: 1.5374844241544021
Training for fold: 1...
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.56279
Directory already exists: ../model/lgb/seen
RMSE for fold 1: 1.5627852306976078
Training for fold: 2...
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.53774
Directory already exists: ../model/lgb/seen
RMSE for fold 2: 1.5377358902690097
Training for fold: 3...
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.55072
Directory already exists: ../model/lgb/seen
RMSE for fold 3: 1.5507228647179587
Training for fold: 4...


In [30]:
# Predict on seen_test_df (the result is joined back onto test_df later).
# The pandas feature matrix is the same for every fold model, so build it once.
seen_test_features = seen_test_df[features].to_pandas()

# Accumulates the equally weighted mean of the per-fold predictions.
seen_test_score = np.zeros(seen_test_df.height)

for fold in range(cfg.n_splits):
    # These pickles are the ones written by the seen-CV training cell; never point
    # pickle.load at files from an untrusted source.
    with open(f"{seen_model_dir_path}/model_lgb_{fold}.pkl", "rb") as f:
        model_lgb = pickle.load(f)
    seen_test_pred_lgb = model_lgb.predict(seen_test_features, num_iteration=model_lgb.best_iteration)
    seen_test_score += seen_test_pred_lgb / cfg.n_splits

# Attach (or overwrite) the averaged prediction as the "score" column in one step.
seen_test_df = seen_test_df.with_columns(pl.Series("score", seen_test_score))


## unseen CV

In [23]:
# Cross-validation for "unseen" users: GroupKFold keeps all rows of a user in the same
# fold, so validation users never appear on the training side.
gkf = GroupKFold(n_splits=cfg.n_splits)

# Collect the fold assignment in a NumPy buffer and attach it in a single step,
# instead of mutating the polars frame in place once per fold.
fold_ids = np.full(train_df.height, -1, dtype=np.int32)
for fold, (_, val_index) in enumerate(gkf.split(train_df, groups=train_df["user_id"])):
    fold_ids[val_index] = fold
train_df = train_df.with_columns(pl.Series("fold", fold_ids))

# Loop-invariant settings. `features` and `unseen_model_dir_path` are also read by the
# prediction cell further down, so they are defined once here rather than inside the loop.
# NOTE(review): features are taken from test_df.columns; if that includes user_id, the
# model is fed ids that are by construction absent from its training folds - confirm
# this is intended for the unseen-user model.
features = test_df.columns
target = "score"

# NOTE(review): the recorded run logs an early-stopping patience of 500 rounds, which is
# larger than this cap, so early stopping can never trigger; only the best-iteration
# bookkeeping has an effect. Raise the cap (or lower the patience) if that is unintended.
num_boost_round = 100

unseen_model_dir_path = os.path.join(cfg.data.model_path, "lgb", "unseen")
create_dir(unseen_model_dir_path)

scores_lgb = []
models_lgb = []
feature_importances = []

for fold in range(cfg.n_splits):
    print(f"Training for fold: {fold}...")

    train_data = train_df.filter(pl.col("fold") != fold)
    val_data = train_df.filter(pl.col("fold") == fold)

    # Convert to pandas once per fold; the validation frame is reused for prediction below.
    X_train = train_data[features].to_pandas()
    y_train = train_data[target].to_pandas()
    X_val = val_data[features].to_pandas()
    y_val = val_data[target].to_pandas()

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)

    callbacks = [
        lgb.early_stopping(stopping_rounds=cfg.lgb.early_stopping_rounds),
        lgb.log_evaluation(cfg.lgb.log_evaluation_period),
    ]
    model_lgb = lgb.train(
        dict(cfg.lgb.params),
        lgb_train,
        valid_sets=[lgb_val],
        callbacks=callbacks,
        num_boost_round=num_boost_round,
    )
    models_lgb.append(model_lgb)

    # Persist the fold model so the prediction cell can reload it from disk.
    with open(f"{unseen_model_dir_path}/model_lgb_{fold}.pkl", "wb") as f:
        pickle.dump(model_lgb, f)

    # Score the held-out fold at the best iteration found during training.
    val_pred_lgb = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
    score_lgb = np.sqrt(mean_squared_error(y_val, val_pred_lgb))
    scores_lgb.append(score_lgb)

    print(f"RMSE for fold {fold}: {score_lgb}")

    # Gain-based importance, averaged across folds after the loop.
    feature_importances.append(model_lgb.feature_importance(importance_type="gain"))

# Mean of the per-fold RMSEs.
unseen_average_score_lgb = np.mean(scores_lgb)

print(f"Average RMSE: {unseen_average_score_lgb}")

# Average the per-fold importances and rank features from most to least important.
average_feature_importance = np.mean(feature_importances, axis=0)
feature_importance_df = pd.DataFrame({"feature": features, "importance": average_feature_importance}).sort_values(
    by="importance", ascending=False
)


Training for fold: 0...
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.53013
Directory already exists: ../model/lgb/unseen
RMSE for fold 0: 1.530132072103847
Training for fold: 1...
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[2]	valid_0's rmse: 1.54732
Directory already exists: ../model/lgb/unseen
RMSE for fold 1: 1.547320451954893
Training for fold: 2...
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[99]	valid_0's rmse: 1.55414
Directory already exists: ../model/lgb/unseen
RMSE for fold 2: 1.5541364323221365
Training for fold: 3...
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.58331
Directory already exists: ../model/lgb/unseen
RMSE for fold 3: 1.5833075385267985
Training for fold: 4.

In [31]:
# Predict on unseen_test_df (the result is joined back onto test_df later).
# The pandas feature matrix is the same for every fold model, so build it once.
unseen_test_features = unseen_test_df[features].to_pandas()

# Accumulates the equally weighted mean of the per-fold predictions.
unseen_test_score = np.zeros(unseen_test_df.height)

for fold in range(cfg.n_splits):
    # These pickles are the ones written by the unseen-CV training cell; never point
    # pickle.load at files from an untrusted source.
    with open(f"{unseen_model_dir_path}/model_lgb_{fold}.pkl", "rb") as f:
        model_lgb = pickle.load(f)
    unseen_test_pred_lgb = model_lgb.predict(unseen_test_features, num_iteration=model_lgb.best_iteration)
    unseen_test_score += unseen_test_pred_lgb / cfg.n_splits

# Attach (or overwrite) the averaged prediction as the "score" column in one step.
unseen_test_df = unseen_test_df.with_columns(pl.Series("score", unseen_test_score))


## CV値を求める
- seen CVのRMSEとunseen CVのRMSEを二乗して、重みをつけて足してルートを取る

In [41]:
# Weights applied to the squared seen / unseen RMSEs before taking the root.
# NOTE(review): presumably the seen / unseen share of test rows - confirm where 0.77 / 0.23 come from.
SEEN_WEIGHT = 0.77
UNSEEN_WEIGHT = 0.23

seen_mse_term = (seen_average_score_lgb**2) * SEEN_WEIGHT
unseen_mse_term = (unseen_average_score_lgb**2) * UNSEEN_WEIGHT
cv_score = np.sqrt(seen_mse_term + unseen_mse_term)

print(f"CV score: {cv_score}")


CV score: 1.5470723894337046


## submissionファイルを作成する

In [38]:
# Stack the seen and unseen predictions, then left-join them onto test_df on
# (user_id, anime_id) to obtain the score for each test row.
seen_unseen_test_df = pl.concat([seen_test_df, unseen_test_df])
sub_score_series = test_df.join(seen_unseen_test_df, on=["user_id", "anime_id"], how="left")["score"]

# Guard the join result: duplicated keys would add rows, unmatched keys would leave nulls.
# NOTE(review): the scores are attached to the submission by position, which relies on the
# left join returning rows in test_df order - confirm for the installed polars version.
assert len(sub_score_series) == test_df.height, "join changed the number of test rows"
assert sub_score_series.null_count() == 0, "some test rows received no prediction"

# Load the sample submission and overwrite its score column with the predictions.
submission_df = pl.read_csv(cfg.data.sample_submission_path, try_parse_dates=True)
assert submission_df.height == len(sub_score_series), "submission and prediction row counts differ"
submission_df = submission_df.with_columns(sub_score_series.alias("score"))

# Build the output path with os.path.join so it is correct with or without a trailing separator.
submission_df.write_csv(os.path.join(cfg.data.output_path, "submission_seen_unseen_cv.csv"))
