In [1]:
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from src.config import cfg
from src.data import anime_id_label_encoding, load_data, user_id_label_encoding
from src.dir import create_dir
from src.seed import seed_everything

pl.Config.set_fmt_str_lengths(100)
pl.Config.set_tbl_rows(50)
pl.Config.set_tbl_cols(100)

seed_everything(cfg.seed)


## 使用する最低限のデータを準備


In [2]:
train_df, test_df, anime_df = load_data()

# testをseen, unseenで分ける
train_user_list = train_df["user_id"].unique().to_list()
seen_test_df = test_df.filter(pl.col("user_id").is_in(train_user_list))
unseen_test_df = test_df.filter(~pl.col("user_id").is_in(train_user_list))


## CV

In [8]:
# seen用のCV
skf = StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed)

train_df = train_df.with_columns(fold=pl.lit(-1))

for fold, (_, val_index) in enumerate(skf.split(train_df, train_df["user_id"])):
    train_df[val_index, "fold"] = fold

scores_lgb = []
models_lgb = []
feature_importances = []

for fold in range(cfg.n_splits):
    print(f"Training for fold: {fold}...")

    train_data = train_df.filter(pl.col("fold") != fold)
    val_data = train_df.filter(pl.col("fold") == fold)

    features = test_df.columns
    target = "score"

    lgb_train = lgb.Dataset(train_data[features].to_pandas(), train_data[target].to_pandas())
    lgb_val = lgb.Dataset(val_data[features].to_pandas(), val_data[target].to_pandas())

    callbacks = [
        lgb.early_stopping(stopping_rounds=cfg.lgb.early_stopping_rounds),
        lgb.log_evaluation(cfg.lgb.log_evaluation_period),
    ]
    model_lgb = lgb.train(
        cfg.lgb.params,
        lgb_train,
        valid_sets=[lgb_val],
        callbacks=callbacks,
        num_boost_round=cfg.lgb.num_boost_round,
    )

    # # Save the model
    # with open(f"model_lgb_{fold}.pkl", "wb") as f:
    #     pickle.dump(model_lgb, f)

#     # Predict the validation data
#     val_pred_lgb = model_lgb.predict(val_data[features].to_pandas(), num_iteration=model_lgb.best_iteration)

#     # Evaluate the model
#     score_lgb = np.sqrt(mean_squared_error(val_data[target].to_pandas(), val_pred_lgb))
#     scores_lgb.append(score_lgb)

#     print(f"RMSE for fold {fold}: {score_lgb}")

#     # Save feature importances
#     feature_importances.append(model_lgb.feature_importance(importance_type="gain"))

# # Calculate the average score
# average_score_lgb = np.mean(scores_lgb)

# print(f"Average RMSE: {average_score_lgb}")

# # Calculate the average feature importance
# average_feature_importance = np.mean(feature_importances, axis=0)
# feature_importance_df = pd.DataFrame({"feature": features, "importance": average_feature_importance}).sort_values(
#     by="importance", ascending=False
# )

# print("Feature Importances:")
# print(feature_importance_df)

# # Predict the test data and create the submission file
# submission_df = pl.read_csv("../data/input/sample_submission.csv", try_parse_dates=True)
# submission_df = submission_df.with_columns(pl.lit(0).alias("score"))

# for fold in range(5):
#     with open(f"model_lgb_{fold}.pkl", "rb") as f:
#         model_lgb = pickle.load(f)
#     test_pred_lgb = model_lgb.predict(test_df[features].to_pandas(), num_iteration=model_lgb.best_iteration)
#     submission_df = submission_df.with_columns((pl.col("score") + pl.Series(test_pred_lgb) / 5).alias("score"))

# submission_df.write_csv("../data/output/submission_baseline_a2v_rating.csv")


Training for fold: 0...
Training for fold: 1...
Training for fold: 2...
Training for fold: 3...
Training for fold: 4...




## モデル