In [1]:
# Baseline
import os
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from src.config import Config

# Resolve the project configuration (data paths, seed, LightGBM parameters).
cfg = Config.get_cnf()

# Read the three input tables in order: training rows, anime metadata,
# and the rows to predict. Date-like columns are parsed automatically.
train_df, anime_df, test_df = (
    pl.read_csv(csv_path, try_parse_dates=True)
    for csv_path in (cfg.data.train_path, cfg.data.anime_path, cfg.data.test_path)
)

# Shuffled, seeded 5-fold splitter stratified on the target column.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=cfg.seed)

# One fold id per training row; -1 would mark a row that never landed in a
# validation split. The ids are collected in a NumPy buffer and attached to
# the frame in a single step.
fold_ids = np.full(train_df.height, -1, dtype=np.int32)
for fold, (_, val_index) in enumerate(skf.split(train_df, train_df["score"])):
    # Each row appears in exactly one validation split; that split is its fold.
    fold_ids[val_index] = fold

train_df = train_df.with_columns(pl.Series("fold", fold_ids))


# Attach the anime metadata to both the train and the test rows. A left join
# keeps every row of the left frame even when its anime_id has no match in
# the metadata table (the metadata columns are then null).
train_merged, test_merged = (
    frame.join(anime_df, on="anime_id", how="left") for frame in (train_df, test_df)
)

# Encode the categorical variables as integer codes.
# Each encoder is fit on the union of train and test values so that every
# label seen at prediction time already has a code.
cat_cols = ["user_id", "anime_id", "type", "source", "rating"]
les = []  # fitted encoders, in the same order as cat_cols
for col in cat_cols:
    # Nulls must be replaced identically for fit and transform: fitting on
    # null-filled data but transforming the raw column would raise an
    # "unseen label" error as soon as a null is present.
    train_values = train_merged[col].fill_null("")
    test_values = test_merged[col].fill_null("")

    le = LabelEncoder()
    le.fit(pl.concat([train_values, test_values]))

    # Replace the original column with its integer codes (same column name).
    train_merged = train_merged.with_columns(pl.Series(col, le.transform(train_values)))
    test_merged = test_merged.with_columns(pl.Series(col, le.transform(test_values)))
    les.append(le)

# Training and evaluation with LightGBM

# Features and target are the same for every fold, so they are defined once
# up front. `features` is also used after this loop for test prediction.
features = [
    "user_id",
    "anime_id",
    "type",
    "source",
    "rating",
    "members",
    "watching",
    "completed",
    "on_hold",
    "dropped",
    "plan_to_watch",
]
target = "score"

# Take the fold count from the splitter rather than repeating the literal,
# so the loop cannot drift from the split configuration.
n_folds = skf.get_n_splits()

scores_lgb = []  # validation RMSE per fold
models_lgb = []  # trained booster per fold

for fold in range(n_folds):
    print(f"Training for fold {fold}...")

    # Rows tagged with the current fold are held out for validation;
    # all other rows are used for training.
    train_data = train_merged.filter(pl.col("fold") != fold)
    val_data = train_merged.filter(pl.col("fold") == fold)

    # Convert to pandas once per split; the validation frames are reused
    # for both the LightGBM dataset and the out-of-fold prediction below.
    X_train = train_data[features].to_pandas()
    y_train = train_data[target].to_pandas()
    X_val = val_data[features].to_pandas()
    y_val = val_data[target].to_pandas()

    # Prepare the LightGBM datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)

    # Train the model, stopping once the validation metric has not improved
    # for 20 rounds and logging the metric every 20 rounds.
    # NOTE(review): lgb.train defaults to num_boost_round=100 and the recorded
    # run logs end at iteration 100 with "Did not meet early stopping", so the
    # models are likely still improving. Unless cfg.lgb already sets an
    # iteration count, consider passing a larger num_boost_round.
    callbacks = [lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=20)]
    model_lgb = lgb.train(dict(cfg.lgb), lgb_train, valid_sets=[lgb_val], callbacks=callbacks)
    models_lgb.append(model_lgb)

    # Save the model so it can be reloaded for test prediction
    with open(f"model_lgb_{fold}.pkl", "wb") as f:
        pickle.dump(model_lgb, f)

    # Predict the validation data with the best iteration found
    val_pred_lgb = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)

    # Evaluate the model (RMSE on the held-out fold)
    score_lgb = np.sqrt(mean_squared_error(y_val, val_pred_lgb))
    scores_lgb.append(score_lgb)

    print(f"RMSE for fold {fold}: {score_lgb}")

# Calculate the average score across folds
average_score_lgb = np.mean(scores_lgb)

print(f"Average RMSE: {average_score_lgb}")

# Predict the test data and create the submission file
submission_df = pl.read_csv(cfg.data.sample_submission_path, try_parse_dates=True)

# The test feature matrix is identical for every fold model, so convert it
# to pandas once instead of once per fold.
test_features = test_merged[features].to_pandas()

# Take the fold count from the splitter so the averaging weight always
# matches the number of models that were trained.
n_folds = skf.get_n_splits()

# Accumulate the equally weighted average of the per-fold predictions.
test_pred_lgb = np.zeros(len(test_features), dtype=np.float64)
for fold in range(n_folds):
    # These pickles are produced by the training loop of this script.
    # pickle.load executes arbitrary code, so never point it at untrusted files.
    with open(f"model_lgb_{fold}.pkl", "rb") as f:
        model_lgb = pickle.load(f)
    test_pred_lgb += model_lgb.predict(test_features, num_iteration=model_lgb.best_iteration) / n_folds

# Write the averaged prediction into the "score" column (replacing any
# placeholder values from the sample submission) and save the result.
submission_df = submission_df.with_columns(pl.Series("score", test_pred_lgb))
submission_df.write_csv(os.path.join(cfg.data.output_path, "submission_baseline.csv"))


Training for fold 0...
Training until validation scores don't improve for 20 rounds
[20]	valid_0's rmse: 1.53026
[40]	valid_0's rmse: 1.50732
[60]	valid_0's rmse: 1.49032
[80]	valid_0's rmse: 1.4778
[100]	valid_0's rmse: 1.46868
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.46868
RMSE for fold 0: 1.4686820628595298
Training for fold 1...
Training until validation scores don't improve for 20 rounds
[20]	valid_0's rmse: 1.5309
[40]	valid_0's rmse: 1.50843
[60]	valid_0's rmse: 1.49171
[80]	valid_0's rmse: 1.47937
[100]	valid_0's rmse: 1.47043
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.47043
RMSE for fold 1: 1.470428115439867
Training for fold 2...
Training until validation scores don't improve for 20 rounds
[20]	valid_0's rmse: 1.5305
[40]	valid_0's rmse: 1.50793
[60]	valid_0's rmse: 1.49123
[80]	valid_0's rmse: 1.47913
[100]	valid_0's rmse: 1.47052
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.47052
RMSE for