In [1]:
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import polars as pl
import pytz
import torch
from datasets import Dataset
from omegaconf import OmegaConf
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import StratifiedGroupKFold
from transformers.trainer_utils import get_last_checkpoint  # 最新のチェックポイントのパスを取得する関数

from src.config import cfg
from src.data import add_subject_name_info, preprocess_train
from src.dir import create_dir
from src.seed import seed_everything

# The experiment number is the name of the directory this notebook runs from.
experiment_dir = Path().resolve()
cfg.exp_number = experiment_dir.name

# Dump the fully resolved configuration so the run is self-describing.
resolved_cfg_yaml = OmegaConf.to_yaml(cfg, resolve=True)
print(resolved_cfg_yaml)

# Seed from the config, then widen polars' string display so long texts are not truncated.
seed_everything(cfg.seed)
pl.Config.set_fmt_str_lengths(1000)


  from .autonotebook import tqdm as notebook_tqdm


exp_number: '002'
run_time: base
data:
  input_root: ../../data/input
  train_path: ../../data/input/train.csv
  test_path: ../../data/input/test.csv
  sample_submission_path: ../../data/input/sample_submission.csv
  mapping_path: ../../data/input/misconception_mapping.csv
  mapping_meta_path: ../../data/input/mapping_meta.parquet
  output_root: ../../data/output
  results_root: ../../results
  results_path: ../../results/002/base
seed: 42
k: 25
model:
  model_name: BAAI/bge-large-en-v1.5
  epoch: 2
  lr: 2.0e-05
  batch_size: 8



polars.config.Config

In [2]:
# DEBUG=True shrinks the train/valid datasets to a handful of rows for a quick smoke run.
DEBUG = False

# Mixed-precision flags passed to the training arguments.
# bf16 is preferred when the GPU supports it; fp16 is the fallback, but only when a
# CUDA device actually exists -- requesting fp16 on a CUDA-less machine makes the
# training arguments raise, so both flags stay False there (full precision).
CUDA_AVAILABLE = torch.cuda.is_available()
BF = bool(CUDA_AVAILABLE and torch.cuda.is_bf16_supported())
FP = bool(CUDA_AVAILABLE and not BF)
print(f"torch.cuda.is_bf16_supported()={BF}")


torch.cuda.is_bf16_supported()=True


### metric

In [3]:
def apk(actual, predicted, k=25):
    """
    Average precision at k for a single query.

    Parameters
    ----------
    actual : list
             The relevant elements (order is irrelevant)
    predicted : list
                The ranked predictions (order matters, best first)
    k : int, optional
        Only the first k predictions are scored

    Returns
    -------
    score : double
            The average precision at k; 0.0 when `actual` is empty
    """
    if not actual:
        return 0.0

    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0

    for rank, item in enumerate(top_k, start=1):
        # Skip predictions that are not relevant ...
        if item not in actual:
            continue
        # ... and relevant ones that already appeared at a better rank.
        if item in top_k[: rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    return precision_sum / min(len(actual), k)


def mapk(actual, predicted, k=25):
    """
    Computes the mean average precision at k.

    This function computes the mean average precision at k between two lists
    of lists of items, by averaging `apk` over the paired entries.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists;
            0.0 when there are no (actual, predicted) pairs to score
    """
    per_query_scores = [apk(a, p, k) for a, p in zip(actual, predicted)]

    # np.mean([]) would return nan and emit a RuntimeWarning; an empty
    # evaluation set scores 0.0, matching apk's result for an empty `actual`.
    if not per_query_scores:
        return 0.0

    return np.mean(per_query_scores)


def recall_at_k(actual, predicted, k=25):
    """
    Computes the recall at k for predictions.

    For each query, recall is the fraction of its distinct relevant elements
    that appear among the first k predictions.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists; duplicates count once)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements to evaluate

    Returns
    -------
    score : float
            The mean recall@k over the input lists; 0.0 when there are no
            queries. A query with no relevant elements contributes 0.0.
    """
    scores = []

    for act, pred in zip(actual, predicted):
        # Deduplicate so numerator and denominator both count distinct elements;
        # otherwise a repeated id in `act` would cap recall below 1.0.
        relevant = set(act)
        if not relevant:
            scores.append(0.0)
            continue
        retrieved = set(pred[:k])
        scores.append(len(relevant & retrieved) / len(relevant))

    # np.mean([]) would return nan and emit a RuntimeWarning.
    if not scores:
        return 0.0

    return np.mean(scores)


### Data Load

In [4]:
# データの読み込み
train = pl.read_csv(cfg.data.train_path, try_parse_dates=True)
test = pl.read_csv(cfg.data.test_path, try_parse_dates=True)
sample_submission = pl.read_csv(cfg.data.sample_submission_path, try_parse_dates=True)
mapping = pl.read_csv(cfg.data.mapping_path, try_parse_dates=True)

# CV
gkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=cfg.seed)


In [5]:
# trainの前処理
train_long = preprocess_train(train)
train_long.head(3)


QuestionId,ConstructName,SubjectName,QuestionText,CorrectAnswer,AnswerType,AnswerText,AllText,AnswerAlphabet,QuestionId_Answer,MisconceptionId
i64,str,str,str,str,str,str,str,str,str,i64
0,"""Use the order of operations to carry out calculations involving powers""","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do the brackets need to go to make the answer equal \( 13 \) ?""","""A""","""AnswerDText""","""Does not need brackets""","""ConstructName: Use the order of operations to carry out calculations involving powers SubjectName: BIDMAS QuestionText: \[ 3 \times 2+4-5 \] Where do the brackets need to go to make the answer equal \( 13 \) ? AnswerText: Does not need brackets""","""D""","""0_D""",1672
1000,"""Simplify an algebraic fraction by factorising the numerator""","""Simplifying Algebraic Fractions""","""Simplify the following, if possible: \( \frac{1-t}{t-1} \)""","""B""","""AnswerAText""","""\( t \)""","""ConstructName: Simplify an algebraic fraction by factorising the numerator SubjectName: Simplifying Algebraic Fractions QuestionText: Simplify the following, if possible: \( \frac{1-t}{t-1} \) AnswerText: \( t \)""","""A""","""1000_A""",891
1000,"""Simplify an algebraic fraction by factorising the numerator""","""Simplifying Algebraic Fractions""","""Simplify the following, if possible: \( \frac{1-t}{t-1} \)""","""B""","""AnswerCText""","""\( 1 \)""","""ConstructName: Simplify an algebraic fraction by factorising the numerator SubjectName: Simplifying Algebraic Fractions QuestionText: Simplify the following, if possible: \( \frac{1-t}{t-1} \) AnswerText: \( 1 \)""","""C""","""1000_C""",891


In [6]:
# NOTE: the processing below may be unnecessary, so it is left disabled.

# # mappingにSubjectNameの情報を追加
# mapping = add_subject_name_info(train, mapping)

# mapping.head()

# # NOTE: at submission time, also use the test data as shown below, so that the conditions match those used during CV training.
# # train_test = pl.concat([train, test], how="diagonal")
# # mapping = add_subject_name_info(train_test, mapping)


### Make retrieval data

In [7]:
def make_retrieval_data(train_long, mapping, model, k):
    """
    Build (question, correct misconception, retrieved misconception) rows.

    Every question text is embedded with `model`, the k most similar
    misconceptions are retrieved for it, and each retrieved misconception that
    differs from the labelled one becomes one output row.

    Parameters
    ----------
    train_long : pl.DataFrame
        One row per question/answer pair; the "AllText" and "MisconceptionId"
        columns are read here.
    mapping : pl.DataFrame
        Misconception table; the "MisconceptionId" and "MisconceptionName"
        columns are read here.
    model : SentenceTransformer
        Encoder used for both the questions and the misconception names.
    k : int
        Number of misconceptions retrieved per question.

    Returns
    -------
    pl.DataFrame
        `train_long` exploded to one row per retrieved misconception, joined
        with the `mapping` columns of the labelled misconception and, with a
        "Predict" prefix, of the retrieved one. Rows whose retrieved id equals
        the labelled id are removed.
    """
    # Embed the question texts.
    question_vecs = model.encode(train_long["AllText"].to_list(), normalize_embeddings=True)

    # Embed the misconception names.
    # (To use subject-enriched names instead, encode
    #  mapping["MisconceptionName_with_SubjectNames"] here.)
    misconception_vecs = model.encode(mapping["MisconceptionName"].to_list(), normalize_embeddings=True)

    # Question-vs-misconception cosine similarity, ranked from most to least similar.
    cos_sim = cosine_similarity(question_vecs, misconception_vecs)
    top_k_rows = np.argsort(-cos_sim, axis=1)[:, :k]

    # argsort yields row positions within `mapping`, not misconception ids.
    # Translate them so the result stays correct even if `mapping` is not
    # ordered with ids 0..N-1 (identical output when it is).
    misconception_ids = mapping["MisconceptionId"].to_numpy()
    top_k_ids = misconception_ids[top_k_rows]

    # Attach the k retrieved misconception ids to each question row.
    train_long = train_long.with_columns(pl.Series(top_k_ids.tolist()).alias("PredictMisconceptionId"))

    train_retrieved = (
        # One row per (question, retrieved misconception) pair.
        train_long.explode("PredictMisconceptionId")
        # Attach the labelled misconception's columns.
        .join(mapping, on="MisconceptionId")
        # Attach the retrieved misconception's columns under a "Predict" prefix.
        .join(mapping.rename(lambda x: "Predict" + x), on="PredictMisconceptionId")
        # Drop pairs where the retrieved misconception is the labelled one.
        .filter(pl.col("MisconceptionId") != pl.col("PredictMisconceptionId"))
    )

    return train_retrieved


### Fine-tuning

In [8]:
# Cross-validated fine-tuning: for every fold, mine negatives with the pretrained
# encoder, fine-tune it, then score MAP@25 / recall@25 on the held-out fold.

# Stamp this run with the current Tokyo time. The logged "Directory created" path
# contains this timestamp, so cfg.data.results_path presumably interpolates
# cfg.run_time -- confirm in src.config.
japan_tz = pytz.timezone("Asia/Tokyo")
cfg.run_time = datetime.now(japan_tz).strftime("%Y%m%d_%H%M%S")

# Per-fold validation metrics; averaged after the loop.
map_scores = []
recall_scores = []

# Rows sharing a QuestionId are kept in the same fold; folds are stratified by SubjectName.
for i, (train_idx, valid_idx) in enumerate(
    gkf.split(train_long, groups=train_long["QuestionId"], y=train_long["SubjectName"])
):
    # Each fold writes its checkpoints and final model to its own directory.
    save_dir = os.path.join(cfg.data.results_path, f"fold{i+1}")
    create_dir(save_dir)

    # Start every fold from the pretrained weights.
    model = SentenceTransformer(cfg.model.model_name)

    # Retrieve the top-k misconceptions with the not-yet-fine-tuned model for both splits;
    # the retrieved (non-matching) misconceptions are the third column fed to the trainer below.
    train_retrieved = make_retrieval_data(train_long[train_idx], mapping, model, cfg.k)
    valid_retrieved = make_retrieval_data(train_long[valid_idx], mapping, model, cfg.k)
    train_dataset = Dataset.from_polars(train_retrieved)
    valid_dataset = Dataset.from_polars(valid_retrieved)
    if DEBUG:
        # Smoke-test mode: keep only the first 50 rows of each split.
        train_dataset = train_dataset.select(range(50))
        valid_dataset = valid_dataset.select(range(50))
    loss = MultipleNegativesRankingLoss(model)

    print(f"{cfg.model.model_name}のfine-tuningを開始します。({i+1}/{gkf.n_splits}fold)")

    args = SentenceTransformerTrainingArguments(
        # Required parameter:
        output_dir=save_dir,
        # Optional training parameters:
        num_train_epochs=cfg.model.epoch,
        per_device_train_batch_size=cfg.model.batch_size,
        # Accumulate so that batch_size * accumulation steps is 128 rows per device
        # (exact when batch_size divides 128).
        gradient_accumulation_steps=128 // cfg.model.batch_size,
        per_device_eval_batch_size=cfg.model.batch_size,
        eval_accumulation_steps=128 // cfg.model.batch_size,
        learning_rate=cfg.model.lr,
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=FP,  # Set to False if you get an error that your GPU can't run on FP16
        bf16=BF,  # Set to True if you have a GPU that supports BF16
        batch_sampler=BatchSamplers.NO_DUPLICATES,  # no duplicate samples in a batch
        # Optional tracking/debugging parameters:
        lr_scheduler_type="cosine_with_restarts",
        evaluation_strategy="steps",
        # NOTE(review): fractional step values are presumably interpreted by the trainer
        # as a ratio of the total training steps -- confirm against the installed version.
        eval_steps=0.1,
        save_strategy="steps",
        save_steps=0.1,
        save_total_limit=1,
        # Restore the checkpoint with the lowest validation loss when training finishes.
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        logging_steps=100,
        # report_to=REPORT_TO,  # Will be used in W&B if `wandb` is installed
        # run_name=EXP_NAME,
        do_eval=True,
    )

    # Only the three text columns are passed to the trainer: question text,
    # labelled misconception name, retrieved misconception name.
    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset.select_columns(["AllText", "MisconceptionName", "PredictMisconceptionName"]),
        # train_dataset=train_dataset.select_columns(
        #     ["AllText", "MisconceptionName_with_SubjectNames", "PredictMisconceptionName_with_SubjectNames"]
        # ),
        eval_dataset=valid_dataset.select_columns(["AllText", "MisconceptionName", "PredictMisconceptionName"]),
        loss=loss,
    )

    trainer.train()
    # Save the fold's final model without generating a model card.
    model.save_pretrained(save_dir, create_model_card=False)

    # Evaluation: re-run retrieval on the validation fold with the fine-tuned model.

    valid_long = train_long[valid_idx]
    # Embed the question texts.
    valid_long_vec = model.encode(valid_long["AllText"].to_list(), normalize_embeddings=True)
    # Embed the misconception names.
    misconception_mapping_vec = model.encode(mapping["MisconceptionName"].to_list(), normalize_embeddings=True)
    # Cosine similarity between every question and every misconception.
    valid_cos_sim_arr = cosine_similarity(valid_long_vec, misconception_mapping_vec)
    # Rank misconceptions from most to least similar.
    valid_sorted_indices = np.argsort(-valid_cos_sim_arr, axis=1)
    # Attach the top cfg.k ranked entries to each question.
    # NOTE(review): these are row positions in `mapping` used as misconception ids,
    # which assumes `mapping` is ordered with ids 0..N-1 -- confirm.
    valid_long = valid_long.with_columns(
        pl.Series(valid_sorted_indices[:, : cfg.k].tolist()).alias("PredictMisconceptionId")
    )

    # Each row has exactly one labelled misconception, wrapped in a list for the metric helpers.
    actual_misconception_ids = [[mis_id] for mis_id in valid_long["MisconceptionId"].to_list()]
    predicted_misconception_ids = valid_long["PredictMisconceptionId"].to_list()
    # map@25
    map_score = mapk(actual_misconception_ids, predicted_misconception_ids, k=25)
    map_scores.append(map_score)

    # recall@25
    recall_score = recall_at_k(actual_misconception_ids, predicted_misconception_ids, k=25)
    recall_scores.append(recall_score)

    print(f"\n================ fold{i+1} result================\n")
    print(f"map@25: {map_score}")
    print(f"recall@25: {recall_score}")


# Cross-validation summary: unweighted mean of the per-fold scores.
print("\n================CV result================\n")
print(f"map@25: {np.mean(map_scores)}")
print(f"recall@25: {np.mean(recall_scores)}")




Directory created: ../../results/002/20241115_190315/fold1
BAAI/bge-large-en-v1.5のfine-tuningを開始します。(1/5fold)




Step,Training Loss,Validation Loss
135,1.6803,1.686817


Error while generating model card:                                   
Traceback (most recent call last):
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py", line 1233, in _create_model_card
    model_card = generate_model_card(self)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/sentence_transformers/model_card.py", line 962, in generate_model_card
    model_card = ModelCard.from_template(card_data=model.model_card_data, template_path=template_path, hf_emoji="🤗")
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/huggingface_hub/repocard.py", line 416, in from_template
    return super().from_template(card_data, template_path, template_str, **template_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

### HuggingFaceにpush

In [9]:
# Push every fold's fine-tuned model to the Hugging Face Hub.
from huggingface_hub import HfFolder

# Take the access token from the HF_TOKEN environment variable instead of
# hardcoding it in the notebook. When the variable is unset, nothing is written,
# so an existing login is left untouched rather than overwritten by a placeholder.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    HfFolder.save_token(hf_token)

# A Hub repo id is "<namespace>/<name>" with exactly one slash. The base model id
# (e.g. "BAAI/bge-large-en-v1.5") contains its own slash, so flatten it before
# embedding it in the repo name.
hub_model_name = cfg.model.model_name.replace("/", "-")

# Push each fold's model (folds 1 through 5, matching the 5-fold CV above).
for fold in range(1, 6):
    save_dir = os.path.join(cfg.data.results_path, f"fold{fold}")

    # Load the model saved for this fold.
    model = SentenceTransformer(save_dir)

    # Upload it to the Hub.
    model.push_to_hub(
        f"marumarukun/{hub_model_name}_fine_tuned_fold{fold}_{cfg.run_time}",
        commit_message=f"Add fold{fold} SentenceTransformer model",
    )
    print(f"Fold {fold} モデルのプッシュが完了しました。")
