In [1]:
import gc
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import polars as pl
import pytz
import torch
from datasets import Dataset
from omegaconf import OmegaConf
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import StratifiedGroupKFold
from transformers.trainer_utils import get_last_checkpoint  # 最新のチェックポイントのパスを取得する関数

from src.config import cfg
from src.data import add_subject_name_info, preprocess_train
from src.dir import create_dir
from src.metric import mapk, recall_at_k
from src.seed import seed_everything

cfg.exp_number = Path().resolve().name
print(OmegaConf.to_yaml(cfg, resolve=True))

seed_everything(cfg.seed)
pl.Config.set_fmt_str_lengths(1000)


  from .autonotebook import tqdm as notebook_tqdm


exp_number: '002'
run_time: base
data:
  input_root: ../../data/input
  train_path: ../../data/input/train.csv
  test_path: ../../data/input/test.csv
  sample_submission_path: ../../data/input/sample_submission.csv
  mapping_path: ../../data/input/misconception_mapping.csv
  mapping_meta_path: ../../data/input/mapping_meta.parquet
  output_root: ../../data/output
  results_root: ../../results
  results_path: ../../results/002/base
seed: 42
k: 25
n_splits: 4
model:
  model_name: BAAI/bge-large-en-v1.5
  epoch: 2
  lr: 2.0e-05
  batch_size: 8



polars.config.Config

In [2]:
DEBUG = False
FP = False if torch.cuda.is_bf16_supported() else True
BF = True if torch.cuda.is_bf16_supported() else False
print(f"{torch.cuda.is_bf16_supported()=}")


torch.cuda.is_bf16_supported()=True


### Data Load

In [4]:
# データの読み込み
train = pl.read_csv(cfg.data.train_path, try_parse_dates=True)
test = pl.read_csv(cfg.data.test_path, try_parse_dates=True)
sample_submission = pl.read_csv(cfg.data.sample_submission_path, try_parse_dates=True)
mapping = pl.read_csv(cfg.data.mapping_path, try_parse_dates=True)

# CV
gkf = StratifiedGroupKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed)


In [5]:
# trainの前処理
train_long = preprocess_train(train)
train_long.head(3)


QuestionId,ConstructName,SubjectName,QuestionText,CorrectAnswer,AnswerType,AnswerText,AllText,AnswerAlphabet,QuestionId_Answer,MisconceptionId
i64,str,str,str,str,str,str,str,str,str,i64
0,"""Use the order of operations to carry out calculations involving powers""","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do the brackets need to go to make the answer equal \( 13 \) ?""","""A""","""AnswerDText""","""Does not need brackets""","""ConstructName: Use the order of operations to carry out calculations involving powers SubjectName: BIDMAS QuestionText: \[ 3 \times 2+4-5 \] Where do the brackets need to go to make the answer equal \( 13 \) ? AnswerText: Does not need brackets""","""D""","""0_D""",1672
1000,"""Simplify an algebraic fraction by factorising the numerator""","""Simplifying Algebraic Fractions""","""Simplify the following, if possible: \( \frac{1-t}{t-1} \)""","""B""","""AnswerAText""","""\( t \)""","""ConstructName: Simplify an algebraic fraction by factorising the numerator SubjectName: Simplifying Algebraic Fractions QuestionText: Simplify the following, if possible: \( \frac{1-t}{t-1} \) AnswerText: \( t \)""","""A""","""1000_A""",891
1000,"""Simplify an algebraic fraction by factorising the numerator""","""Simplifying Algebraic Fractions""","""Simplify the following, if possible: \( \frac{1-t}{t-1} \)""","""B""","""AnswerCText""","""\( 1 \)""","""ConstructName: Simplify an algebraic fraction by factorising the numerator SubjectName: Simplifying Algebraic Fractions QuestionText: Simplify the following, if possible: \( \frac{1-t}{t-1} \) AnswerText: \( 1 \)""","""C""","""1000_C""",891


In [6]:
# 下記の処理はなしでいいかも

# # mappingにSubjectNameの情報を追加
# mapping = add_subject_name_info(train, mapping)

# mapping.head()

# # NOTE: submit時は下記のようにtestの情報も使う → これでCVによる学習時と同じ条件になる
# # train_test = pl.concat([train, test], how="diagonal")
# # mapping = add_subject_name_info(train_test, mapping)


### Make retrieval data

In [7]:
def make_retrieval_data(train_long, mapping, model, k):
    # 問題文をベクトル化
    train_long_vec = model.encode(train_long["AllText"].to_list(), normalize_embeddings=True)

    # 誤概念をベクトル化
    misconception_mapping_vec = model.encode(mapping["MisconceptionName"].to_list(), normalize_embeddings=True)
    # misconception_mapping_vec = model.encode(
    #     mapping["MisconceptionName_with_SubjectNames"].to_list(), normalize_embeddings=True
    # )

    # 問題文と誤概念のコサイン類似度を計算
    train_cos_sim_arr = cosine_similarity(train_long_vec, misconception_mapping_vec)
    # コサイン類似度が高い順にソート
    train_sorted_indices = np.argsort(-train_cos_sim_arr, axis=1)

    # 各問題に対してk個の予測誤概念IDを追加
    train_long = train_long.with_columns(
        pl.Series(train_sorted_indices[:, :k].tolist()).alias("PredictMisconceptionId")
    )

    # 予測誤概念の情報を結合
    train_retrieved = (
        # 予測誤概念IDリストを展開
        train_long.explode("PredictMisconceptionId")
        # 正解の誤概念情報を結合
        .join(mapping, on="MisconceptionId")
        # 予測の誤概念情報を結合(カラム名に"Predict"を付与)
        .join(mapping.rename(lambda x: "Predict" + x), on="PredictMisconceptionId")
        # 正解と予測が一致する行を削除
        .filter(pl.col("MisconceptionId") != pl.col("PredictMisconceptionId"))
    )

    return train_retrieved


### Fine-tuning

In [8]:
# 実験結果格納用のディレクトリを作成
japan_tz = pytz.timezone("Asia/Tokyo")
cfg.run_time = datetime.now(japan_tz).strftime("%Y%m%d_%H%M%S")

map_scores = []
recall_scores = []

for i, (train_idx, valid_idx) in enumerate(
    gkf.split(train_long, groups=train_long["QuestionId"], y=train_long["SubjectName"])
):
    save_dir = os.path.join(cfg.data.results_path, f"fold{i+1}")
    create_dir(save_dir)

    model = SentenceTransformer(cfg.model.model_name)

    train_retrieved = make_retrieval_data(train_long[train_idx], mapping, model, cfg.k)
    valid_retrieved = make_retrieval_data(train_long[valid_idx], mapping, model, cfg.k)
    train_dataset = Dataset.from_polars(train_retrieved)
    valid_dataset = Dataset.from_polars(valid_retrieved)
    if DEBUG:
        train_dataset = train_dataset.select(range(50))
        valid_dataset = valid_dataset.select(range(50))
    loss = MultipleNegativesRankingLoss(model)

    print(f"{cfg.model.model_name}のfine-tuningを開始します。({i+1}/{gkf.n_splits}fold)")

    args = SentenceTransformerTrainingArguments(
        # Required parameter:
        output_dir=save_dir,
        # Optional training parameters:
        num_train_epochs=cfg.model.epoch,
        per_device_train_batch_size=cfg.model.batch_size,
        gradient_accumulation_steps=128 // cfg.model.batch_size,
        per_device_eval_batch_size=cfg.model.batch_size,
        eval_accumulation_steps=128 // cfg.model.batch_size,
        learning_rate=cfg.model.lr,
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=FP,  # Set to False if you get an error that your GPU can't run on FP16
        bf16=BF,  # Set to True if you have a GPU that supports BF16
        batch_sampler=BatchSamplers.NO_DUPLICATES,  # no duplicate samples in a batch
        # Optional tracking/debugging parameters:
        lr_scheduler_type="cosine_with_restarts",
        evaluation_strategy="steps",
        eval_steps=0.1,
        save_strategy="steps",
        save_steps=0.1,
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        logging_steps=100,
        # report_to=REPORT_TO,  # Will be used in W&B if `wandb` is installed
        # run_name=EXP_NAME,
        do_eval=True,
    )

    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset.select_columns(["AllText", "MisconceptionName", "PredictMisconceptionName"]),
        # train_dataset=train_dataset.select_columns(
        #     ["AllText", "MisconceptionName_with_SubjectNames", "PredictMisconceptionName_with_SubjectNames"]
        # ),
        eval_dataset=valid_dataset.select_columns(["AllText", "MisconceptionName", "PredictMisconceptionName"]),
        loss=loss,
    )

    trainer.train()
    model.save_pretrained(save_dir, create_model_card=False)

    # 評価

    valid_long = train_long[valid_idx]
    # 問題文をベクトル化
    valid_long_vec = model.encode(valid_long["AllText"].to_list(), normalize_embeddings=True)
    # 誤概念をベクトル化
    misconception_mapping_vec = model.encode(mapping["MisconceptionName"].to_list(), normalize_embeddings=True)
    # 問題文と誤概念のコサイン類似度を計算
    valid_cos_sim_arr = cosine_similarity(valid_long_vec, misconception_mapping_vec)
    # コサイン類似度が高い順にソート
    valid_sorted_indices = np.argsort(-valid_cos_sim_arr, axis=1)
    # 各問題に対してk個の予測誤概念IDを追加
    valid_long = valid_long.with_columns(
        pl.Series(valid_sorted_indices[:, : cfg.k].tolist()).alias("PredictMisconceptionId")
    )

    actual_misconception_ids = [[mis_id] for mis_id in valid_long["MisconceptionId"].to_list()]
    predicted_misconception_ids = valid_long["PredictMisconceptionId"].to_list()
    # map@25
    map_score = mapk(actual_misconception_ids, predicted_misconception_ids, k=25)
    map_scores.append(map_score)

    # recall@25
    recall_score = recall_at_k(actual_misconception_ids, predicted_misconception_ids, k=25)
    recall_scores.append(recall_score)

    print(f"\n================ fold{i+1} result================\n")
    print(f"map@25: {map_score}")
    print(f"recall@25: {recall_score}")

    del model
    gc.collect()
    torch.cuda.empty_cache()


print("\n================CV result================\n")
print(f"map@25: {np.mean(map_scores)}")
print(f"recall@25: {np.mean(recall_scores)}")




Directory created: ../../results/002/20241115_191836/fold1
BAAI/bge-large-en-v1.5のfine-tuningを開始します。(1/4fold)




Step,Training Loss,Validation Loss
125,1.5914,1.645703
250,1.419,1.436509
375,1.4476,1.41614
500,1.2934,1.314739
625,1.2602,1.224498
750,1.0361,1.280077
875,0.8217,1.191153
1000,0.8105,1.143917
1125,0.7426,1.127625


Error while generating model card:                                   
Traceback (most recent call last):
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py", line 1233, in _create_model_card
    model_card = generate_model_card(self)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/sentence_transformers/model_card.py", line 962, in generate_model_card
    model_card = ModelCard.from_template(card_data=model.model_card_data, template_path=template_path, hf_emoji="🤗")
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/huggingface_hub/repocard.py", line 416, in from_template
    return super().from_template(card_data, template_path, template_str, **template_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



map@25: 0.3591846734674502
recall@25: 0.8057813911472448
Directory created: ../../results/002/20241115_191836/fold2
BAAI/bge-large-en-v1.5のfine-tuningを開始します。(2/4fold)




Step,Training Loss,Validation Loss
126,1.6878,1.664911
252,1.3839,1.603054
378,1.4478,1.465591
504,1.2594,1.3839
630,1.2045,1.339231
756,1.1256,1.279923
882,0.7707,1.25846
1008,0.7193,1.248567
1134,0.7318,1.218055


Error while generating model card:                                   
Traceback (most recent call last):
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py", line 1233, in _create_model_card
    model_card = generate_model_card(self)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/sentence_transformers/model_card.py", line 962, in generate_model_card
    model_card = ModelCard.from_template(card_data=model.model_card_data, template_path=template_path, hf_emoji="🤗")
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/huggingface_hub/repocard.py", line 416, in from_template
    return super().from_template(card_data, template_path, template_str, **template_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



map@25: 0.33887390478973967
recall@25: 0.7511520737327189
Directory created: ../../results/002/20241115_191836/fold3
BAAI/bge-large-en-v1.5のfine-tuningを開始します。(3/4fold)




Step,Training Loss,Validation Loss
126,1.6726,1.7071
252,1.4081,1.405868
378,1.4315,1.436536
504,1.2901,1.281688
630,1.2127,1.230119
756,1.1783,1.216008
882,0.8434,1.234019
1008,0.7499,1.191216
1134,0.7636,1.1662


Error while generating model card:                                   
Traceback (most recent call last):
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py", line 1233, in _create_model_card
    model_card = generate_model_card(self)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/sentence_transformers/model_card.py", line 962, in generate_model_card
    model_card = ModelCard.from_template(card_data=model.model_card_data, template_path=template_path, hf_emoji="🤗")
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/kaggle_eedi/.venv/lib/python3.12/site-packages/huggingface_hub/repocard.py", line 416, in from_template
    return super().from_template(card_data, template_path, template_str, **template_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



map@25: 0.3425523545993542
recall@25: 0.7604070305272895
Directory created: ../../results/002/20241115_191836/fold4
BAAI/bge-large-en-v1.5のfine-tuningを開始します。(4/4fold)




Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 22.15 GiB of which 16.00 MiB is free. Process 3759331 has 22.13 GiB memory in use. Of the allocated memory 21.65 GiB is allocated by PyTorch, and 21.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### HuggingFaceにpush

In [17]:
# huggingfaceにpush
from huggingface_hub import HfFolder

# Hugging Faceのトークンを設定
HfFolder.save_token("your_huggingface_token")

# 各foldのモデルをプッシュ
for i in range(cfg.n_splits):
    fold = i + 1
    save_dir = os.path.join(cfg.data.results_path, f"fold{fold}")

    # 保存されたモデルを読み込み
    model = SentenceTransformer(save_dir)

    # モデル名からスラッシュを削除
    model_name = cfg.model.model_name.replace("/", "-")

    # モデルをHugging Faceにpush
    model.push_to_hub(
        f"marumarukun/{model_name}_fine_tuned_fold{fold}_{cfg.run_time}",
        commit_message=f"Add fold{fold} SentenceTransformer model",
    )
    print(f"Fold {fold} モデルのプッシュが完了しました。")


model.safetensors: 100%|██████████| 1.34G/1.34G [04:01<00:00, 5.56MB/s] 


Fold 1 モデルのプッシュが完了しました。


model.safetensors: 100%|██████████| 1.34G/1.34G [00:41<00:00, 32.3MB/s]


Fold 2 モデルのプッシュが完了しました。


model.safetensors: 100%|██████████| 1.34G/1.34G [00:41<00:00, 32.0MB/s]
No sentence-transformers model found with name ../../results/002/20241115_191836/fold4. Creating a new one with mean pooling.


Fold 3 モデルのプッシュが完了しました。


ValueError: Unrecognized model in ../../results/002/20241115_191836/fold4. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glpn, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, graphormer, grounding-dino, groupvit, hiera, hubert, ibert, idefics, idefics2, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, time_series_transformer, timesformer, timm_backbone, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zoedepth