In [1]:
import pickle
import pprint
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pytz
import seaborn as sns
from omegaconf import OmegaConf
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import GroupKFold

from src.config import cfg
from src.data import add_subject_name_info, preprocess_train
from src.dir import create_dir
from src.seed import seed_everything

cfg.exp_number = Path().resolve().name
print(OmegaConf.to_yaml(cfg, resolve=True))

seed_everything(cfg.seed)


  from tqdm.autonotebook import tqdm, trange


exp_number: '000'
run_time: base
data:
  input_root: ../../data/input
  train_path: ../../data/input/train.csv
  test_path: ../../data/input/test.csv
  sample_submission_path: ../../data/input/sample_submission.csv
  mapping_path: ../../data/input/misconception_mapping.csv
  output_root: ../../data/output
  results_root: ../../results
  results_path: ../../results/000/base
seed: 42
k: 50



### データの読み込み

In [2]:
# データの読み込み
train_df = pl.read_csv(cfg.data.train_path, try_parse_dates=True)
test_df = pl.read_csv(cfg.data.test_path, try_parse_dates=True)
sample_submission_df = pl.read_csv(cfg.data.sample_submission_path, try_parse_dates=True)
mapping_df = pl.read_csv(cfg.data.mapping_path, try_parse_dates=True)

# CV
gkf = GroupKFold(n_splits=5)


In [3]:
# 比較したい埋め込みモデルをここに追加していく(MTEBランクは2024/11/09時点)
model_names = [
    "../../data/Joseph-Eedi-finetuned-bge",  # kaggle codeから拝借してきたモデル
    # https://www.kaggle.com/code/takanashihumbert/eedi-qwen-2-5-32b-awq-two-time-retrieval
    "BAAI/bge-large-en-v1.5",  # MTEB rank: 42, Model size: 335(Million parameters)
    "dunzhang/stella_en_400M_v5",  # MTEB rank: 6, Model size: 435(Million parameters)
    # "dunzhang/stella_en_1.5B_v5",  # MTEB rank: 3, Model size: 1543(Million parameters)
    "Alibaba-NLP/gte-large-en-v1.5",  # MTEB rank: 28, Model size: 434(Million parameters)
    "jinaai/jina-embeddings-v3",  # MTEB rank: 25, Model size: 572(Million parameters)
]
task = "text-matching"  # jina-embeddings-v3にはtaskが必要そう

# NOTE: ローカルでは動作しないので、stella_en_1.5B_v5は一旦除外


# # modelのロードと埋め込みができるか試す
# for model_name in model_names:
#     model = SentenceTransformer(model_name, trust_remote_code=True)
#     print(f"モデル: {model_name} ロードOK")
#     embed_trial = model.encode(train_df["SubjectName"].to_list()[:5], normalize_embeddings=True)
#     print(f"{model_name} 埋め込みテストOK\n")


In [5]:
print(f"topk: {cfg.k}")


topk: 50


In [4]:
# 埋め込みモデルの比較をCVで行う

# QuestionIdでGroupKFold
for model_name in model_names:
    print(f"モデル: {model_name}")

    model = SentenceTransformer(model_name, trust_remote_code=True)

    cv_scores = []
    for i, (train_idx, valid_idx) in enumerate(gkf.split(train_df, groups=train_df["QuestionId"])):
        # train_dfの分割
        train = train_df[train_idx]
        valid = train_df[valid_idx]

        # trainのSubjectName情報をmapping_dfに追加
        mapping_meta = add_subject_name_info(train, mapping_df)

        # trainの前処理
        train_long = preprocess_train(train)

        # 埋め込みモデルでベクトル化
        train_long_embed = model.encode(train_long["AllText"].to_list(), normalize_embeddings=True)
        misconception_vec = model.encode(
            mapping_meta["MisconceptionName_with_SubjectNames"].to_list(), normalize_embeddings=True
        )
        # jina-embeddings-v3の場合のみtaskを指定
        if model_name == "jinaai/jina-embeddings-v3":
            train_long_embed = model.encode(
                train_long["AllText"].to_list(), task=task, prompt_name=task, normalize_embeddings=True
            )
            misconception_vec = model.encode(
                mapping_meta["MisconceptionName_with_SubjectNames"].to_list(),
                task=task,
                prompt_name=task,
                normalize_embeddings=True,
            )

        # 埋め込みからTOPkを抽出
        topkids = util.semantic_search(train_long_embed, misconception_vec, top_k=cfg.k)

        # topkidsからk個のcorpus_id(=misconception_id)を抽出
        topkids_lists = [[qid["corpus_id"] for qid in topkids] for topkids in topkids]
        # gt_misconception_idを抽出
        gt_misconception_ids = train_long["MisconceptionId"].to_list()

        # 抽出したTOPkの中にgt_misconception_idがあるか確認
        is_gt_in_topk = []
        for topkids_list, gt_misconception_id in zip(topkids_lists, gt_misconception_ids, strict=True):
            if gt_misconception_id in topkids_list:
                is_gt_in_topk.append(True)
            else:
                is_gt_in_topk.append(False)

        # 平均をとってCVスコアとする
        avg_score = np.mean(is_gt_in_topk)
        cv_scores.append(avg_score)
        print(f"Fold {i+1}: {avg_score}")

    print(f"CVスコア: {np.mean(cv_scores)}\n")


モデル: ../../data/Joseph-Eedi-finetuned-bge
Fold 1: 0.919327251995439
Fold 2: 0.9218213058419243
Fold 3: 0.9251934651762683
Fold 4: 0.9172867773325701
Fold 5: 0.9156419788390049
CVスコア: 0.9198541558370413

モデル: BAAI/bge-large-en-v1.5
Fold 1: 0.8563283922462942
Fold 2: 0.8528064146620847
Fold 3: 0.8512467755803955
Fold 4: 0.8611906124785347
Fold 5: 0.8613096940234487
CVスコア: 0.8565763777981517

モデル: dunzhang/stella_en_400M_v5


  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fold 1: 0.9005131128848347
Fold 2: 0.9003436426116839
Fold 3: 0.8942390369733448
Fold 4: 0.8995420721236406
Fold 5: 0.9047755218758936
CVスコア: 0.8998826772938795

モデル: Alibaba-NLP/gte-large-en-v1.5
Fold 1: 0.8717217787913341
Fold 2: 0.8671248568155785
Fold 3: 0.8641444539982803
Fold 4: 0.8709215798511735
Fold 5: 0.8713182728052616
CVスコア: 0.8690461884523255

モデル: jinaai/jina-embeddings-v3


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Fold 1: 0.8985176738882554
Fold 2: 0.9029209621993127
Fold 3: 0.8939524218973918
Fold 4: 0.9009730967372639
Fold 5: 0.9079210752073206
CVスコア: 0.9008570459859089



In [6]:
# 埋め込みモデルの比較をCVで行う

cfg.k = 75
print(f"topk: {cfg.k}")

# QuestionIdでGroupKFold
for model_name in model_names:
    print(f"モデル: {model_name}")

    model = SentenceTransformer(model_name, trust_remote_code=True)

    cv_scores = []
    for i, (train_idx, valid_idx) in enumerate(gkf.split(train_df, groups=train_df["QuestionId"])):
        # train_dfの分割
        train = train_df[train_idx]
        valid = train_df[valid_idx]

        # trainのSubjectName情報をmapping_dfに追加
        mapping_meta = add_subject_name_info(train, mapping_df)

        # trainの前処理
        train_long = preprocess_train(train)

        # 埋め込みモデルでベクトル化
        train_long_embed = model.encode(train_long["AllText"].to_list(), normalize_embeddings=True)
        misconception_vec = model.encode(
            mapping_meta["MisconceptionName_with_SubjectNames"].to_list(), normalize_embeddings=True
        )
        # jina-embeddings-v3の場合のみtaskを指定
        if model_name == "jinaai/jina-embeddings-v3":
            train_long_embed = model.encode(
                train_long["AllText"].to_list(), task=task, prompt_name=task, normalize_embeddings=True
            )
            misconception_vec = model.encode(
                mapping_meta["MisconceptionName_with_SubjectNames"].to_list(),
                task=task,
                prompt_name=task,
                normalize_embeddings=True,
            )

        # 埋め込みからTOPkを抽出
        topkids = util.semantic_search(train_long_embed, misconception_vec, top_k=cfg.k)

        # topkidsからk個のcorpus_id(=misconception_id)を抽出
        topkids_lists = [[qid["corpus_id"] for qid in topkids] for topkids in topkids]
        # gt_misconception_idを抽出
        gt_misconception_ids = train_long["MisconceptionId"].to_list()

        # 抽出したTOPkの中にgt_misconception_idがあるか確認
        is_gt_in_topk = []
        for topkids_list, gt_misconception_id in zip(topkids_lists, gt_misconception_ids, strict=True):
            if gt_misconception_id in topkids_list:
                is_gt_in_topk.append(True)
            else:
                is_gt_in_topk.append(False)

        # 平均をとってCVスコアとする
        avg_score = np.mean(is_gt_in_topk)
        cv_scores.append(avg_score)
        print(f"Fold {i+1}: {avg_score}")

    print(f"CVスコア: {np.mean(cv_scores)}\n")


topk: 75
モデル: ../../data/Joseph-Eedi-finetuned-bge
Fold 1: 0.9466932725199544
Fold 2: 0.9455899198167239
Fold 3: 0.9498423617082259
Fold 4: 0.9419004006868918
Fold 5: 0.9453817557906777
CVスコア: 0.9458815421044948

モデル: BAAI/bge-large-en-v1.5
Fold 1: 0.9025085518814139
Fold 2: 0.8966208476517755
Fold 3: 0.8965319575809687
Fold 4: 0.9024041213508872
Fold 5: 0.9082070346010866
CVスコア: 0.9012545026132264

モデル: dunzhang/stella_en_400M_v5


Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fold 1: 0.9370011402508552
Fold 2: 0.9375715922107675
Fold 3: 0.9314989968472341
Fold 4: 0.9419004006868918
Fold 5: 0.9370889333714613
CVスコア: 0.9370122126734419

モデル: Alibaba-NLP/gte-large-en-v1.5
Fold 1: 0.9124857468643102
Fold 2: 0.9115120274914089
Fold 3: 0.9122957867583835
Fold 4: 0.9158557527189468
Fold 5: 0.9190734915641979
CVスコア: 0.9142445610794494

モデル: jinaai/jina-embeddings-v3


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Fold 1: 0.9324401368301026
Fold 2: 0.9338487972508591
Fold 3: 0.9286328460877042
Fold 4: 0.9298797939324557
Fold 5: 0.9425221618530168
CVスコア: 0.9334647471908276



### アンサンブルすると精度上がるか検証してみる

In [None]:
# TODO: アンサンブルすると精度上がるか検証してみる
