In [1]:
import re
from pathlib import Path

import polars as pl
import vllm
from omegaconf import OmegaConf
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import GroupKFold
from transformers import AutoTokenizer

from src.config import cfg
from src.data import add_subject_name_info, preprocess_train
from src.prompt import create_prompt
from src.seed import seed_everything

cfg.exp_number = Path().resolve().name
print(OmegaConf.to_yaml(cfg, resolve=True))

seed_everything(cfg.seed)

pl.Config.set_fmt_str_lengths(100000)


  from .autonotebook import tqdm as notebook_tqdm


INFO 11-10 21:35:40 importing.py:10] Triton not installed; certain GPU-related functions will not be available.


2024-11-10 21:35:40,927	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


exp_number: '001'
run_time: base
data:
  input_root: ../../data/input
  train_path: ../../data/input/train.csv
  test_path: ../../data/input/test.csv
  sample_submission_path: ../../data/input/sample_submission.csv
  mapping_path: ../../data/input/misconception_mapping.csv
  mapping_meta_path: ../../data/input/mapping_meta.parquet
  output_root: ../../data/output
  results_root: ../../results
  results_path: ../../results/001/base
seed: 42
embed_model: BAAI/bge-large-en-v1.5
k: 50
llm_model: Qwen/Qwen2.5-32B-Instruct-AWQ



polars.config.Config

### 準備

In [2]:
# データの読み込み
train_df = pl.read_csv(cfg.data.train_path, try_parse_dates=True)
test_df = pl.read_csv(cfg.data.test_path, try_parse_dates=True)
sample_submission_df = pl.read_csv(cfg.data.sample_submission_path, try_parse_dates=True)
mapping_df = pl.read_csv(cfg.data.mapping_path, try_parse_dates=True)

# CV
gkf = GroupKFold(n_splits=5)


In [3]:
# 埋め込みモデル
model = SentenceTransformer(cfg.embed_model, trust_remote_code=True)


In [4]:
# # llmの準備
# llm = vllm.LLM(
#     cfg.llm_model,
#     quantization="awq",
#     tensor_parallel_size=1,
#     gpu_memory_utilization=0.90,
#     trust_remote_code=True,
#     dtype="half",
#     enforce_eager=True,
#     max_model_len=3824,
#     disable_log_stats=True,
# )
# tokenizer = llm.get_tokenizer()


In [5]:
# tokenizerを準備
tokenizer = AutoTokenizer.from_pretrained(cfg.llm_model)


In [6]:
for train_idx, valid_idx in gkf.split(train_df, groups=train_df["QuestionId"]):
    # train_dfの分割
    train = train_df[train_idx]
    valid = train_df[valid_idx]

    # trainのSubjectName情報をmapping_dfに追加
    mapping_meta = add_subject_name_info(train, mapping_df)

    # trainの前処理
    train_long = preprocess_train(train)

    # 埋め込みモデルでベクトル化（1st stage）
    train_long_embed = model.encode(train_long["AllText"].to_list(), normalize_embeddings=True)
    misconception_vec = model.encode(
        mapping_meta["MisconceptionName_with_SubjectNames"].to_list(), normalize_embeddings=True
    )

    # 埋め込みからTOPkを抽出
    topk_ids = util.semantic_search(train_long_embed, misconception_vec, top_k=cfg.k)

    # LLMによる絞り込み(2nd stage)

    # promptを作成
    train_long = create_prompt(topk_ids, mapping_meta, train_long, tokenizer, cfg.k)
    break
train_long


In [7]:
train_long.head()


AttributeError: 'NoneType' object has no attribute 'head'