In [2]:
import pickle
import pprint
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pytz
import seaborn as sns
from omegaconf import OmegaConf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from src.config import cfg
from src.dir import create_dir
from src.seed import seed_everything

cfg.exp_number = Path().resolve().name
print(OmegaConf.to_yaml(cfg, resolve=True))

seed_everything(cfg.seed)


exp_number: '000'
run_time: base
data:
  input_root: ../../data/input
  train_path: ../../data/input/train.csv
  test_path: ../../data/input/test.csv
  sample_submission_path: ../../data/input/sample_submission.csv
  mapping_path: ../../data/input/misconception_mapping.csv
  output_root: ../../data/output
  results_root: ../../results
  results_path: ../../results/000/base
seed: 42



### データの読み込み

In [3]:
# データの読み込み
train_df = pl.read_csv(cfg.data.train_path, try_parse_dates=True)
test_df = pl.read_csv(cfg.data.test_path, try_parse_dates=True)
sample_submission_df = pl.read_csv(cfg.data.sample_submission_path, try_parse_dates=True)
mapping_df = pl.read_csv(cfg.data.mapping_path, try_parse_dates=True)

# # データの結合
# train_test_df = pl.concat([train_df, test_df], how="diagonal")


### vllm trial

In [14]:
from vllm import LLM, SamplingParams

llm = LLM(model="elyza/Llama-3-ELYZA-JP-8B-AWQ", quantization="awq")
tokenizer = llm.get_tokenizer()


  from .autonotebook import tqdm as notebook_tqdm
2024-11-05 19:13:17,710	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 11-05 19:13:21 awq_marlin.py:101] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 11-05 19:13:21 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='elyza/Llama-3-ELYZA-JP-8B-AWQ', speculative_config=None, tokenizer='elyza/Llama-3-ELYZA-JP-8B-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, c

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:04<00:04,  4.68s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:05<00:00,  2.60s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:05<00:00,  2.91s/it]


INFO 11-05 19:13:29 model_runner.py:1067] Loading model weights took 5.3440 GB





INFO 11-05 19:13:32 gpu_executor.py:122] # GPU blocks: 1858, # CPU blocks: 2048
INFO 11-05 19:13:32 gpu_executor.py:126] Maximum concurrency for 8192 tokens per request: 3.63x
INFO 11-05 19:13:33 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-05 19:13:33 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-05 19:13:49 model_runner.py:1523] Graph capturing finished in 16 secs.


In [27]:
DEFAULT_SYSTEM_PROMPT = "あなたは日本人のアシスタントで石川県のグルメに精通しています。"
text = """石川県のご当地グルメを下記の選択肢をおすすめ度の高いものから順位付けしてください。
・おでん
・おひろし
・おばら
・おばんざい

### 回答フォーマット
1. おでん
2. おでん
3. おでん
4. おでん
"""

messages = [
    {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
    {"role": "user", "content": text},
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

sampling_params = SamplingParams(
    # n=1,  # Number of output sequences to return for each prompt.
    top_p=0.8,  # Float that controls the cumulative probability of the top tokens to consider.
    temperature=0,  # randomness of the sampling
    seed=777,  # Seed for reprodicibility
    skip_special_tokens=False,  # Whether to skip special tokens in the output.
    max_tokens=512,  # Maximum number of tokens to generate per output sequence.
)

outputs = llm.generate(prompt, sampling_params)


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s, est. speed input: 158.44 toks/s, output: 71.30 toks/s]


In [28]:
print(outputs[0].outputs[0].text)


石川県のご当地グルメを、選択肢をおすすめ度の高いものから順位付けします。

1. おばら
2. おばんざい
3. おひろし
4. おでん


In [38]:
# Qwen2.5-32B-Instruct-AWQ trial --> OOMで実行できず（もっとVRAMの大きいGPUが必要）
model_name = "Qwen/Qwen2.5-32B-Instruct-AWQ"

llm = LLM(
    model=model_name,
    quantization="awq",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.90,
    trust_remote_code=True,
    dtype="half",
    enforce_eager=True,
    max_model_len=5120,
    disable_log_stats=True,
)
tokenizer = llm.get_tokenizer()


INFO 11-03 09:22:18 awq_marlin.py:101] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 11-03 09:22:18 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='Qwen/Qwen2.5-32B-Instruct-AWQ', speculative_config=None, tokenizer='Qwen/Qwen2.5-32B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=5120, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, col

Loading safetensors checkpoint shards:   0% Completed | 0/5 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  20% Completed | 1/5 [00:06<00:27,  6.82s/it]
Loading safetensors checkpoint shards:  40% Completed | 2/5 [00:10<00:15,  5.21s/it]
Loading safetensors checkpoint shards:  60% Completed | 3/5 [00:14<00:08,  4.30s/it]
Loading safetensors checkpoint shards:  80% Completed | 4/5 [00:18<00:04,  4.23s/it]
Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:23<00:00,  4.52s/it]
Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:23<00:00,  4.66s/it]



INFO 11-03 12:41:58 model_runner.py:1067] Loading model weights took 18.1449 GB
INFO 11-03 12:57:02 gpu_executor.py:122] # GPU blocks: 2508, # CPU blocks: 1024
INFO 11-03 12:57:02 gpu_executor.py:126] Maximum concurrency for 5120 tokens per request: 7.84x


OutOfMemoryError: CUDA out of memory. Tried to allocate 158.00 MiB. GPU 0 has a total capacity of 11.99 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 34.01 GiB is allocated by PyTorch, with 269.66 MiB allocated in private pools (e.g., CUDA Graphs), and 148.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### 埋め込みモデル trial

In [29]:
# stella_en_400M_v5

from sentence_transformers import SentenceTransformer

# This model supports two prompts: "s2p_query" and "s2s_query" for sentence-to-passage and sentence-to-sentence tasks, respectively.
# They are defined in `config_sentence_transformers.json`
query_prompt_name = "s2p_query"
queries = [
    "What are some ways to reduce stress?",
    "What are the benefits of drinking green tea?",
]
# docs do not need any prompts
docs = [
    "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.",
    "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.",
]

# ！The default dimension is 1024, if you need other dimensions, please clone the model and modify `modules.json` to replace `2_Dense_1024` with another dimension, e.g. `2_Dense_256` or `2_Dense_8192` !
# on gpu
model = SentenceTransformer("dunzhang/stella_en_400M_v5", trust_remote_code=True, device="cuda")
# you can also use this model without the features of `use_memory_efficient_attention` and `unpad_inputs`. It can be worked in CPU.
# model = SentenceTransformer(
#     "dunzhang/stella_en_400M_v5",
#     trust_remote_code=True,
#     device="cpu",
#     config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False}
# )
query_embeddings = model.encode(queries, prompt_name=query_prompt_name)
doc_embeddings = model.encode(docs)
print(query_embeddings.shape, doc_embeddings.shape)
# (2, 1024) (2, 1024)

similarities = model.similarity(query_embeddings, doc_embeddings)
print(similarities)
# tensor([[0.8398, 0.2990],
#         [0.3282, 0.8095]])


  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(2, 1024) (2, 1024)
tensor([[0.8398, 0.2990],
        [0.3282, 0.8095]])


In [33]:
# bge-large-en-v1.5

from sentence_transformers import SentenceTransformer

sentences_1 = ["I like to eat pizza", "I like to eat sushi"]
sentences_2 = ["I love pizza", "I love sushi"]
model = SentenceTransformer("BAAI/bge-large-en-v1.5")
embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)
embeddings_2 = model.encode(sentences_2, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)


[[0.8919014  0.7085041 ]
 [0.70036054 0.9211165 ]]


In [32]:
# gte-large-en-v1.5

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

sentences = ["That is a happy person", "That is a very happy person"]

model = SentenceTransformer("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True, device="cuda")
embeddings = model.encode(sentences)
print(cos_sim(embeddings[0], embeddings[1]))


tensor([[0.9769]])
