In [1]:
!pip install -U /kaggle/input/faiss-cpu-173/faiss_cpu-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl


Processing /kaggle/input/faiss-cpu-173/faiss_cpu-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
faiss-cpu is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [2]:
!pip install "huggingface_hub<0.26.0" # Install an older version of huggingface_hub




In [3]:
## 읽기 전용 오류가 발생하는 것을 방지하기 위해 필요
!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers

Processing /kaggle/working/sentence-transformers
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=126122 sha256=d3d4b22274c0126b4c433d3b1fe9d25758a73920cd14bea94ea9a92e1d5e0046
  Stored in directory: /root/.cache/pip/wheels/6c/ea/76/d9a930b223b1d3d5d6aff69458725316b0fe205b854faf1812
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.2.2
    Uninstalling sentence-transformers-2.2.2:
      Successfully uninstalled sentence-transformers-2.2.2
Successfully installed sentence-transformers-2.2.2


In [4]:
!pip install --upgrade blingfire



In [5]:
!pip install \
    --extra-index-url=https://pypi.nvidia.com \
    "cudf-cu12==25.02.*" "dask-cudf-cu12==25.02.*" "cuml-cu12==25.02.*" \
    "cugraph-cu12==25.02.*" "nx-cugraph-cu12==25.02.*" "cuspatial-cu12==25.02.*" \
    "cuproj-cu12==25.02.*" "cuxfilter-cu12==25.02.*" "cucim-cu12==25.02.*" \
    "pylibraft-cu12==25.02.*" "raft-dask-cu12==25.02.*" "cuvs-cu12==25.02.*" \
    "nx-cugraph-cu12==25.02.*"

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==25.02.*
  Downloading https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.2.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dask-cudf-cu12==25.02.*
  Downloading https://pypi.nvidia.com/dask-cudf-cu12/dask_cudf_cu12-25.2.1-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cuml-cu12==25.02.*
  Downloading https://pypi.nvidia.com/cuml-cu12/cuml_cu12-25.2.0-cp310-cp310-manylinux_2_28_x86_64.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cugraph-cu12==25.02.*
  Downloading https://pypi.nvidia.com/cugraph-cu12/cugraph_cu12-25.2.0-cp310-cp31

In [3]:
# 필요한 라이브러리 임포트
import os
import gc
#import pandas as pd
import cudf as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer

# scipy 경고 무시
import warnings
warnings.filterwarnings("ignore")

In [4]:
%%time
# 텍스트 문장 분할 함수 정의
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    EMR에서 문서를 처리하는 주요 도우미 함수입니다.

    :param documents: 문자열인 문서를 포함하는 반복 가능 객체
    :param document_ids: 문서 고유 식별자를 포함하는 반복 가능 객체
    :param split_sentences: 섹션을 문장으로 더 분할할지 여부를 결정하는 플래그
    :param filter_len: 문장의 최소 문자 길이(그렇지 않으면 필터링)
    :param disable_progress_bar: tqdm 진행률 표시줄을 비활성화하는 플래그
    :return: `document_id`, `text`, `section`, `offset` 열을 포함하는 Pandas DataFrame
    """

    df = sectionize_documents(documents, document_ids, disable_progress_bar)

    if split_sentences:
        # cuDF Series를 Python 리스트로 변환하여 sentencize에 전달합니다.
        df = sentencize(df.text.to_arrow().to_pylist(),
                          df.document_id.to_arrow().to_pylist(),
                          df.offset.to_arrow().to_pylist(),
                          filter_len,
                          disable_progress_bar)
    return df


def sectionize_documents(documents: Iterable[str],
                           document_ids: Iterable,
                           disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    이미징 보고서의 섹션을 가져오고 선택한 섹션만 반환합니다(기본값은 FINDINGS, IMPRESSION 및 ADDENDUM).

    :param documents: 문자열인 문서를 포함하는 반복 가능 객체
    :param document_ids: 문서 고유 식별자를 포함하는 반복 가능 객체
    :param disable_progress_bar: tqdm 진행률 표시줄을 비활성화하는 플래그
    :return: `document_id`, `text`, `offset` 열을 포함하는 Pandas DataFrame
    """
    processed_documents = []
    for document_id, document in tqdm(zip(document_ids, documents), total=len(documents), disable=disable_progress_bar):
        row = {}
        text, start, end = (document, 0, len(document))
        row['document_id'] = document_id
        row['text'] = text
        row['offset'] = (start, end)

        processed_documents.append(row)

    _df = pd.DataFrame(processed_documents)
    if _df.shape[0] > 0:
        return _df.sort_values(['document_id', 'offset']).reset_index(drop=True)
    else:
        return _df


def sentencize(documents: Iterable[str],
                document_ids: Iterable,
                offsets: Iterable[tuple[int, int]],
                filter_len: int = 3,
                disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    문서를 문장으로 분할합니다. `sectionize_documents`와 함께 사용하여 문서를 더 관리하기 쉬운 조각으로 분할할 수 있습니다.
    분할 후 문장을 원본 문서의 위치와 일치시킬 수 있도록 오프셋을 사용합니다.

    :param documents: 문자열인 문서를 포함하는 반복 가능 객체
    :param document_ids: 문서 고유 식별자를 포함하는 반복 가능 객체
    :param offsets: 시작 및 끝 인덱스의 반복 가능 튜플
    :param filter_len: 문장의 최소 문자 길이(그렇지 않으면 필터링)
    :return: `document_id`, `text`, `section`, `offset` 열을 포함하는 Pandas DataFrame
    """

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents), disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                if o[1]-o[0] > filter_len:
                    sentence = document[o[0]:o[1]]
                    abs_offsets = (o[0]+offset[0], o[1]+offset[0])
                    row = {}
                    row['document_id'] = document_id
                    row['text'] = sentence
                    row['offset'] = abs_offsets
                    document_sentences.append(row)
        except:
            continue
    return pd.DataFrame(document_sentences)

CPU times: user 7 µs, sys: 2 µs, total: 9 µs
Wall time: 11 µs


In [5]:
# 설정값 정의
MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'
DEVICE = 0
MAX_LENGTH = 384
BATCH_SIZE = 16

In [6]:
%%time
# 데이터 로드
WIKI_PATH = "/kaggle/input/wikipedia-20230701"
wiki_files = os.listdir(WIKI_PATH)
trn = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv")

CPU times: user 13.6 ms, sys: 71.7 ms, total: 85.3 ms
Wall time: 210 ms


In [7]:
%%time
# 모델 로드
model = SentenceTransformer(MODEL, device='cuda')
model.max_seq_length = MAX_LENGTH
model = model.half()

CPU times: user 5.21 s, sys: 571 ms, total: 5.78 s
Wall time: 3.78 s


In [8]:
%%time
# 사전 계산된 위키백과 인덱스 사용
sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")


CPU times: user 327 ms, sys: 4.82 s, total: 5.14 s
Wall time: 1min 7s


In [9]:
%%time
# 프롬프트 임베딩
#prompt_embeddings = model.encode(trn.prompt.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
#prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
prompt_embeddings = model.encode(trn.prompt.to_pandas().values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
_ = gc.collect()

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

CPU times: user 355 ms, sys: 303 ms, total: 658 ms
Wall time: 661 ms


In [10]:
%%time
# 유사도 검색
search_score, search_index = sentence_index.search(prompt_embeddings, 3)
del sentence_index
del prompt_embeddings
_ = gc.collect()

CPU times: user 16min 42s, sys: 831 ms, total: 16min 43s
Wall time: 31.9 s


In [11]:
%%time
# 위키백과 인덱스 파일 로드
df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet", columns=['id', 'file'])
wikipedia_file_data = []
for i, (scr, idx) in tqdm(enumerate(zip(search_score, search_index)), total=len(search_score)):
    scr_idx = idx
    _df = df.loc[scr_idx].copy()
    _df['prompt_id'] = i
    wikipedia_file_data.append(_df)
wikipedia_file_data = pd.concat(wikipedia_file_data).reset_index(drop=True)
wikipedia_file_data = wikipedia_file_data[['id', 'prompt_id', 'file']].drop_duplicates().sort_values(['file', 'id']).reset_index(drop=True)
del df
_ = gc.collect()

  0%|          | 0/200 [00:00<?, ?it/s]

CPU times: user 2.07 s, sys: 143 ms, total: 2.21 s
Wall time: 5.44 s


In [12]:
%%time
# 전체 텍스트 데이터 로드
wiki_text_data = []
#for file in tqdm(wikipedia_file_data.file.unique(), total=len(wikipedia_file_data.file.unique())):
#    _id = [str(i) for i in wikipedia_file_data[wikipedia_file_data['file']==file]['id'].tolist()]
#    _df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])
#    _df = _df[_df['id'].isin(_id)]
#    wiki_text_data.append(_df)
#    _ = gc.collect()

for file in tqdm(wikipedia_file_data.file.unique().to_arrow().to_pylist(), total=len(wikipedia_file_data.file.unique())):
    _id = [str(i) for i in wikipedia_file_data[wikipedia_file_data['file']==file]['id'].to_arrow().to_pylist()] #.tolist()]
    _df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])

    _df = _df[_df['id'].isin(_id)]
    wiki_text_data.append(_df)
    _ = gc.collect()
wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)
_ = gc.collect()

  0%|          | 0/28 [00:00<?, ?it/s]

CPU times: user 17.4 s, sys: 1.95 s, total: 19.3 s
Wall time: 39.7 s


In [13]:
%%time
# 문장 분할
#processed_wiki_text_data = process_documents(wiki_text_data.text.values, wiki_text_data.id.values)
processed_wiki_text_data = process_documents(wiki_text_data.text.to_arrow().to_pylist(), wiki_text_data.id.to_arrow().to_pylist())

  0%|          | 0/576 [00:00<?, ?it/s]

  0%|          | 0/576 [00:00<?, ?it/s]

CPU times: user 1.29 s, sys: 21.3 ms, total: 1.31 s
Wall time: 1.2 s


In [15]:
%%time
# 문장 임베딩
#wiki_data_embeddings = model.encode(processed_wiki_text_data.text, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
#wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()
wiki_data_embeddings = model.encode(processed_wiki_text_data.text.to_arrow().to_pylist(), batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()
_ = gc.collect()

Batches:   0%|          | 0/2020 [00:00<?, ?it/s]

CPU times: user 19.8 s, sys: 8.03 s, total: 27.9 s
Wall time: 10.4 s


In [16]:
# 질문+답변 선택지 임베딩
#trn['answer_all'] = trn.apply(lambda x: " ".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)
#trn['prompt_answer_stem'] = trn['prompt'] + " " + trn['answer_all']
#question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
#question_embeddings = question_embeddings.detach().cpu().numpy()

ValueError: user defined function compilation failed.

In [17]:
trn['answer_all'] = trn['A'].str.cat(
    [trn['B'], trn['C'], trn['D'], trn['E']], sep=" "
)
trn['prompt_answer_stem'] = trn['prompt'] + " " + trn['answer_all']

In [18]:
%%time
import pandas as pd_original  # Rename the original pandas

# Convert the cuDF DataFrame to a Pandas DataFrame
trn_pd = trn.to_pandas()

# Apply the function on the Pandas DataFrame
trn_pd['answer_all'] = trn_pd.apply(lambda x: " ".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)
trn_pd['prompt_answer_stem'] = trn_pd['prompt'] + " " + trn_pd['answer_all']

# Convert the Pandas DataFrame back to a cuDF DataFrame
trn = pd.from_pandas(trn_pd) # Use pd (cudf) here to access from_pandas

CPU times: user 25.2 ms, sys: 0 ns, total: 25.2 ms
Wall time: 29.9 ms


In [20]:
%%time

# Example setup, assuming you already have a cuDF DataFrame 'trn_g' with columns 'A', 'B', 'C', 'D', 'E', and 'prompt'

try:
    # Combine all answers
    if 'B' in trn_g.columns and 'C' in trn_g.columns and 'D' in trn_g.columns and 'E' in trn_g.columns:
        trn_g['answer_all'] = trn_g['A'].str.cat([trn_g['B'], trn_g['C'], trn_g['D'], trn_g['E']], sep=' ')
    else:
        raise ValueError("One or more required columns are missing.")

    # Concatenate prompt with answers
    trn_g['prompt_answer_stem'] = trn_g['prompt'].str.cat(trn_g['answer_all'], sep=' ')

except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: name 'trn_g' is not defined
CPU times: user 66 µs, sys: 1 µs, total: 67 µs
Wall time: 57.7 µs


In [21]:
question_embeddings = model.encode(trn['prompt_answer_stem'].to_arrow().to_pylist(), batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
question_embeddings = question_embeddings.detach().cpu().numpy()

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [33]:
# 컨텍스트 검색 및 결합 using cuvs
import cupy as cp
from cuvs.neighbors import cagra

NUM_SENTENCES_INCLUDE = 3  # 포함할 문장의 개수
prompt_contexts = []
contexts = []

# 위키 데이터 임베딩을 GPU에 올리고 인덱스 생성
index_params = cagra.IndexParams()  # 필요에 따라 파라미터를 수정
wiki_data_embeddings = cp.asarray(wiki_data_embeddings, dtype=cp.float32)  # GPU에 올리고 cp.float32 형식으로 변환
wiki_index = cagra.build(index_params, wiki_data_embeddings)

for r in trn.to_pandas().itertuples():  # Pandas DataFrame을 순회
    prompt_context = ""
    prompt_id = r.id

    # 현재 프롬프트에 해당하는 질문 임베딩을 GPU에 올림
    query_vector = cp.asarray(question_embeddings[prompt_id], dtype=cp.float32).reshape(1, -1)

    # 검색 파라미터 설정
    search_params = cagra.SearchParams(max_queries=100, itopk_size=64)

    # 검색 실행
    distances, indices = cagra.search(search_params, wiki_index, query_vector, NUM_SENTENCES_INCLUDE)

    prompt_context += f"Question: {trn.prompt.iloc[prompt_id]}\n"
    prompt_context += "Choices:\n"
    prompt_context += f"(A) {trn.A.iloc[prompt_id]}\n"
    prompt_context += f"(B) {trn.B.iloc[prompt_id]}\n"
    prompt_context += f"(C) {trn.C.iloc[prompt_id]}\n"
    prompt_context += f"(D) {trn.D.iloc[prompt_id]}\n"
    prompt_context += f"(E) {trn.E.iloc[prompt_id]}\n"

    # 유효한 결과가 있는 경우 문맥 추가
    if indices.shape[0] > 0:
        prompt_context += "Context:\n"
        context = ""
        # GPU에 있는 distances와 indices를 CPU의 NumPy 배열로 변환
        distances_cpu = cp.asnumpy(distances)
        indices_cpu = cp.asnumpy(indices)
        # 각 후보 인덱스와 거리를 동시에 순회
        for candidate_idx, candidate_distance in zip(indices_cpu[0], distances_cpu[0]):
            if candidate_distance < 2:  # 거리 임계값
                context += "[*] " + processed_wiki_text_data['text'].iloc[candidate_idx] + "\n"
        prompt_context += context

    contexts.append(context)
    prompt_contexts.append(prompt_context)


In [34]:
trn['context'] = contexts
trn.to_csv("./train_context.csv", index=False)

In [35]:
# 결과 출력
for i, p in enumerate(prompt_contexts[:10]):
    print(f"Question {i}")
    print(p)
    print()

Question 0
Question: Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
Choices:
(A) MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."
(B) MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.
(C) MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.
(D) MOND is a theory that reduces the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 2.
(E) MOND is a theory that 

In [36]:
import importlib.metadata

# 설치된 패키지 목록 가져오기
installed_packages = importlib.metadata.distributions()

# 각 패키지의 이름과 버전 출력
for package in installed_packages:
    print(package.metadata["Name"], package.metadata["Version"])

nx-cugraph-cu12 25.2.0
nvidia-ml-py 12.570.86
raft-dask-cu12 25.2.0
sortedcontainers 2.4.0
pyct 0.5.0
sentence-transformers 2.2.2
ucxx-cu12 0.42.0
libcuspatial-cu12 25.2.0
cucim-cu12 25.2.0
pynvml 12.0.0
dask-expr 1.1.21
libraft-cu12 25.2.0
distributed-ucxx-cu12 0.42.0
distributed 2024.12.1
cuxfilter-cu12 25.2.0
faiss-cpu 1.7.3
huggingface-hub 0.25.2
pylibcugraph-cu12 25.2.0
cugraph-cu12 25.2.0
ucx-py-cu12 0.42.0
cuda-bindings 12.8.0
dask 2024.12.1
pylibraft-cu12 25.2.0
libucx-cu12 1.18.0
cuproj-cu12 25.2.0
treelite 4.4.1
numba-cuda 0.2.0
libkvikio-cu12 25.2.1
datashader 0.17.0
cudf-cu12 25.2.1
tblib 3.0.0
rmm-cu12 25.2.0
rapids-dask-dependency 25.2.0
zict 3.0.0
cuvs-cu12 25.2.0
libucxx-cu12 0.42.0
cuml-cu12 25.2.0
cuda-python 12.8.0
dask-cuda 25.2.0
libcudf-cu12 25.2.1
cuspatial-cu12 25.2.0
dask-cudf-cu12 25.2.1
libcuml-cu12 25.2.0
libcuvs-cu12 25.2.0
pylibcudf-cu12 25.2.1
simpervisor 1.0.0
nvidia-nvcomp-cu12 4.2.0.11
libcugraph-cu12 25.2.0
jupyter_server_proxy 4.4.0
blingfire 0.1.8
g