In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive at /content/drive so the project folder below is reachable.
drive.mount('/content/drive')

# Base path of the project (adjust to match your own Drive folder).
BASE_DIR = '/content/drive/MyDrive/Dacon_FakeText/'
# Path of the train embedding file under BASE_DIR/data.
# NOTE(review): not referenced by any of the cells visible here.
SAVE_PATH = BASE_DIR + 'data/train_sbert_embeddings.npy'

Mounted at /content/drive


In [None]:
# Number of paragraphs embedded between two consecutive checkpoint saves.
CHUNK_SIZE = 50_000

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# Recommended SBERT model for Korean text.
# The model is loaded once at module level; process_embedding() uses this
# global `model` object to encode texts.
model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

In [None]:
def process_embedding(data_df, text_column, save_filename, batch_size=128):
    """Embed a text column chunk by chunk, checkpointing to disk so a run can resume.

    The texts are encoded with the module-level ``model`` in slices of
    ``CHUNK_SIZE`` rows. After every slice, all embeddings accumulated so far
    (including rows loaded from a previous run) are written to
    ``BASE_DIR + 'data/' + save_filename``. If that file already exists, its
    rows are taken to be the embeddings of the first rows of ``data_df`` and
    encoding continues with the next row.

    Embeddings are kept as NumPy arrays in the dtype returned by the encoder;
    they are no longer round-tripped through Python lists (which was slow,
    memory hungry and upcast everything to float64).

    Args:
        data_df: Table-like object; ``data_df[text_column].tolist()`` must
            yield the texts to embed.
        text_column: Name of the column that holds the texts.
        save_filename: Name of the ``.npy`` checkpoint file inside
            ``BASE_DIR + 'data/'``.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        numpy.ndarray: The full embedding matrix, one row per text (previously
        saved rows first, newly computed rows after them). An empty ``(0, 0)``
        array is returned when there is nothing to embed and no checkpoint.

    Raises:
        ValueError: If the existing checkpoint holds more rows than
            ``data_df`` has texts, i.e. it cannot belong to this data.
    """
    texts = data_df[text_column].tolist()
    total_samples = len(texts)
    save_path = BASE_DIR + f'data/{save_filename}'

    print(f"\n==== Start embedding: {save_filename} ====")
    print(f"총 샘플 수: {total_samples}")

    # Resume from an existing checkpoint. The loaded rows are kept and later
    # extended, so the next save never overwrites earlier progress.
    if os.path.exists(save_path):
        print(f"기존 저장 파일 존재: {save_filename} → 이어서 진행")
        all_embeddings = np.load(save_path)
        start_idx = all_embeddings.shape[0]
        print(f"기존 저장된 임베딩 개수: {start_idx}")
        if start_idx > total_samples:
            # A checkpoint longer than the input cannot be a prefix of it.
            raise ValueError(
                f"Checkpoint {save_path} holds {start_idx} rows but the input "
                f"has only {total_samples} texts; remove the stale file or "
                f"use a different save_filename."
            )
    else:
        all_embeddings = None
        start_idx = 0

    print(f"시작 인덱스: {start_idx} / 총: {total_samples}")

    for i in tqdm(range(start_idx, total_samples, CHUNK_SIZE)):
        batch_texts = texts[i: i + CHUNK_SIZE]
        last_idx = i + len(batch_texts) - 1
        print(f"\n[{i}~{last_idx}] 임베딩 중...")

        embeddings = np.asarray(
            model.encode(batch_texts, batch_size=batch_size, show_progress_bar=True)
        )

        # The first chunk defines the embedding width; later chunks are appended.
        if all_embeddings is None or all_embeddings.shape[0] == 0:
            all_embeddings = embeddings
        else:
            all_embeddings = np.concatenate([all_embeddings, embeddings], axis=0)

        # Checkpoint everything computed so far (old rows + new rows).
        np.save(save_path, all_embeddings)
        print(f"[{i}~{last_idx}] 저장 완료! 현재까지 shape: {all_embeddings.shape}")

    if all_embeddings is None:
        # No checkpoint and no texts: return an empty matrix instead of None.
        all_embeddings = np.empty((0, 0), dtype=np.float32)

    print(f"{save_filename} 임베딩 최종 완료!")
    print(f"최종 Shape: {all_embeddings.shape}\n")
    return all_embeddings


In [None]:
# 4. Embed the train data
train_paragraph_df = pd.read_csv(BASE_DIR + 'data/train_paragraph.csv')
process_embedding(
    train_paragraph_df,
    text_column='paragraph_text',
    save_filename='train_sbert_embeddings.npy',
    batch_size=512,
)


==== Start embedding: train_sbert_embeddings.npy ====
총 샘플 수: 1226364
✅ 기존 저장 파일 존재: train_sbert_embeddings.npy → 이어서 진행
✅ 기존 저장된 임베딩 개수: 1226364
시작 인덱스: 1226364 / 총: 1226364


0it [00:00, ?it/s]

✅ train_sbert_embeddings.npy 임베딩 최종 완료!
최종 Shape: (0,)






In [None]:
# 5. Embed the test data
test_df = pd.read_csv(BASE_DIR + 'data/test.csv')
process_embedding(
    test_df,
    text_column='paragraph_text',
    save_filename='test_sbert_embeddings.npy',
    batch_size=1024,
)


==== Start embedding: test_sbert_embeddings.npy ====
총 샘플 수: 1962
시작 인덱스: 0 / 총: 1962


  0%|          | 0/1 [00:00<?, ?it/s]


[0~1961] 임베딩 중...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:02<00:00,  2.95s/it]

[0~1961] 저장 완료! 현재까지 shape: (1962, 768)
✅ test_sbert_embeddings.npy 임베딩 최종 완료!
최종 Shape: (1962, 768)




