In [12]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [13]:
# 필요한 라이브러리 import
import json
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from datetime import datetime

In [14]:
# 설정
MODEL_NAME = "BAAI/bge-large-en-v1.5"
DATA_DIR = Path("guardian_top100_scraping")
OUTPUT_DIR = Path("vector_chunking")
BATCH_SIZE = 32
CHECKPOINT_FILE = OUTPUT_DIR / "checkpoint.json"

# Chunking 설정
CHUNK_SIZE = 2  # 두 문장씩
OVERLAP = 1     # 한 문장씩 overlap

OUTPUT_DIR.mkdir(exist_ok=True)

In [15]:
# Model과 Tokenizer load
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.cuda()
model.eval()
print("Model loaded successfully!")

Loading model...
Model loaded successfully!


In [16]:
def extract_person_name(filename):
    """파일명에서 person 이름 추출"""
    return filename.stem

def parse_pub_date(web_pub_date):
    """webPublicationDate를 YYYY_MM_DD 형식으로 변환"""
    dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
    return dt.strftime("%Y_%m_%d")

def split_into_sentences(text):
    """텍스트를 문장 단위로 분리 (. 기준)"""
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    return sentences

def create_chunks(sentences, chunk_size=2, overlap=1):
    """문장들을 chunk_size만큼 묶고, overlap만큼 겹치게 생성"""
    chunks = []
    step = chunk_size - overlap
    
    for i in range(0, len(sentences), step):
        chunk_sentences = sentences[i:i+chunk_size]
        if chunk_sentences:
            chunk_text = '. '.join(chunk_sentences) + '.'
            chunks.append(chunk_text)
        
        if i + chunk_size >= len(sentences):
            break
    
    return chunks

@torch.no_grad()
def generate_embeddings(texts, batch_size=32):
    """Batch 단위로 embedding 생성"""
    embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        
        # Tokenize
        encoded = tokenizer(batch, padding=True, truncation=True, 
                          max_length=512, return_tensors='pt')
        encoded = {k: v.cuda() for k, v in encoded.items()}
        
        # Generate embeddings
        outputs = model(**encoded)
        # CLS token embedding 사용
        batch_embeddings = outputs.last_hidden_state[:, 0, :]
        
        # Normalize
        batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
        
        embeddings.append(batch_embeddings.cpu().numpy())
    
    return np.vstack(embeddings)

def mean_pool_embeddings(embeddings):
    """여러 chunk embeddings를 mean pooling"""
    return np.mean(embeddings, axis=0)

In [17]:
# Checkpoint 확인
processed_files = set()
if CHECKPOINT_FILE.exists():
    with open(CHECKPOINT_FILE, 'r') as f:
        checkpoint = json.load(f)
        processed_files = set(checkpoint.get('processed_files', []))
        print(f"Checkpoint found: {len(processed_files)} files already processed")

Checkpoint found: 1 files already processed


In [18]:
# 모든 .jsonl 파일 수집
jsonl_files = sorted([f for f in DATA_DIR.glob("*.jsonl") if f.name not in processed_files])
print(f"Total files to process: {len(jsonl_files)}")

Total files to process: 99


In [19]:
# 데이터 수집 및 embedding 생성 (인물별로 처리)
all_embeddings = []
all_metadata = []
total_articles = 0

for idx, file_path in enumerate(jsonl_files):
    person = extract_person_name(file_path)
    print(f"\n[{idx+1}/{len(jsonl_files)}] Processing: {person}")
    
    # 파일에서 기사 읽기
    with open(file_path, 'r', encoding='utf-8') as f:
        articles = [json.loads(line) for line in f]
    
    # 현재 인물의 기사별 처리
    person_embeddings = []
    person_metadata = []
    article_count = 0
    
    for article in articles:
        body_text = article.get('bodyText', '')
        if not body_text:
            continue
        
        # 문장 분리
        sentences = split_into_sentences(body_text)
        if not sentences:
            continue
        
        # Chunks 생성
        chunks = create_chunks(sentences, chunk_size=CHUNK_SIZE, overlap=OVERLAP)
        if not chunks:
            continue
        
        # 각 chunk의 embedding 생성
        chunk_embeddings = generate_embeddings(chunks, batch_size=BATCH_SIZE)
        
        # Mean pooling: 한 기사의 모든 chunk embeddings를 평균
        article_embedding = mean_pool_embeddings(chunk_embeddings)
        
        person_embeddings.append(article_embedding)
        person_metadata.append({
            'person': person,
            'article_id': article.get('id', ''),
            'pub_date': parse_pub_date(article.get('webPublicationDate', ''))
        })
        article_count += 1
    
    # 현재 인물의 결과 저장
    if person_embeddings:
        person_embeddings_array = np.array(person_embeddings)
        all_embeddings.append(person_embeddings_array)
        all_metadata.extend(person_metadata)
        total_articles += article_count
        print(f"  ✓ Done! Processed {article_count} articles (Total: {total_articles} articles)")
    
    # Checkpoint 업데이트
    processed_files.add(file_path.name)
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'processed_files': list(processed_files)}, f)

# 모든 embeddings 합치기
print(f"\nCombining all embeddings...")
embeddings = np.vstack(all_embeddings)
print(f"Final embeddings shape: {embeddings.shape}")
print(f"Total articles: {len(all_metadata)}")

# 5829 articles -> 18분 정도


[1/99] Processing: alicia_keys


KeyboardInterrupt: 

In [None]:
# 결과 저장
print("\nSaving results...")

# embeddings.npy 저장
np.save(OUTPUT_DIR / "embeddings.npy", embeddings)
print(f"Saved: {OUTPUT_DIR / 'embeddings.npy'}")

# metadata.jsonl 저장
with open(OUTPUT_DIR / "metadata.jsonl", 'w', encoding='utf-8') as f:
    for meta in all_metadata:
        f.write(json.dumps(meta, ensure_ascii=False) + '\n')
print(f"Saved: {OUTPUT_DIR / 'metadata.jsonl'}")

# Checkpoint 삭제 (완료되었으므로)
if CHECKPOINT_FILE.exists():
    CHECKPOINT_FILE.unlink()

print("\nAll done!")