In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [None]:
# 라이브러리 import
import json
import numpy as np
from pathlib import Path
from transformers import AutoModel, AutoTokenizer
import torch
from datetime import datetime
import time
from tqdm import tqdm

In [None]:
# ===== GPU sharding settings =====
# GPU_ID only names this shard's output directory (see OUTPUT_DIR below).
# NOTE(review): nothing visible in this notebook uses GPU_ID to pick a CUDA
# device; the model is moved with a plain model.cuda() — confirm device
# selection is handled elsewhere (e.g. CUDA_VISIBLE_DEVICES).
# START_IDX / END_IDX select the half-open slice [START_IDX, END_IDX) of the
# sorted input files handled by this shard.
GPU_ID = 0
START_IDX = 0
END_IDX = 100
# ======================================================

# General settings
MODEL_NAME = "jinaai/jina-embeddings-v3"
DATA_DIR = Path("../guardian_top100_scraping")
OUTPUT_DIR = Path(f"vector_bodyText_gpu{GPU_ID}")
BATCH_SIZE = 16  # passed to generate_embeddings as batch_size
CHECKPOINT_FILE = OUTPUT_DIR / "checkpoint.json"

# Jina v3 token settings
MAX_TOKENS = 8192      # truncation limit applied when encoding each chunk
CHUNK_TOKENS = 8000    # chunk window size used when splitting long articles
OVERLAP_TOKENS = 100   # tokens shared between consecutive chunks

OUTPUT_DIR.mkdir(exist_ok=True)
print(f"GPU {GPU_ID}: Processing people {START_IDX} to {END_IDX-1}")

GPU 0: Processing people 0 to 49


In [None]:
# Load the tokenizer and the embedding model, then move the model to CUDA
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).cuda()
model.eval()  # switch the module to evaluation mode for inference
print("Model loaded successfully!")

Loading model...


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Model loaded successfully!


In [None]:
def extract_person_name(filename):
    """Return the person name encoded in a data file path.

    The name is the path's stem, i.e. the final path component without its
    last suffix.
    """
    person_name = filename.stem
    return person_name

def parse_pub_date(web_pub_date):
    """Convert a webPublicationDate string to ``YYYY_MM_DD`` format.

    Args:
        web_pub_date: ISO-8601 timestamp string; a trailing ``Z`` is accepted.
            May be empty or None when the article carries no date.

    Returns:
        The date formatted as ``YYYY_MM_DD``, or ``''`` when no date was given.

    Raises:
        ValueError: if a non-empty value cannot be parsed by
            ``datetime.fromisoformat``.
    """
    # Callers pass '' for articles without a date; fromisoformat('') would
    # raise, so treat "no date" as an empty result instead of an error.
    if not web_pub_date:
        return ''
    # Rewrite the 'Z' suffix as an explicit UTC offset before parsing.
    dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
    return dt.strftime("%Y_%m_%d")

def chunk_text_by_tokens(text, tokenizer, max_tokens=8000, overlap_tokens=100):
    """Split text into token-based chunks, with overlap between neighbours.

    Args:
        text: The text to split.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``decode(tokens, skip_special_tokens=True)``.
        max_tokens: Maximum number of tokens per chunk.
        overlap_tokens: Number of tokens shared by consecutive chunks.

    Returns:
        A list of chunk strings. If the text has at most ``max_tokens`` tokens
        the original text is returned unchanged as the only element.

    Raises:
        ValueError: if the text needs chunking and ``overlap_tokens`` is not
            smaller than ``max_tokens`` (the window could never advance).
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Short texts are returned as-is, without an encode/decode round trip.
    if len(tokens) <= max_tokens:
        return [text]

    step = max_tokens - overlap_tokens
    if step <= 0:
        # A negative step would make range() empty and silently yield no
        # chunks; a zero step would fail with an unrelated range() error.
        raise ValueError(
            f"overlap_tokens ({overlap_tokens}) must be smaller than "
            f"max_tokens ({max_tokens})"
        )

    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start:start + max_tokens]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))

        # Stop once a window reaches the end of the token list, so no
        # trailing chunk made only of already-covered overlap is emitted.
        if start + max_tokens >= len(tokens):
            break

    return chunks

@torch.no_grad()
def generate_embeddings(texts, batch_size=16):
    """Embed each text with the module-level model and return a NumPy array.

    The texts are walked in slices of ``batch_size``, but every text is still
    encoded on its own so that truncation to MAX_TOKENS is applied per text.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Size of the slices used to walk through ``texts``.

    Returns:
        ``np.array`` built from the per-text embeddings, in input order.
    """
    def _embed_one(text):
        # Tokenize with strict truncation, then decode back so the string
        # handed to the model is within the token limit.
        token_ids = tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=MAX_TOKENS,
            truncation=True,
        )
        clipped_text = tokenizer.decode(token_ids, skip_special_tokens=True)

        # Use the model's own encode method; it returns one row per input.
        vectors = model.encode(
            [clipped_text],
            task='text-matching',
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        return vectors[0]

    collected = []
    for offset in range(0, len(texts), batch_size):
        collected.extend(_embed_one(item) for item in texts[offset:offset + batch_size])

    return np.array(collected)

def mean_pool_embeddings(embeddings):
    """Mean-pool several chunk embeddings by averaging over the first axis."""
    pooled = np.mean(embeddings, axis=0)
    return pooled

In [None]:
# Restore the set of already-processed file names from the checkpoint, if any
processed_files = set()
if CHECKPOINT_FILE.exists():
    with open(CHECKPOINT_FILE, 'r') as f:
        checkpoint = json.load(f)
    # A checkpoint without the key is treated as "nothing processed yet"
    processed_files = set(checkpoint.get('processed_files', []))
    print(f"Checkpoint found: {len(processed_files)} files already processed")

Checkpoint found: 9 files already processed


In [None]:
# Load previously saved results so this run appends to them
embeddings_file = OUTPUT_DIR / "embeddings.npy"
metadata_file = OUTPUT_DIR / "metadata.jsonl"

if embeddings_file.exists() and metadata_file.exists():
    print("Loading existing data...")
    existing_embeddings = np.load(embeddings_file)
    with open(metadata_file, 'r', encoding='utf-8') as f:
        # One JSON object per line; skip blank lines so stray newlines
        # cannot break the parse.
        existing_metadata = [json.loads(line) for line in f if line.strip()]

    # Row i of the embeddings must describe metadata entry i. The two files
    # are written one after the other, so an interrupted save can leave them
    # out of step; appending to that would silently misalign every later row.
    if existing_embeddings.shape[0] != len(existing_metadata):
        raise ValueError(
            f"Saved data is inconsistent: {existing_embeddings.shape[0]} embeddings "
            f"vs {len(existing_metadata)} metadata entries in {OUTPUT_DIR}"
        )
    print(f"Loaded {len(existing_metadata)} existing entries")
else:
    existing_embeddings = None
    existing_metadata = []
    print("Starting fresh (no existing data found)")

Loading existing data...
Loaded 34288 existing entries


In [None]:
# Gather every .jsonl input, keep this GPU's slice, and drop files already done
all_files = sorted(DATA_DIR.glob("*.jsonl"))
jsonl_files = [
    path
    for path in all_files[START_IDX:END_IDX]
    if path.name not in processed_files
]

print(f"Total files in this GPU: {len(jsonl_files)}")

Total files in this GPU: 41


In [None]:
# Collect articles and generate embeddings, one person (input file) at a time
for idx, file_path in enumerate(jsonl_files):
    person = extract_person_name(file_path)
    start_time = time.time()

    # jsonl_files holds only the files still pending in this run, so the
    # progress counter is relative to that list (START_IDX indexes a
    # different, unfiltered list and must not be added here).
    print(f"\n[{idx + 1}/{len(jsonl_files)}] Processing: {person}")

    # Read the articles: one JSON object per line, blank lines ignored
    with open(file_path, 'r', encoding='utf-8') as f:
        articles = [json.loads(line) for line in f if line.strip()]

    print(f"  Total articles: {len(articles)}")

    # Per-article results for the current person
    person_embeddings = []
    person_metadata = []

    # Process the articles with a progress bar
    for article in tqdm(articles, desc="  Processing articles", leave=False):
        body_text = article.get('bodyText', '')
        if not body_text:
            continue

        # Split the body into token-based chunks
        chunks = chunk_text_by_tokens(
            body_text,
            tokenizer,
            max_tokens=CHUNK_TOKENS,
            overlap_tokens=OVERLAP_TOKENS
        )

        if not chunks:
            continue

        # Embed every chunk, then mean-pool into a single article vector
        chunk_embeddings = generate_embeddings(chunks, batch_size=BATCH_SIZE)
        article_embedding = mean_pool_embeddings(chunk_embeddings)

        # An article without a publication date is stored with an empty
        # pub_date instead of aborting the whole person with a parse error.
        raw_pub_date = article.get('webPublicationDate', '')
        pub_date = parse_pub_date(raw_pub_date) if raw_pub_date else ''

        person_embeddings.append(article_embedding)
        person_metadata.append({
            'person': person,
            'article_id': article.get('id', ''),
            'pub_date': pub_date
        })

    # Persist the current person's results
    if person_embeddings:
        person_embeddings_array = np.array(person_embeddings)

        # Append to whatever has been accumulated so far
        if existing_embeddings is not None:
            combined_embeddings = np.vstack([existing_embeddings, person_embeddings_array])
        else:
            combined_embeddings = person_embeddings_array

        combined_metadata = existing_metadata + person_metadata

        # Save immediately so an interruption loses at most the current person
        np.save(embeddings_file, combined_embeddings)
        with open(metadata_file, 'w', encoding='utf-8') as f:
            for meta in combined_metadata:
                f.write(json.dumps(meta, ensure_ascii=False) + '\n')

        # Carry the accumulated state into the next iteration
        existing_embeddings = combined_embeddings
        existing_metadata = combined_metadata

        elapsed_time = time.time() - start_time
        print(f"  ✓ Done! Processed {len(person_metadata)} articles (Total: {len(combined_metadata)} articles)")
        print(f"  ⏱ Time taken: {elapsed_time:.2f} seconds")

    # Update the checkpoint (also for files that produced no embeddings)
    processed_files.add(file_path.name)
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'processed_files': list(processed_files)}, f)

print(f"\nAll processing complete for GPU {GPU_ID}!")
# existing_embeddings is still None when nothing was loaded and nothing was
# produced in this run; guard the attribute access in that case.
if existing_embeddings is not None:
    print(f"Final embeddings shape: {existing_embeddings.shape}")
else:
    print("Final embeddings shape: (no embeddings were produced)")
print(f"Total articles: {len(existing_metadata)}")

## 모든 GPU 결과 합치기
**4개 GPU 모두 완료된 후에 실행**
- 일단은 1개 GPU만 실행

In [None]:
# Merge the per-GPU results into a single embeddings/metadata pair
# Number of vector_bodyText_gpu{N} shard directories to merge (N = 0 .. NUM_GPUS-1)
NUM_GPUS = 1

print("Merging results from all GPUs...\n")

all_embeddings = []
all_metadata = []

for gpu_id in range(NUM_GPUS):
    gpu_dir = Path(f"vector_bodyText_gpu{gpu_id}")
    emb_file = gpu_dir / "embeddings.npy"
    meta_file = gpu_dir / "metadata.jsonl"

    if not emb_file.exists() or not meta_file.exists():
        print(f"⚠️  GPU {gpu_id}: Files not found, skipping...")
        continue

    # Load this shard's embeddings and metadata (one JSON object per line;
    # blank lines are ignored)
    emb = np.load(emb_file)
    with open(meta_file, 'r', encoding='utf-8') as f:
        gpu_metadata = [json.loads(line) for line in f if line.strip()]

    # Row i of the embeddings must describe metadata entry i; merging a shard
    # whose files disagree would misalign every row that follows it.
    if emb.shape[0] != len(gpu_metadata):
        raise ValueError(
            f"GPU {gpu_id}: {emb.shape[0]} embeddings vs "
            f"{len(gpu_metadata)} metadata entries in {gpu_dir}"
        )

    all_embeddings.append(emb)
    all_metadata.extend(gpu_metadata)
    print(f"✓ GPU {gpu_id}: Loaded {emb.shape[0]} embeddings")

# Concatenate the shards
if all_embeddings:
    final_embeddings = np.vstack(all_embeddings)

    # Save the final merged result
    final_dir = Path("vector_bodyText")
    final_dir.mkdir(exist_ok=True)

    np.save(final_dir / "embeddings.npy", final_embeddings)
    with open(final_dir / "metadata.jsonl", 'w', encoding='utf-8') as f:
        for meta in all_metadata:
            f.write(json.dumps(meta, ensure_ascii=False) + '\n')

    print(f"\n{'='*50}")
    print(f"✅ Merge complete!")
    print(f"Final embeddings shape: {final_embeddings.shape}")
    print(f"Total articles: {len(all_metadata)}")
    print(f"Saved to: {final_dir}/")
    print(f"{'='*50}")
else:
    print("\n❌ No data found to merge!")

Merging results from all GPUs...

✓ GPU 0: Loaded 225145 embeddings
✓ GPU 1: Loaded 235577 embeddings

✅ Merge complete!
Final embeddings shape: (460722, 1024)
Total articles: 460722
Saved to: vector_bodyText/
