In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
# 라이브러리 import
import json
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from datetime import datetime

In [None]:
# Configuration
MODEL_NAME = "BAAI/bge-large-en-v1.5"  # Hugging Face model id used to embed headlines
DATA_DIR = Path("../guardian_top100_scraping")  # input directory scanned for *.jsonl files
OUTPUT_DIR = Path("vector_headlines")  # embeddings, metadata and checkpoint are written here
BATCH_SIZE = 32  # number of headlines encoded per forward pass
CHECKPOINT_FILE = OUTPUT_DIR / "checkpoint.json"  # names of input files already processed

# parents=True lets OUTPUT_DIR be changed to a nested path (e.g.
# "results/vector_headlines") without creating the parents by hand.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the model and tokenizer
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Prefer the GPU, but fall back to the CPU instead of failing with a CUDA
# error on machines that have no usable GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # inference only: switch off training-time behaviour such as dropout
print(f"Using device: {device}")
print("Model loaded successfully!")

In [None]:
def extract_person_name(filename):
    """Return the person identifier encoded in a data file's name.

    The identifier is the file name without its final extension,
    e.g. ``alex_morgan.jsonl`` -> ``alex_morgan``.

    Args:
        filename: A ``pathlib.Path``-like object exposing ``.stem``.
    """
    person = filename.stem
    return person

def parse_pub_date(web_pub_date):
    """Convert a ``webPublicationDate`` ISO-8601 timestamp to ``YYYY_MM_DD``.

    Args:
        web_pub_date: Timestamp string such as ``"2023-05-17T10:30:00Z"``.
            May be ``None`` or empty when an article carries no date.

    Returns:
        The date formatted as ``YYYY_MM_DD``, or an empty string when no
        timestamp was supplied.

    Raises:
        ValueError: If a non-empty value is not a valid ISO-8601 timestamp.
    """
    # A missing date arrives here as '' or None; report "no date" rather
    # than letting one incomplete article abort the whole run.
    if not web_pub_date:
        return ""
    # datetime.fromisoformat() only understands a trailing 'Z' from
    # Python 3.11 on, so spell it out as an explicit UTC offset first.
    dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
    return dt.strftime("%Y_%m_%d")

@torch.no_grad()
def generate_embeddings(texts, batch_size=32):
    """Embed texts in batches and return L2-normalised CLS-token vectors.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts encoded per forward pass; must be positive.

    Returns:
        A NumPy array with one row per input text, in input order. For empty
        input the result has shape ``(0, hidden_size)`` so it can still be
        stacked with other results.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # np.vstack([]) raises, so handle "nothing to embed" explicitly.
    # NOTE(review): assumes model.config.hidden_size is the embedding width
    # and the model runs in float32 -- true for BERT-style models loaded
    # with default settings; confirm if the model is swapped.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Send inputs to wherever the model actually lives instead of assuming
    # CUDA, so the function works on GPU and CPU alike.
    device = next(model.parameters()).device

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize; inputs longer than 512 tokens are truncated
        encoded = tokenizer(batch, padding=True, truncation=True,
                            max_length=512, return_tensors='pt')
        encoded = {k: v.to(device) for k, v in encoded.items()}

        # Forward pass; the hidden state at position 0 (the CLS token)
        # is used as the sentence embedding
        outputs = model(**encoded)
        batch_embeddings = outputs.last_hidden_state[:, 0, :]

        # L2-normalise each row
        batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)

        embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(embeddings)

In [None]:
# Resume support: read the names of the files finished in an earlier run
processed_files = set()
if CHECKPOINT_FILE.exists():
    checkpoint = json.loads(CHECKPOINT_FILE.read_text())
    processed_files = set(checkpoint.get('processed_files', []))
    print(f"Checkpoint found: {len(processed_files)} files already processed")

In [None]:
# Gather every .jsonl input file that the checkpoint does not list as done
jsonl_files = sorted(
    path
    for path in DATA_DIR.glob("*.jsonl")
    if path.name not in processed_files
)
print(f"Total files to process: {len(jsonl_files)}")

In [None]:
# Load previously saved results (if any) so this run extends them
embeddings_file = OUTPUT_DIR / "embeddings.npy"
metadata_file = OUTPUT_DIR / "metadata.jsonl"

if embeddings_file.exists() and metadata_file.exists():
    print("Loading existing data...")
    existing_embeddings = np.load(embeddings_file)
    with open(metadata_file, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing newline); json.loads('') raises
        existing_metadata = [json.loads(line) for line in f if line.strip()]

    # Row i of the array must belong to metadata entry i. A mismatch means a
    # previous run died between the two writes; continuing would silently
    # pair headlines with the wrong vectors.
    if len(existing_metadata) != existing_embeddings.shape[0]:
        raise ValueError(
            f"{embeddings_file} has {existing_embeddings.shape[0]} rows but "
            f"{metadata_file} has {len(existing_metadata)} entries; "
            "delete both files (and the checkpoint) and start over"
        )
    print(f"Loaded {len(existing_metadata)} existing entries")

    # The checkpoint is removed after a complete run, and it can also lag
    # behind the saved data after a crash. In both cases files whose
    # headlines are already stored would be embedded and appended a second
    # time, so drop them from the work list here. The 'person' field of each
    # metadata entry is the stem of the input file it came from.
    already_embedded = {meta.get('person') for meta in existing_metadata}
    pending_files = [path for path in jsonl_files if path.stem not in already_embedded]
    if len(pending_files) != len(jsonl_files):
        print(f"Skipping {len(jsonl_files) - len(pending_files)} files "
              "whose headlines are already stored")
    jsonl_files = pending_files
else:
    existing_embeddings = None
    existing_metadata = []
    print("Starting fresh (no existing data found)")

In [None]:
# Collect headlines and generate embeddings, one person (input file) at a time
for idx, file_path in enumerate(jsonl_files):
    person = extract_person_name(file_path)
    print(f"\n[{idx+1}/{len(jsonl_files)}] Processing: {person}")

    # Read this person's articles. Blank lines (e.g. a trailing newline)
    # are skipped because json.loads('') would raise.
    with open(file_path, 'r', encoding='utf-8') as f:
        articles = [json.loads(line) for line in f if line.strip()]

    # Extract the headline and metadata of every article that has a headline
    person_headlines = []
    person_metadata = []

    for article in articles:
        headline = article.get('headline', '')
        if not headline:
            continue
        # An article without a publication date must not abort the run;
        # record an empty pub_date for it instead of parsing ''.
        raw_date = article.get('webPublicationDate', '')
        person_headlines.append(headline)
        person_metadata.append({
            'person': person,
            'article_id': article.get('id', ''),
            'pub_date': parse_pub_date(raw_date) if raw_date else '',
            'headline': headline
        })

    # Embed this person's headlines
    if person_headlines:
        print(f"  → Generating embeddings for {len(person_headlines)} headlines...")
        person_embeddings = generate_embeddings(person_headlines, batch_size=BATCH_SIZE)

        # Append to everything accumulated so far
        if existing_embeddings is not None:
            combined_embeddings = np.vstack([existing_embeddings, person_embeddings])
        else:
            combined_embeddings = person_embeddings

        combined_metadata = existing_metadata + person_metadata

        # Persist immediately so an interrupted run loses at most one person
        np.save(embeddings_file, combined_embeddings)
        with open(metadata_file, 'w', encoding='utf-8') as f:
            for meta in combined_metadata:
                f.write(json.dumps(meta, ensure_ascii=False) + '\n')

        # Carry the accumulated state into the next iteration
        existing_embeddings = combined_embeddings
        existing_metadata = combined_metadata

        print(f"  ✓ Done! (Total: {len(combined_metadata)} headlines processed)")

    # Update the checkpoint (also for files that yielded no headlines, so
    # they are not re-read on resume). Sorted for a deterministic file.
    processed_files.add(file_path.name)
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'processed_files': sorted(processed_files)}, f)

print("\nAll processing complete!")
# existing_embeddings is still None when there was no earlier data and no
# file produced a headline; printing .shape would raise AttributeError.
if existing_embeddings is not None:
    print(f"Final embeddings shape: {existing_embeddings.shape}")
else:
    print("Final embeddings shape: (no embeddings were generated)")
print(f"Total headlines: {len(existing_metadata)}")

In [None]:
# Delete the checkpoint file if one is present
if CHECKPOINT_FILE.exists():
    os.remove(CHECKPOINT_FILE)
    print("Checkpoint file removed.")

print("\nAll done!")