In [9]:
# Expose only GPU index 2 to this process. This cell runs first, ahead of the
# cell that imports torch.
import os

os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

In [10]:
# 필요한 라이브러리 import
import json
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from datetime import datetime

In [11]:
# Configuration
MODEL_NAME = "BAAI/bge-large-en-v1.5"  # model id passed to from_pretrained
BATCH_SIZE = 32  # number of headlines per forward pass

DATA_DIR = Path("guardian_top100_scraping")  # input directory of *.jsonl files
OUTPUT_DIR = Path("vector_headlines")  # receives embeddings, metadata, checkpoint
OUTPUT_DIR.mkdir(exist_ok=True)

# Lists the input files that have already been processed.
CHECKPOINT_FILE = OUTPUT_DIR.joinpath("checkpoint.json")

In [12]:
# Load the tokenizer and the model, move the model to the GPU, and put it in
# evaluation mode.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).cuda().eval()
print("Model loaded successfully!")

Loading model...
Model loaded successfully!


In [13]:
def extract_person_name(filename):
    """Return the person identifier encoded in a data file name.

    The identifier is the final path component without its last suffix,
    e.g. ``alex_morgan.jsonl`` -> ``alex_morgan``.

    Args:
        filename: Path of the file, as a ``pathlib.Path`` or a plain string.

    Returns:
        The file stem as a string.
    """
    # Coerce through Path so plain strings work as well as Path objects.
    return Path(filename).stem

def parse_pub_date(web_pub_date):
    """Convert a ``webPublicationDate`` value to ``YYYY_MM_DD``.

    Args:
        web_pub_date: ISO-8601 timestamp string such as
            ``"2023-01-05T10:30:00Z"``. May be empty or ``None`` when the
            article record carries no publication date.

    Returns:
        The date part formatted as ``YYYY_MM_DD``, or ``""`` when no
        timestamp was given.

    Raises:
        ValueError: If a non-empty value is not a valid ISO-8601 timestamp.
    """
    # Callers pass '' for records without a date; report "unknown" instead of
    # letting fromisoformat('') raise.
    if not web_pub_date:
        return ""
    # Rewrite the "Z" (UTC) suffix as an explicit offset before parsing.
    dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
    return dt.strftime("%Y_%m_%d")

@torch.no_grad()
def generate_embeddings(texts, batch_size=32):
    """Embed texts in batches using the module-level ``tokenizer`` and ``model``.

    Each batch is tokenized (padded, truncated to 512 tokens), passed through
    the model, and every text is represented by the L2-normalized last hidden
    state of its first ([CLS]) token.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A NumPy array with one row per input text. For empty input the array
        has zero rows and ``model.config.hidden_size`` columns.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    texts = list(texts)
    if not texts:
        # np.vstack([]) would raise; return a well-formed empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Send inputs to whichever device the model lives on rather than assuming CUDA.
    device = next(model.parameters()).device

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]

        # Tokenize
        encoded = tokenizer(batch, padding=True, truncation=True,
                            max_length=512, return_tensors='pt')
        encoded = {k: v.to(device) for k, v in encoded.items()}

        # Forward pass; keep the hidden state at sequence position 0 ([CLS]).
        outputs = model(**encoded)
        batch_embeddings = outputs.last_hidden_state[:, 0, :]

        # L2-normalize each row.
        batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)

        embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(embeddings)

In [14]:
# Resume support: read the names of the files recorded in the checkpoint, if any.
processed_files = set()
if CHECKPOINT_FILE.exists():
    checkpoint = json.loads(CHECKPOINT_FILE.read_text())
    processed_files.update(checkpoint.get('processed_files', []))
    print(f"Checkpoint found: {len(processed_files)} files already processed")

In [15]:
# Gather, in sorted order, every .jsonl input that is not listed in the checkpoint.
jsonl_files = sorted(
    filter(lambda path: path.name not in processed_files, DATA_DIR.glob("*.jsonl"))
)
print(f"Total files to process: {len(jsonl_files)}")

Total files to process: 100


In [16]:
# Collect headlines and generate embeddings, one person (input file) at a time.
#
# The checkpoint only records file *names*. To make resuming safe, the
# embeddings and metadata of every finished file are also written to PARTS_DIR
# before the checkpoint is updated, and a resumed run reloads those parts.
# Without this, files skipped on resume would silently be missing from the
# final output. Parts are only read for files named in the checkpoint.
PARTS_DIR = OUTPUT_DIR / "parts"
PARTS_DIR.mkdir(parents=True, exist_ok=True)

all_embeddings = []
all_metadata = []


def _part_paths(person):
    """Return the (embeddings .npy, metadata .jsonl) part paths for a person."""
    return PARTS_DIR / f"{person}.npy", PARTS_DIR / f"{person}.jsonl"


def _load_part(person):
    """Load the saved part of one person.

    Returns:
        ``(embeddings, metadata)`` when a usable part exists; ``embeddings``
        is ``None`` and ``metadata`` is empty for a person without headlines.
        ``None`` when the part is missing or inconsistent.
    """
    emb_path, meta_path = _part_paths(person)
    if not meta_path.exists():
        return None
    with open(meta_path, 'r', encoding='utf-8') as f:
        metadata = [json.loads(line) for line in f if line.strip()]
    if not metadata:
        return (None, [])
    if not emb_path.exists():
        return None
    part_embeddings = np.load(emb_path)
    if part_embeddings.shape[0] != len(metadata):
        return None
    return (part_embeddings, metadata)


# Restore the results of files finished by an earlier, interrupted run.
requeued_files = []
for name in sorted(processed_files):
    part = _load_part(Path(name).stem)
    if part is None:
        # Checkpointed but no usable part on disk: the file has to be redone.
        processed_files.discard(name)
        source_path = DATA_DIR / name
        if source_path.exists():
            requeued_files.append(source_path)
        continue
    part_embeddings, part_metadata = part
    if part_metadata:
        all_embeddings.append(part_embeddings)
        all_metadata.extend(part_metadata)

if processed_files:
    print(f"Restored {len(all_metadata)} headlines from {len(processed_files)} checkpointed files")

# Everything still to do: the collected files plus any re-queued ones, minus
# files whose results were just restored.
pending_files = sorted({
    path for path in [*jsonl_files, *requeued_files]
    if path.name not in processed_files
})

for idx, file_path in enumerate(pending_files):
    person = extract_person_name(file_path)
    print(f"\n[{idx+1}/{len(pending_files)}] Processing: {person}")

    # Read the articles, one JSON object per line; blank lines are skipped.
    with open(file_path, 'r', encoding='utf-8') as f:
        articles = [json.loads(line) for line in f if line.strip()]

    # Extract the headline and metadata of every article that has a headline.
    person_headlines = []
    person_metadata = []

    for article in articles:
        headline = article.get('headline', '')
        if not headline:
            continue
        raw_pub_date = article.get('webPublicationDate', '')
        person_headlines.append(headline)
        person_metadata.append({
            'person': person,
            'article_id': article.get('id', ''),
            # A missing date is stored as '' instead of aborting the run.
            'pub_date': parse_pub_date(raw_pub_date) if raw_pub_date else '',
            'headline': headline
        })

    # Embed this person's headlines and persist the part.
    emb_path, meta_path = _part_paths(person)
    if person_headlines:
        print(f"  → Generating embeddings for {len(person_headlines)} headlines...")
        person_embeddings = generate_embeddings(person_headlines, batch_size=BATCH_SIZE)
        all_embeddings.append(person_embeddings)
        all_metadata.extend(person_metadata)
        np.save(emb_path, person_embeddings)
        print(f"  ✓ Done! (Total: {len(all_metadata)} headlines processed)")
    # The metadata part is written even when empty; it marks the part as complete.
    with open(meta_path, 'w', encoding='utf-8') as f:
        for meta in person_metadata:
            f.write(json.dumps(meta, ensure_ascii=False) + '\n')

    # Update the checkpoint last, via a temporary file, so an interruption
    # cannot leave a half-written checkpoint behind.
    processed_files.add(file_path.name)
    tmp_checkpoint = CHECKPOINT_FILE.with_name(CHECKPOINT_FILE.name + ".tmp")
    with open(tmp_checkpoint, 'w') as f:
        json.dump({'processed_files': sorted(processed_files)}, f)
    tmp_checkpoint.replace(CHECKPOINT_FILE)

# Combine all per-person embeddings into one matrix.
print(f"\nCombining all embeddings...")
if not all_embeddings:
    raise ValueError(f"No headlines found in {DATA_DIR}; nothing to combine")
embeddings = np.vstack(all_embeddings)
assert embeddings.shape[0] == len(all_metadata), "embeddings and metadata are out of sync"
print(f"Final embeddings shape: {embeddings.shape}")


[1/100] Processing: alex_morgan
  → Generating embeddings for 5833 headlines...
  ✓ Done! (Total: 5833 headlines processed)

[2/100] Processing: alicia_keys
  → Generating embeddings for 2545 headlines...
  ✓ Done! (Total: 8378 headlines processed)

[3/100] Processing: andres_manuel_lopez
  → Generating embeddings for 3247 headlines...
  ✓ Done! (Total: 11625 headlines processed)

[4/100] Processing: ann_mckee
  → Generating embeddings for 2467 headlines...
  ✓ Done! (Total: 14092 headlines processed)

[5/100] Processing: ashley_graham
  → Generating embeddings for 5074 headlines...
  ✓ Done! (Total: 19166 headlines processed)

[6/100] Processing: barbara_lynch
  → Generating embeddings for 3256 headlines...
  ✓ Done! (Total: 22422 headlines processed)

[7/100] Processing: barbara_rae_venter
  → Generating embeddings for 2635 headlines...
  ✓ Done! (Total: 25057 headlines processed)

[8/100] Processing: barry_jenkins
  → Generating embeddings for 6003 headlines...
  ✓ Done! (Total: 31

In [17]:
# Persist the final artifacts.
print("\nSaving results...")

# Embedding matrix -> embeddings.npy
embeddings_path = OUTPUT_DIR / "embeddings.npy"
np.save(embeddings_path, embeddings)
print(f"Saved: {embeddings_path}")

# Metadata, one JSON object per line -> metadata.jsonl
metadata_path = OUTPUT_DIR / "metadata.jsonl"
with open(metadata_path, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(meta, ensure_ascii=False) + '\n' for meta in all_metadata
    )
print(f"Saved: {metadata_path}")

# The run is complete, so the checkpoint is removed.
if CHECKPOINT_FILE.exists():
    CHECKPOINT_FILE.unlink()

print("\nAll done!")


Saving results...
Saved: vector_headlines/embeddings.npy
Saved: vector_headlines/metadata.jsonl

All done!
