In [None]:
import os
# Limit the CUDA devices visible to this process to device index 1.
# This assignment runs before torch is imported in the next cell.
# NOTE(review): presumably so the restriction is in place before CUDA is
# initialised — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
# Library imports
import json
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configuration
# Model identifier passed to AutoTokenizer / AutoModel.from_pretrained below.
MODEL_NAME = "BAAI/bge-large-en-v1.5"
# Directory that is globbed for the per-person *.jsonl input files.
DATA_DIR = Path("../guardian_top100_scraping")
# Directory that receives embeddings.npy, metadata.jsonl and the checkpoint.
OUTPUT_DIR = Path("vector_chunking")
# Number of chunks encoded per forward pass (passed to generate_embeddings).
BATCH_SIZE = 64
# JSON file listing the input file names that have already been processed.
CHECKPOINT_FILE = OUTPUT_DIR / "checkpoint.json"

# Chunking configuration
CHUNK_SIZE = 10  # ten sentences per chunk
OVERLAP = 3      # three sentences shared between consecutive chunks

# Create the output directory if it does not exist yet (parents are not created).
OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
# Load the tokenizer and the encoder named by MODEL_NAME; the encoder is moved
# to the GPU and switched to evaluation mode in one chained expression
# (both .cuda() and .eval() return the module itself).
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).cuda().eval()
print("Model loaded successfully!")

Loading model...
Model loaded successfully!


In [None]:
def extract_person_name(filename):
    """Return the person name encoded in a data file's name.

    Args:
        filename: Location of a per-person data file, given either as a
            ``pathlib.Path`` or as a plain string.

    Returns:
        The final path component with its last suffix removed
        (e.g. ``"ryan_murphy"`` for ``"data/ryan_murphy.jsonl"``).
    """
    # Coerce through Path so that plain string paths work as well as Path objects.
    return Path(filename).stem

def parse_pub_date(web_pub_date):
    """Convert a webPublicationDate value to a ``YYYY_MM_DD`` string.

    Args:
        web_pub_date: ISO-8601 timestamp string; a ``Z`` UTC designator is
            accepted. May be empty or ``None`` when the article has no date.

    Returns:
        The date formatted as ``YYYY_MM_DD``, or ``""`` when no timestamp
        was provided.

    Raises:
        ValueError: If a non-empty value is not a valid ISO-8601 timestamp.
    """
    # A missing date must not abort the caller; report it as an empty string.
    if not web_pub_date:
        return ""
    # Rewrite 'Z' as an explicit offset: fromisoformat() only accepts the
    # 'Z' suffix natively from Python 3.11 onwards.
    dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
    return dt.strftime("%Y_%m_%d")

def split_into_sentences(text):
    """Split text into sentences, using the two-character sequence '. ' as the separator.

    Each fragment is stripped of surrounding whitespace and fragments that
    are empty after stripping are dropped. The separator itself is removed,
    so only the last fragment can keep a trailing period.
    """
    sentences = []
    for fragment in text.split('. '):
        cleaned = fragment.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences

def create_chunks(sentences, chunk_size=2, overlap=1):
    """Group sentences into overlapping text chunks.

    A window of ``chunk_size`` sentences slides over ``sentences`` in steps
    of ``chunk_size - overlap``. The sentences in a window are joined with
    ``'. '`` and the chunk is closed with a period unless it already ends
    with sentence-final punctuation.

    Args:
        sentences: List of sentence strings.
        chunk_size: Number of sentences per chunk; must be positive.
        overlap: Number of sentences shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings; empty when ``sentences`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would give a zero or negative step: range() then
    # either raises an unrelated error or silently yields no chunks at all.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(sentences), step):
        chunk_text = '. '.join(sentences[start:start + chunk_size])
        # The last sentence of a text usually keeps its own terminator;
        # do not double it ("... end..").
        if not chunk_text.endswith(('.', '!', '?')):
            chunk_text += '.'
        chunks.append(chunk_text)

        # The window already reaches the end of the text; any further window
        # would only repeat a suffix of this one.
        if start + chunk_size >= len(sentences):
            break

    return chunks

@torch.no_grad()
def generate_embeddings(texts, batch_size=32):
    """Encode texts in batches and return L2-normalised embeddings.

    Uses the module-level ``tokenizer`` and ``model``. For every text the
    hidden state at sequence position 0 (the CLS token) is taken as the
    embedding and normalised to unit L2 norm.

    Args:
        texts: Non-empty list of strings to encode.
        batch_size: Number of texts per forward pass.

    Returns:
        ``numpy.ndarray`` with one row per input text, in input order.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    if len(texts) == 0:
        # np.vstack([]) would fail with a far less helpful message.
        raise ValueError("generate_embeddings() needs at least one text")

    # Send the inputs to wherever the model lives instead of assuming CUDA,
    # so the function also works when the model is on the CPU.
    device = next(model.parameters()).device
    embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize; inputs longer than 512 tokens are truncated.
        encoded = tokenizer(batch, padding=True, truncation=True,
                            max_length=512, return_tensors='pt')
        encoded = {k: v.to(device) for k, v in encoded.items()}

        outputs = model(**encoded)
        # Use the CLS token (position 0) hidden state as the text embedding.
        batch_embeddings = outputs.last_hidden_state[:, 0, :]

        # L2-normalise each row.
        batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)

        embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(embeddings)

def mean_pool_embeddings(embeddings):
    """Mean-pool several chunk embeddings into one vector (average over axis 0)."""
    stacked = np.asanyarray(embeddings)
    return stacked.mean(axis=0)

In [None]:
# Check for a checkpoint: it lists the names of the input files that were
# already processed, so they can be excluded from this run.
processed_files = set()
if CHECKPOINT_FILE.exists():
    checkpoint = json.loads(CHECKPOINT_FILE.read_text())
    processed_files = set(checkpoint.get('processed_files', []))
    print(f"Checkpoint found: {len(processed_files)} files already processed")

Checkpoint found: 84 files already processed


In [None]:
# Collect every .jsonl file in DATA_DIR whose name is not listed in the
# checkpoint, in sorted order.
jsonl_files = sorted(
    path
    for path in DATA_DIR.glob("*.jsonl")
    if path.name not in processed_files
)
print(f"Total files to process: {len(jsonl_files)}")

Total files to process: 16


In [None]:
# Load previously saved results (if any) so that new embeddings are appended
# to them instead of replacing them.
embeddings_file = OUTPUT_DIR / "embeddings.npy"
metadata_file = OUTPUT_DIR / "metadata.jsonl"

if embeddings_file.exists() and metadata_file.exists():
    print("Loading existing data...")
    existing_embeddings = np.load(embeddings_file)
    with open(metadata_file, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a stray trailing newline) instead of failing
        # inside json.loads.
        existing_metadata = [json.loads(line) for line in f if line.strip()]
    # Row i of the embedding array belongs to metadata entry i. Appending to a
    # pair that is already out of step would silently misalign every later row.
    if len(existing_embeddings) != len(existing_metadata):
        raise ValueError(
            f"{embeddings_file} has {len(existing_embeddings)} rows but "
            f"{metadata_file} has {len(existing_metadata)} entries; "
            "the saved files are out of sync"
        )
    print(f"Loaded {len(existing_metadata)} existing entries")
else:
    existing_embeddings = None
    existing_metadata = []
    print("Starting fresh (no existing data found)")

Loading existing data...
Loaded 363740 existing entries


In [None]:
# Collect the data and generate embeddings, one person (input file) at a time.
# After each person the combined embeddings and metadata are written to disk
# and the checkpoint is updated, so an interrupted run can be resumed.

# Persons that already have rows in the saved metadata. The checkpoint alone
# cannot be trusted: it may have been deleted, or the run may have stopped
# after the data was saved but before the checkpoint was written. Processing
# such a person again would append duplicate rows.
embedded_persons = {meta.get('person') for meta in existing_metadata}

for idx, file_path in enumerate(jsonl_files):
    person = extract_person_name(file_path)
    print(f"\n[{idx+1}/{len(jsonl_files)}] Processing: {person}")

    # Per-person accumulators: one embedding and one metadata dict per article.
    person_embeddings = []
    person_metadata = []

    if person in embedded_persons:
        print("  - Already present in saved metadata; skipping to avoid duplicate rows")
        articles = []
    else:
        # Read the articles (one JSON object per line), ignoring blank lines.
        with open(file_path, 'r', encoding='utf-8') as f:
            articles = [json.loads(line) for line in f if line.strip()]

    for article in articles:
        body_text = article.get('bodyText', '')
        if not body_text:
            continue

        # Split the body into sentences.
        sentences = split_into_sentences(body_text)
        if not sentences:
            continue

        # Build overlapping sentence chunks.
        chunks = create_chunks(sentences, chunk_size=CHUNK_SIZE, overlap=OVERLAP)
        if not chunks:
            continue

        # Embed every chunk.
        chunk_embeddings = generate_embeddings(chunks, batch_size=BATCH_SIZE)

        # Mean pooling: average all chunk embeddings of the article.
        article_embedding = mean_pool_embeddings(chunk_embeddings)

        # An article without a publication date must not abort the whole run;
        # record an empty date instead.
        raw_pub_date = article.get('webPublicationDate', '')

        person_embeddings.append(article_embedding)
        person_metadata.append({
            'person': person,
            'article_id': article.get('id', ''),
            'pub_date': parse_pub_date(raw_pub_date) if raw_pub_date else ''
        })

    # Save the results for the current person.
    if person_embeddings:
        person_embeddings_array = np.array(person_embeddings)

        # Append to the data accumulated so far.
        if existing_embeddings is not None:
            combined_embeddings = np.vstack([existing_embeddings, person_embeddings_array])
        else:
            combined_embeddings = person_embeddings_array

        combined_metadata = existing_metadata + person_metadata

        # Write immediately; both files are rewritten in full.
        np.save(embeddings_file, combined_embeddings)
        with open(metadata_file, 'w', encoding='utf-8') as f:
            for meta in combined_metadata:
                f.write(json.dumps(meta, ensure_ascii=False) + '\n')

        # Carry the combined state into the next iteration.
        existing_embeddings = combined_embeddings
        existing_metadata = combined_metadata
        embedded_persons.add(person)

        print(f"  ✓ Done! Processed {len(person_metadata)} articles (Total: {len(combined_metadata)} articles)")

    # Update the checkpoint, also for files that produced no embeddings.
    processed_files.add(file_path.name)
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'processed_files': list(processed_files)}, f)

print(f"\nAll processing complete!")
# existing_embeddings is still None when nothing was loaded and nothing was produced.
if existing_embeddings is not None:
    print(f"Final embeddings shape: {existing_embeddings.shape}")
else:
    print("Final embeddings shape: (no embeddings were produced)")
print(f"Total articles: {len(existing_metadata)}")

# alex morgan -> 9m 30s


[1/16] Processing: ryan_coogler
  ✓ Done! Processed 7441 articles (Total: 371181 articles)

[2/16] Processing: ryan_murphy
  ✓ Done! Processed 10452 articles (Total: 381633 articles)

[3/16] Processing: ryan_reynolds
  ✓ Done! Processed 8466 articles (Total: 390099 articles)

[4/16] Processing: sadiq_khan
  ✓ Done! Processed 4420 articles (Total: 394519 articles)

[5/16] Processing: samantha_bee
  ✓ Done! Processed 2264 articles (Total: 396783 articles)

[6/16] Processing: sandra_day_oconnor
  ✓ Done! Processed 3663 articles (Total: 400446 articles)

[7/16] Processing: sean_hannity
  ✓ Done! Processed 5331 articles (Total: 405777 articles)

[8/16] Processing: spike_lee
  ✓ Done! Processed 2341 articles (Total: 408118 articles)

[9/16] Processing: sterling_brown
  ✓ Done! Processed 4010 articles (Total: 412128 articles)

[10/16] Processing: taylor_swift
  ✓ Done! Processed 3762 articles (Total: 415890 articles)

[11/16] Processing: theresa_may
  ✓ Done! Processed 22164 articles (Total:

In [None]:
# Remove the checkpoint file.
# NOTE(review): embeddings.npy and metadata.jsonl stay on disk after this.
# Without the checkpoint, the checkpoint-loading cell starts with an empty
# processed_files set, so a later Run All selects every input file again while
# the existing data is still loaded — confirm that this cannot append
# duplicate rows.
if CHECKPOINT_FILE.exists():
    CHECKPOINT_FILE.unlink()
    print("Checkpoint file removed.")

print("\nAll done!")

Checkpoint file removed.

All done!
