In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
# 라이브러리 import
import json
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from datetime import datetime
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ===== GPU 분할 설정 (이 부분만 각 파일마다 다르게!) =====
GPU_ID = 0
START_IDX = 0
END_IDX = 100  # 25명씩 분할
# ======================================================

# 설정
MODEL_NAME = "jinaai/jina-embeddings-v3"
DATA_DIR = Path("guardian_top100_scraping")
OUTPUT_DIR = Path(f"vector_paragraphs_gpu{GPU_ID}")
BATCH_SIZE = 8
CHECKPOINT_FILE = OUTPUT_DIR / "checkpoint.json"

OUTPUT_DIR.mkdir(exist_ok=True)
print(f"GPU {GPU_ID}: Processing people {START_IDX} to {END_IDX-1}")

GPU 0: Processing people 0 to 99


In [4]:
def preprocess_text_first_last(text):
    """
    기사의 첫 문단과 마지막 문단만 추출하여 결합
    """
    if not text or text.strip() == '':
        return None
    
    # 줄바꿈을 기준으로 문단 분리 (공백 라인 제거)
    paragraphs = [line.strip() for line in text.split('\n') if line.strip()]
    
    if not paragraphs:
        return None
    
    if len(paragraphs) == 1:
        # 문단이 하나뿐이면 그것만 사용
        return paragraphs[0]
    else:
        # 첫 문단 + 공백 + 마지막 문단
        return f"{paragraphs[0]} {paragraphs[-1]}"

In [5]:
# Model과 Tokenizer load
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = model.cuda()
model.eval()
print("Model loaded successfully!")

Loading model...


`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention impl

Model loaded successfully!


In [6]:
def extract_person_name(filename):
    """파일명에서 person 이름 추출 (예: alex_morgan.jsonl -> alex_morgan)"""
    return filename.stem

def parse_pub_date(web_pub_date):
    """webPublicationDate를 YYYY_MM_DD 형식으로 변환"""
    dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
    return dt.strftime("%Y_%m_%d")

@torch.no_grad()
def generate_embeddings(texts, batch_size=32):
    """Batch 단위로 embedding 생성 (truncation 자동 처리)"""
    embeddings = []
    
    # Progress bar 추가
    for i in tqdm(range(0, len(texts), batch_size), desc="    Embedding batches", leave=False):
        batch = texts[i:i+batch_size]
        
        # Tokenize (truncation=True로 자동 처리)
        encoded = tokenizer(
            batch, 
            padding=True, 
            truncation=True,  # 8192 토큰 초과 시 자동 truncate
            max_length=8192, 
            return_tensors='pt'
        )
        encoded = {k: v.cuda() for k, v in encoded.items()}
        
        # Generate embeddings
        outputs = model(**encoded)
        # CLS token embedding 사용
        batch_embeddings = outputs.last_hidden_state[:, 0, :]
        
        # Normalize
        batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
        
        # BFloat16 -> Float32 변환 후 numpy로 변환
        batch_embeddings = batch_embeddings.to(torch.float32)
        
        embeddings.append(batch_embeddings.cpu().numpy())
    
    return np.vstack(embeddings)

def verify_lengths(person, embeddings_array, metadata_list):
    """Embeddings와 metadata 길이 확인"""
    emb_len = len(embeddings_array)
    meta_len = len(metadata_list)
    
    if emb_len == meta_len:
        print(f"  ✓ Length verification passed: {emb_len} entries")
        return True
    else:
        print(f"  ✗ Length mismatch! Embeddings: {emb_len}, Metadata: {meta_len}")
        return False

In [7]:
# Checkpoint 확인
processed_files = set()
if CHECKPOINT_FILE.exists():
    with open(CHECKPOINT_FILE, 'r') as f:
        checkpoint = json.load(f)
        processed_files = set(checkpoint.get('processed_files', []))
        print(f"Checkpoint found: {len(processed_files)} files already processed")

Checkpoint found: 87 files already processed


In [8]:
# 모든 .jsonl 파일 수집 및 GPU별 분할
all_files = sorted([f for f in DATA_DIR.glob("*.jsonl")])
jsonl_files = all_files[START_IDX:END_IDX]
jsonl_files = [f for f in jsonl_files if f.name not in processed_files]

print(f"Total files to process in this GPU: {len(jsonl_files)}")

Total files to process in this GPU: 13


In [9]:
# 데이터 수집 및 embedding 생성 (인물별로 처리)
for idx, file_path in enumerate(jsonl_files):
    person = extract_person_name(file_path)
    
    # 파일에서 기사 읽기
    with open(file_path, 'r', encoding='utf-8') as f:
        articles = [json.loads(line) for line in f]
    
    total_articles = len(articles)
    print(f"\n[{idx+1}/{len(jsonl_files)}] Processing: {person} (Total articles: {total_articles})")
    
    # 현재 인물의 body text와 metadata 추출
    person_texts = []
    person_metadata = []
    
    # Article 수집
    for article in articles:
        body_text = article.get('bodyText', '')
        article_id = article.get('id')
        pub_date_raw = article.get('webPublicationDate')
        
        if not all([body_text, article_id, pub_date_raw]):
            continue
        
        # 첫 문단 + 마지막 문단 추출
        processed_text = preprocess_text_first_last(body_text)
        
        if processed_text:
            person_texts.append(processed_text)
            person_metadata.append({
                'person': person,
                'article_id': article_id,
                'pub_date': parse_pub_date(pub_date_raw)
            })
    
    # 현재 인물의 embedding 생성
    if person_texts:
        print(f"  → Generating embeddings for {len(person_texts)} valid articles...")
        person_embeddings = generate_embeddings(person_texts, batch_size=BATCH_SIZE)
        
        # 길이 검증
        if not verify_lengths(person, person_embeddings, person_metadata):
            print(f"  ⚠️  Warning: Skipping save due to length mismatch")
            continue
        
        # 개별 파일로 저장
        person_emb_file = OUTPUT_DIR / f"{person}_embeddings.npy"
        person_meta_file = OUTPUT_DIR / f"{person}_metadata.jsonl"
        
        np.save(person_emb_file, person_embeddings)
        with open(person_meta_file, 'w', encoding='utf-8') as f:
            for meta in person_metadata:
                f.write(json.dumps(meta, ensure_ascii=False) + '\n')
        
        print(f"  ✓ Saved: {person_emb_file.name}, {person_meta_file.name}")
    
    # Checkpoint 업데이트
    processed_files.add(file_path.name)
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'processed_files': list(processed_files)}, f)

print(f"\nAll processing complete for GPU {GPU_ID}!")


[1/13] Processing: sadiq_khan (Total articles: 4420)
  → Generating embeddings for 4420 valid articles...


                                                                      

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 23.53 GiB of which 4.24 GiB is free. Including non-PyTorch memory, this process has 19.28 GiB memory in use. Of the allocated memory 17.65 GiB is allocated by PyTorch, and 1.18 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Checkpoint 삭제
if CHECKPOINT_FILE.exists():
    CHECKPOINT_FILE.unlink()
    print("Checkpoint file removed.")

print("\nAll done!")

In [None]:
# # ========== 모든 GPU 결과 합치기 (주석 처리) ==========
# print("Merging results from all GPUs...\n")

# all_embeddings = []
# all_metadata = []

# # GPU 개수에 맞게 수정
# for gpu_id in range(4):  # 예: 4개 GPU 사용
#     gpu_dir = Path(f"vector_paragraphs_gpu{gpu_id}")
#     
#     # 해당 GPU 디렉토리의 모든 person 파일 수집
#     person_emb_files = sorted(gpu_dir.glob("*_embeddings.npy"))
#     
#     if not person_emb_files:
#         print(f"⚠️  GPU {gpu_id}: No files found, skipping...")
#         continue
#     
#     print(f"GPU {gpu_id}: Found {len(person_emb_files)} people")
#     
#     for emb_file in person_emb_files:
#         person = emb_file.stem.replace('_embeddings', '')
#         meta_file = gpu_dir / f"{person}_metadata.jsonl"
#         
#         if not meta_file.exists():
#             print(f"  ⚠️  {person}: metadata file not found, skipping...")
#             continue
#         
#         # Load embeddings
#         emb = np.load(emb_file)
#         all_embeddings.append(emb)
#         
#         # Load metadata
#         with open(meta_file, 'r', encoding='utf-8') as f:
#             for line in f:
#                 all_metadata.append(json.loads(line))
#     
#     print(f"  ✓ GPU {gpu_id}: Loaded {sum(len(e) for e in all_embeddings[-len(person_emb_files):])} total entries")

# # 합치기
# if all_embeddings:
#     final_embeddings = np.vstack(all_embeddings)
#     
#     # 길이 검증
#     print(f"\nFinal verification:")
#     print(f"  Embeddings shape: {final_embeddings.shape}")
#     print(f"  Metadata entries: {len(all_metadata)}")
#     
#     if final_embeddings.shape[0] != len(all_metadata):
#         print(f"  ✗ Length mismatch! Not saving.")
#     else:
#         # 최종 결과 저장
#         final_dir = Path("vector_paragraphs")
#         final_dir.mkdir(exist_ok=True)
#         
#         np.save(final_dir / "embeddings.npy", final_embeddings)
#         with open(final_dir / "metadata.jsonl", 'w', encoding='utf-8') as f:
#             for meta in all_metadata:
#                 f.write(json.dumps(meta, ensure_ascii=False) + '\n')
#         
#         print(f"\n{'='*50}")
#         print(f"✅ Merge complete!")
#         print(f"Final embeddings shape: {final_embeddings.shape}")
#         print(f"Total articles: {len(all_metadata)}")
#         print(f"Saved to: {final_dir}/")
#         print(f"{'='*50}")
# else:
#     print("\n❌ No data found to merge!")