**Cell 0: Google Drive 마운트**

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# /content/drive/MyDrive/... paths used by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

**Cell 1: 환경 설정 및 라이브러리 임포트**

In [None]:
import os
import json
import numpy as np
from pathlib import Path
from transformers import AutoModel
import torch
from datetime import datetime
from tqdm.auto import tqdm
import logging

# Logging: timestamp - level - message, INFO and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Device selection: use CUDA when it is available, otherwise fall back to CPU
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print(f"Using device: {DEVICE}")

**Cell 2: 경로 및 모델 설정**

In [None]:
# === Configuration ===
# Hugging Face model id, input directory of scraped *.jsonl files, and the
# output directory that receives embeddings, metadata and the checkpoint.
MODEL_NAME = "jinaai/jina-embeddings-v3"
DATA_DIR = Path("/content/drive/MyDrive/COSE362/data/guardian_top100_scraping")
OUTPUT_DIR = Path("/content/drive/MyDrive/COSE362/data/vector_paragraphs")
CHECKPOINT_FILE = OUTPUT_DIR.joinpath("checkpoint.json")

# Batch size passed to the encoder (Jina v3, 1024-dimensional embeddings)
BATCH_SIZE = 16

# Create the output directory (and any missing parents) up front
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
print(f"Output Directory: {OUTPUT_DIR}")

**Cell 3: 모델 로드**

In [None]:
# Load the embedding model, move it to the selected device, and switch it to
# inference mode.
print(f"Loading model: {MODEL_NAME} ...")

# trust_remote_code=True is required for this model; it lets transformers run
# the custom modelling code shipped in the model repository.
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).to(DEVICE)
model.eval()
print("Model loaded successfully!")

**Cell 4: 헬퍼 함수 정의**

In [None]:
def extract_person_name(filename):
    """Return the person name encoded in a data file's name.

    The name is the file name without its final suffix
    (e.g. ``Joe_Biden.jsonl`` -> ``Joe_Biden``).

    Args:
        filename: Path to the file, as a ``pathlib.Path`` or a plain string.

    Returns:
        The stem of the file name.
    """
    # Wrapping in Path() makes string paths work too; Path inputs are unchanged.
    return Path(filename).stem

def parse_pub_date(web_pub_date):
    """Convert an ISO-8601 ``webPublicationDate`` string to ``YYYY_MM_DD``.

    Args:
        web_pub_date: Timestamp string such as ``"2023-05-17T10:30:00Z"``.

    Returns:
        The date part formatted as ``YYYY_MM_DD``, or ``None`` when the value
        is not a string or cannot be parsed as an ISO-8601 timestamp.
    """
    if not isinstance(web_pub_date, str):
        return None
    try:
        # Rewrite a trailing 'Z' (UTC) as an explicit offset so fromisoformat()
        # accepts it on Python versions that do not understand the 'Z' suffix.
        dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
        return dt.strftime("%Y_%m_%d")
    except ValueError:
        return None

def preprocess_text_first_last(text):
    """Reduce an article body to its first and last paragraphs.

    Paragraphs are the non-blank lines of ``text`` (split on newlines), each
    stripped of surrounding whitespace.

    Returns:
        ``None`` when ``text`` is empty or contains no non-blank line; the
        single paragraph when there is only one; otherwise the first and last
        paragraphs joined by one space.
    """
    if not text:
        return None

    stripped_lines = (segment.strip() for segment in text.split('\n'))
    kept = [segment for segment in stripped_lines if segment]
    if not kept:
        return None

    head, tail = kept[0], kept[-1]
    # With a single paragraph, head and tail are the same element: return it once.
    return head if len(kept) == 1 else f"{head} {tail}"

@torch.no_grad()
def generate_embeddings_jina(texts, batch_size=16):
    """Embed ``texts`` with the globally loaded model.

    Calls ``model.encode`` with ``task="retrieval.passage"`` and the progress
    bar disabled, with gradient tracking turned off.

    Args:
        texts: The texts to embed.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns (the caller treats it as a NumPy
        array with one row per text — TODO confirm against the model's API).
    """
    encode_options = {
        "task": "retrieval.passage",
        "batch_size": batch_size,
        "show_progress_bar": False,
    }
    return model.encode(texts, **encode_options)

**Cell 5: 체크포인트 및 데이터 정합성 검사**

In [None]:
# 1. Load the checkpoint: names of the input files that are already embedded.
processed_files = set()
if CHECKPOINT_FILE.exists():
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            checkpoint = json.load(f)
        processed_files = set(checkpoint.get('processed_files', []))
        print(f"Checkpoint found: {len(processed_files)} files already processed")
    except (json.JSONDecodeError, AttributeError, TypeError):
        # Invalid JSON, or valid JSON that does not have the expected
        # {"processed_files": [...]} shape.
        print("Checkpoint corrupted. Starting fresh.")
        processed_files = set()

# 2. Verify that the existing outputs are consistent with each other.
embeddings_file = OUTPUT_DIR / "embeddings.npy"
metadata_file = OUTPUT_DIR / "metadata.jsonl"


def _reset_outputs():
    """Delete whichever output files exist so the run starts from scratch."""
    for stale_file in (embeddings_file, metadata_file):
        if stale_file.exists():
            os.remove(stale_file)


if embeddings_file.exists() and metadata_file.exists():
    print("Checking existing data consistency...")
    try:
        # Count metadata rows and, in the same pass, collect the 'person'
        # values so already-embedded input files can be recognised even when
        # the checkpoint is missing or incomplete.
        meta_count = 0
        persons_in_metadata = set()
        with open(metadata_file, 'r', encoding='utf-8') as f:
            for line in f:
                meta_count += 1
                try:
                    row = json.loads(line)
                except json.JSONDecodeError:
                    # Still counted as a row; it just cannot name a person.
                    continue
                if isinstance(row, dict) and isinstance(row.get('person'), str):
                    persons_in_metadata.add(row['person'])

        # Load the embeddings only to count the rows
        temp_emb = np.load(embeddings_file)
        emb_count = len(temp_emb)
        del temp_emb  # free the memory

        if emb_count == meta_count:
            print(f"Data is consistent. {meta_count} rows loaded. Resuming...")
            # 'person' is the input file's stem and inputs are *.jsonl files,
            # so every person present in the metadata maps back to an input
            # file that must not be embedded and appended a second time.
            recovered = {f"{name}.jsonl" for name in persons_in_metadata} - processed_files
            if recovered:
                print(f"Recovered {len(recovered)} processed files from metadata "
                      "that the checkpoint did not list.")
                processed_files |= recovered
        else:
            print(f"Mismatch detected! Emb: {emb_count}, Meta: {meta_count}. Resetting data.")
            _reset_outputs()
            processed_files = set()

    except Exception as e:
        print(f"Error checking data: {e}. Resetting.")
        _reset_outputs()
        processed_files = set()

elif embeddings_file.exists() or metadata_file.exists():
    # Only one of the two files exists: the pair is broken, so delete it.
    print("File pair incomplete. Deleting and starting fresh.")
    _reset_outputs()
    processed_files = set()

elif processed_files:
    # The checkpoint claims progress but there is no output data at all;
    # trusting it would silently skip those input files.
    print("Checkpoint lists processed files but no output data exists. Starting fresh.")
    processed_files = set()

# Build the list of input files that still need processing
jsonl_files = sorted(f for f in DATA_DIR.glob("*.jsonl") if f.name not in processed_files)
print(f"Total files to process: {len(jsonl_files)}")

**Cell 6: 메인 루프 (실행 및 저장)**

In [None]:
def _append_outputs(new_embeddings, new_metadata):
    """Append one input file's embeddings and metadata rows to the outputs.

    The combined embeddings are first written to a temporary file and only
    swapped into place after the metadata rows are appended. If any step
    fails, the metadata file is restored to its previous size and the
    temporary file is removed, so the two outputs keep the same row count.
    """
    tmp_embeddings = embeddings_file.with_name("embeddings.tmp.npy")
    meta_size_before = metadata_file.stat().st_size if metadata_file.exists() else None

    if embeddings_file.exists():
        current_emb = np.load(embeddings_file)
        updated_emb = np.vstack([current_emb, new_embeddings])
        del current_emb  # release the old array before writing
    else:
        updated_emb = new_embeddings

    try:
        np.save(tmp_embeddings, updated_emb)
        with open(metadata_file, 'a', encoding='utf-8') as f:
            f.writelines(json.dumps(meta, ensure_ascii=False) + '\n' for meta in new_metadata)
        os.replace(tmp_embeddings, embeddings_file)
    except BaseException:
        # BaseException so that interrupting the cell also rolls back.
        if meta_size_before is None:
            if metadata_file.exists():
                os.remove(metadata_file)
        else:
            os.truncate(metadata_file, meta_size_before)
        if tmp_embeddings.exists():
            os.remove(tmp_embeddings)
        raise


def _save_checkpoint(done_names):
    """Write the checkpoint via a temporary file so it is never left half-written."""
    tmp_checkpoint = CHECKPOINT_FILE.with_name(CHECKPOINT_FILE.name + ".tmp")
    with open(tmp_checkpoint, 'w') as f:
        json.dump({'processed_files': sorted(done_names)}, f)
    os.replace(tmp_checkpoint, CHECKPOINT_FILE)


for file_path in tqdm(jsonl_files, desc="Processing Files"):
    person = extract_person_name(file_path)

    # 1. Read the file (one JSON object per line; blank lines are skipped)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            articles = [json.loads(line) for line in f if line.strip()]
    except (OSError, ValueError) as e:
        # ValueError covers both JSON and text-decoding errors
        print(f"Error reading {file_path}: {e}")
        continue

    valid_texts = []
    valid_metadata = []

    # 2. Preprocess (first paragraph + last paragraph)
    for article in articles:
        if not isinstance(article, dict):
            continue

        body_text = article.get('bodyText', '')
        article_id = article.get('id')
        pub_date_raw = article.get('webPublicationDate')

        # All three fields are required; bodyText must be text to be split
        if not (isinstance(body_text, str) and body_text and article_id and pub_date_raw):
            continue

        processed_text = preprocess_text_first_last(body_text)
        if not processed_text:
            continue

        valid_texts.append(processed_text)
        valid_metadata.append({
            'person': person,
            'article_id': article_id,
            'pub_date': parse_pub_date(pub_date_raw)
        })

    # 3. Generate the embeddings and append them to the outputs
    if valid_texts:
        try:
            new_embeddings = generate_embeddings_jina(valid_texts, batch_size=BATCH_SIZE)

            if new_embeddings.size > 0:
                # One embedding row per metadata row, or the outputs drift apart
                if len(new_embeddings) != len(valid_metadata):
                    raise ValueError(
                        f"got {len(new_embeddings)} embeddings for {len(valid_metadata)} texts"
                    )
                _append_outputs(new_embeddings, valid_metadata)

        except Exception as e:
            # Nothing was committed for this file; leave it out of the
            # checkpoint so a later run retries it.
            print(f"Failed to save data for {person}: {e}")
            continue

    # 4. Update the checkpoint
    processed_files.add(file_path.name)
    _save_checkpoint(processed_files)

print("\nProcessing complete!")

**Cell 7: 결과 확인 및 정리**

In [None]:
# Report the final output sizes
if embeddings_file.exists():
    final_emb = np.load(embeddings_file)
    print(f"Final embeddings shape: {final_emb.shape}")

    # Count the metadata rows (one JSON object per line)
    final_meta_count = 0
    with metadata_file.open('r', encoding='utf-8') as meta_handle:
        for _line in meta_handle:
            final_meta_count += 1
    print(f"Total articles in metadata: {final_meta_count}")

# The run is finished, so the checkpoint file is removed
if CHECKPOINT_FILE.exists():
    os.remove(CHECKPOINT_FILE)
    print("Checkpoint file removed.")