In [None]:
# Mount Google Drive at /content/drive; the data and output paths used by
# the cells below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

**all-MiniLM-L6-v2**

In [None]:
import os
import json
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from datetime import datetime
from tqdm import tqdm
import logging

# Logging configuration.
# force=True replaces handlers that are already installed on the root logger.
# Without it basicConfig() is a no-op whenever the host runtime (e.g. a
# notebook kernel) has pre-configured logging, and INFO messages are dropped.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Device selection: use CUDA when available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {DEVICE}")

# === Model and settings ===
# Output dimension: 384
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Input data directory (one .jsonl file per person)
DATA_DIR = Path("/content/drive/MyDrive/COSE362/data/guardian_top100_scraping")

# Output directory
OUTPUT_DIR = Path("/content/drive/MyDrive/COSE362/data/vector_paragraph_1")
BATCH_SIZE = 32
CHECKPOINT_FILE = OUTPUT_DIR / "checkpoint.json"

# parents=True so that a missing intermediate directory does not abort the run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the model and tokenizer, move the model to DEVICE and switch it to
# inference mode.
logging.info(f"Loading model: {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
model.eval()
logging.info("Model loaded successfully!")

# --- Helper Functions ---

def extract_person_name(filename):
    """Return the person identifier encoded in a data file's name.

    ``filename`` must expose a ``stem`` attribute (as ``pathlib.Path`` does);
    that stem is returned, e.g. ``alex_morgan.jsonl`` -> ``alex_morgan``.
    """
    person = filename.stem
    return person

def parse_pub_date(web_pub_date):
    """Convert a webPublicationDate string to ``YYYY_MM_DD``.

    Args:
        web_pub_date: ISO-8601 timestamp string; a trailing ``Z`` (UTC) is
            accepted and rewritten to ``+00:00`` before parsing.

    Returns:
        The date formatted as ``YYYY_MM_DD``, or ``None`` when the value is
        not a string or cannot be parsed as an ISO-8601 timestamp.
    """
    # Reject non-strings explicitly instead of relying on a blanket
    # ``except Exception`` to absorb the resulting AttributeError/TypeError.
    if not isinstance(web_pub_date, str):
        return None

    try:
        dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
        return dt.strftime("%Y_%m_%d")
    except ValueError:
        # Malformed or empty timestamp.
        return None

def preprocess_text_first_last(text):
    """Reduce an article body to its first and last paragraphs.

    Paragraphs are the non-blank lines of ``text`` (split on newlines), each
    stripped of surrounding whitespace.

    Returns:
        ``None`` when ``text`` is empty or contains only whitespace, the sole
        paragraph when there is exactly one, otherwise the first and last
        paragraphs joined by a single space.
    """
    if not text:
        return None

    # Strip every line and keep only those with remaining content.
    paragraphs = [chunk for chunk in map(str.strip, text.split('\n')) if chunk]
    if not paragraphs:
        return None

    first, last = paragraphs[0], paragraphs[-1]
    if len(paragraphs) == 1:
        return first
    return f"{first} {last}"

def mean_pooling(model_output, attention_mask):
    """Average token embeddings into one vector per sequence.

    ``model_output.last_hidden_state`` is multiplied by ``attention_mask``
    (broadcast over the last dimension), summed over dimension 1 and divided
    by the per-sequence mask total. The divisor is clamped to at least 1e-9
    so a fully masked sequence does not divide by zero.
    """
    hidden = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()

    masked_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_count

@torch.no_grad()
def generate_embeddings(texts, model, tokenizer, batch_size=32):
    """Embed ``texts`` in batches using mean pooling.

    Each batch is tokenized (padded, truncated to 512 tokens), moved to the
    module-level ``DEVICE``, run through ``model``, mean-pooled with
    ``mean_pooling`` (rather than taking the CLS token) and L2-normalized.

    Returns:
        The per-batch results stacked with ``np.vstack``, or an empty
        ``np.array([])`` when ``texts`` yields no batches.
    """
    chunks = []

    for start in range(0, len(texts), batch_size):
        stop = start + batch_size

        # Tokenize the current slice.
        inputs = tokenizer(
            texts[start:stop],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt',
        )
        inputs = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}

        # Forward pass, then mean pooling over the attended tokens.
        pooled = mean_pooling(model(**inputs), inputs['attention_mask'])

        # Scale every row to unit L2 norm.
        unit = torch.nn.functional.normalize(pooled, p=2, dim=1)

        chunks.append(unit.cpu().numpy())

    return np.vstack(chunks) if chunks else np.array([])

# --- Main Execution Logic (Checkpoint & Loop) ---

# Load the checkpoint: names of the input files finished by an earlier run.
processed_files = set()
if CHECKPOINT_FILE.exists():
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            checkpoint = json.load(f)
        processed_files = set(checkpoint.get('processed_files', []))
        logging.info(f"Checkpoint found: {len(processed_files)} files already processed")
    except json.JSONDecodeError:
        logging.warning("Checkpoint file is corrupted. Starting fresh.")
        processed_files = set()

# Load previously saved output so new results are appended to it.
embeddings_file = OUTPUT_DIR / "embeddings.npy"
metadata_file = OUTPUT_DIR / "metadata.jsonl"

existing_embeddings = None
existing_metadata = []

if embeddings_file.exists() and metadata_file.exists():
    logging.info("Loading existing data...")
    try:
        existing_embeddings = np.load(embeddings_file)
        with open(metadata_file, 'r', encoding='utf-8') as f:
            # Ignore blank lines so a stray empty line does not discard the data.
            existing_metadata = [json.loads(line) for line in f if line.strip()]

        if len(existing_embeddings) == len(existing_metadata):
            logging.info(f"Loaded {len(existing_metadata)} existing entries")
        else:
            logging.error("Data mismatch! Starting fresh.")
            existing_embeddings = None
            existing_metadata = []
    except Exception as e:
        logging.error(f"Error loading existing data: {e}. Starting fresh.")
        existing_embeddings = None
        existing_metadata = []
else:
    logging.info("Starting fresh (no existing data found)")

# Reconcile the checkpoint with what is actually stored on disk.
if existing_embeddings is None:
    # No usable saved output: honouring the checkpoint would skip files whose
    # embeddings no longer exist, so every file has to be processed again.
    if processed_files:
        logging.warning("Ignoring checkpoint because no saved embeddings were loaded.")
    processed_files = set()
else:
    # The checkpoint is deleted after a successful run and is written only
    # after the data itself, so it can lag behind the saved output. Files
    # whose rows are already stored must not be embedded and appended again.
    processed_files.update(
        f"{meta['person']}.jsonl"
        for meta in existing_metadata
        if isinstance(meta, dict) and meta.get('person')
    )

# Build the list of files that still need processing.
jsonl_files = sorted([f for f in DATA_DIR.glob("*.jsonl") if f.name not in processed_files])
logging.info(f"Total files to process: {len(jsonl_files)}")

logging.info("전체 데이터 임베딩 작업을 시작합니다.")

# Main loop: embed one input file at a time and persist the results after
# every file so an interrupted run can resume.
for idx, file_path in enumerate(tqdm(jsonl_files, desc="Overall Progress")):
    person = extract_person_name(file_path)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. a trailing newline) are not JSON documents;
            # skipping them keeps one empty line from discarding the file.
            articles = [json.loads(line) for line in f if line.strip()]
    except Exception as e:
        logging.error(f"Failed to read {file_path}: {e}")
        continue

    # Collect the texts to embed together with their metadata rows.
    valid_texts = []
    valid_metadata = []

    for article in articles:
        # A line holding valid JSON that is not an object has no fields to read.
        if not isinstance(article, dict):
            continue

        body_text = article.get('bodyText', '')
        article_id = article.get('id')
        pub_date_raw = article.get('webPublicationDate')

        # All three fields are required.
        if not all([body_text, article_id, pub_date_raw]):
            continue

        # Keep only the first and last paragraphs.
        processed_text = preprocess_text_first_last(body_text)

        if processed_text:
            valid_texts.append(processed_text)
            valid_metadata.append({
                'person': person,
                'article_id': article_id,
                'pub_date': parse_pub_date(pub_date_raw)
            })

    # Generate and store the embeddings.
    if valid_texts:
        # All articles of one file are embedded in a single call.
        new_embeddings = generate_embeddings(valid_texts, model, tokenizer, batch_size=BATCH_SIZE)

        if new_embeddings.size > 0:
            # Append to the rows accumulated so far.
            if existing_embeddings is not None:
                combined_embeddings = np.vstack([existing_embeddings, new_embeddings])
            else:
                combined_embeddings = new_embeddings

            combined_metadata = existing_metadata + valid_metadata

            # Save immediately; both output files are rewritten in full.
            try:
                np.save(embeddings_file, combined_embeddings)
                with open(metadata_file, 'w', encoding='utf-8') as f:
                    for meta in combined_metadata:
                        f.write(json.dumps(meta, ensure_ascii=False) + '\n')

                # Update the in-memory state only after a successful save.
                existing_embeddings = combined_embeddings
                existing_metadata = combined_metadata

            except Exception as e:
                logging.error(f"Failed to save data for {person}: {e}")
                # The file is not checkpointed, so a later run retries it.
                continue

    # Record the file as done (also reached when it had no usable articles).
    processed_files.add(file_path.name)
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'processed_files': list(processed_files)}, f)

logging.info("\n All processing complete!")
if existing_embeddings is not None:
    logging.info(f"Final embeddings shape: {existing_embeddings.shape}")
    logging.info(f"Total articles: {len(existing_metadata)}")

# The run finished, so the resume checkpoint is no longer needed.
if CHECKPOINT_FILE.exists():
    CHECKPOINT_FILE.unlink()
    logging.info("Checkpoint file removed.")

**jina-embeddings-v3**

In [None]:
import os
import json
import numpy as np
from pathlib import Path
from transformers import AutoModel
import torch
from datetime import datetime
from tqdm import tqdm
import logging

# Logging configuration.
# force=True replaces handlers that are already installed on the root logger.
# Without it basicConfig() is a no-op whenever logging has been configured
# before (by the host runtime or an earlier cell), and the call is ignored.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Device selection: use CUDA when available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {DEVICE}")

# === Model and settings ===
# Jina Embeddings v3 (8192 context length, 1024 dimension)
MODEL_NAME = "jinaai/jina-embeddings-v3"

# Input data directory (one .jsonl file per person)
DATA_DIR = Path("/content/drive/MyDrive/COSE362/data/guardian_top100_scraping")

# Output directory
OUTPUT_DIR = Path("/content/drive/MyDrive/COSE362/data/vector_paragraph_2")
CHECKPOINT_FILE = OUTPUT_DIR / "checkpoint.json"

# parents=True so that a missing intermediate directory does not abort the run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the model.
# Jina v3 requires trust_remote_code=True.
logging.info(f"Loading model: {MODEL_NAME} ...")
# No tokenizer is loaded here: tokenization is handled inside model.encode.
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = model.to(DEVICE)
model.eval()
logging.info("Model loaded successfully!")

# --- Helper Functions ---

def extract_person_name(filename):
    """Return the person identifier encoded in a data file's name.

    ``filename`` must expose a ``stem`` attribute (as ``pathlib.Path`` does);
    that stem is returned, e.g. ``alex_morgan.jsonl`` -> ``alex_morgan``.
    """
    person = filename.stem
    return person

def parse_pub_date(web_pub_date):
    """Convert a webPublicationDate string to ``YYYY_MM_DD``.

    Args:
        web_pub_date: ISO-8601 timestamp string; a trailing ``Z`` (UTC) is
            accepted and rewritten to ``+00:00`` before parsing.

    Returns:
        The date formatted as ``YYYY_MM_DD``, or ``None`` when the value is
        not a string or cannot be parsed as an ISO-8601 timestamp.
    """
    # Reject non-strings explicitly instead of relying on a blanket
    # ``except Exception`` to absorb the resulting AttributeError/TypeError.
    if not isinstance(web_pub_date, str):
        return None

    try:
        dt = datetime.fromisoformat(web_pub_date.replace('Z', '+00:00'))
        return dt.strftime("%Y_%m_%d")
    except ValueError:
        # Malformed or empty timestamp.
        return None

def preprocess_text_first_last(text):
    """Reduce an article body to its first and last paragraphs.

    Paragraphs are the non-blank lines of ``text`` (split on newlines), each
    stripped of surrounding whitespace.

    Returns:
        ``None`` when ``text`` is empty or contains only whitespace, the sole
        paragraph when there is exactly one, otherwise the first and last
        paragraphs joined by a single space.
    """
    if not text:
        return None

    # Strip every line and keep only those with remaining content.
    paragraphs = [chunk for chunk in map(str.strip, text.split('\n')) if chunk]
    if not paragraphs:
        return None

    first, last = paragraphs[0], paragraphs[-1]
    if len(paragraphs) == 1:
        return first
    return f"{first} {last}"

@torch.no_grad()
def generate_embeddings_jina(texts, model, batch_size=32):
    """Embed ``texts`` with the model's own ``encode`` method.

    ``encode`` is called with ``task="retrieval.passage"`` (document/passage
    embedding mode), the given ``batch_size`` and the progress bar disabled.
    No ``max_length`` is passed, so the model's own default applies.
    Tokenization and batching are left to ``encode``; its return value is
    passed through unchanged (the original author notes it is a NumPy array).
    """
    encode_options = {
        "task": "retrieval.passage",
        "batch_size": batch_size,
        "show_progress_bar": False,
    }
    return model.encode(texts, **encode_options)

# --- Main Execution Logic (Checkpoint & Loop) ---

# Load the checkpoint: names of the input files finished by an earlier run.
processed_files = set()
if CHECKPOINT_FILE.exists():
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            checkpoint = json.load(f)
        processed_files = set(checkpoint.get('processed_files', []))
        logging.info(f"Checkpoint found: {len(processed_files)} files already processed")
    except json.JSONDecodeError:
        logging.warning("Checkpoint file is corrupted. Starting fresh.")
        processed_files = set()

# Load previously saved output so new results are appended to it.
embeddings_file = OUTPUT_DIR / "embeddings.npy"
metadata_file = OUTPUT_DIR / "metadata.jsonl"

existing_embeddings = None
existing_metadata = []

if embeddings_file.exists() and metadata_file.exists():
    logging.info("Loading existing data...")
    try:
        existing_embeddings = np.load(embeddings_file)
        with open(metadata_file, 'r', encoding='utf-8') as f:
            # Ignore blank lines so a stray empty line does not discard the data.
            existing_metadata = [json.loads(line) for line in f if line.strip()]

        if len(existing_embeddings) == len(existing_metadata):
            logging.info(f"Loaded {len(existing_metadata)} existing entries")
        else:
            logging.error("Data mismatch! Starting fresh.")
            existing_embeddings = None
            existing_metadata = []
    except Exception as e:
        logging.error(f"Error loading existing data: {e}. Starting fresh.")
        existing_embeddings = None
        existing_metadata = []
else:
    logging.info("Starting fresh (no existing data found)")

# Reconcile the checkpoint with what is actually stored on disk.
if existing_embeddings is None:
    # No usable saved output: honouring the checkpoint would skip files whose
    # embeddings no longer exist, so every file has to be processed again.
    if processed_files:
        logging.warning("Ignoring checkpoint because no saved embeddings were loaded.")
    processed_files = set()
else:
    # The checkpoint is deleted after a successful run and is written only
    # after the data itself, so it can lag behind the saved output. Files
    # whose rows are already stored must not be embedded and appended again.
    processed_files.update(
        f"{meta['person']}.jsonl"
        for meta in existing_metadata
        if isinstance(meta, dict) and meta.get('person')
    )

# Build the list of files that still need processing.
jsonl_files = sorted([f for f in DATA_DIR.glob("*.jsonl") if f.name not in processed_files])
logging.info(f"Total files to process: {len(jsonl_files)}")

logging.info("전체 데이터 임베딩 작업을 시작합니다 (Jina v3).")

# Main loop: embed one input file at a time and persist the results after
# every file so an interrupted run can resume.
# Jina v3 accepts inputs of up to 8192 tokens, so a batch size that is too
# large can run out of memory (lower batch_size if that happens).
BATCH_SIZE_JINA = 16

for idx, file_path in enumerate(tqdm(jsonl_files, desc="Overall Progress")):
    person = extract_person_name(file_path)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. a trailing newline) are not JSON documents;
            # skipping them keeps one empty line from discarding the file.
            articles = [json.loads(line) for line in f if line.strip()]
    except Exception as e:
        logging.error(f"Failed to read {file_path}: {e}")
        continue

    # Collect the texts to embed together with their metadata rows.
    valid_texts = []
    valid_metadata = []

    for article in articles:
        # A line holding valid JSON that is not an object has no fields to read.
        if not isinstance(article, dict):
            continue

        body_text = article.get('bodyText', '')
        article_id = article.get('id')
        pub_date_raw = article.get('webPublicationDate')

        # All three fields are required.
        if not all([body_text, article_id, pub_date_raw]):
            continue

        # Keep only the first and last paragraphs.
        processed_text = preprocess_text_first_last(body_text)

        if processed_text:
            valid_texts.append(processed_text)
            valid_metadata.append({
                'person': person,
                'article_id': article_id,
                'pub_date': parse_pub_date(pub_date_raw)
            })

    # Generate and store the embeddings.
    if valid_texts:
        # This try covers both embedding generation and saving.
        try:
            # Embed with the Jina model.
            new_embeddings = generate_embeddings_jina(valid_texts, model, batch_size=BATCH_SIZE_JINA)

            if new_embeddings.size > 0:
                # Append to the rows accumulated so far.
                if existing_embeddings is not None:
                    combined_embeddings = np.vstack([existing_embeddings, new_embeddings])
                else:
                    combined_embeddings = new_embeddings

                combined_metadata = existing_metadata + valid_metadata

                # Save immediately; both output files are rewritten in full.
                np.save(embeddings_file, combined_embeddings)
                with open(metadata_file, 'w', encoding='utf-8') as f:
                    for meta in combined_metadata:
                        f.write(json.dumps(meta, ensure_ascii=False) + '\n')

                # Update the in-memory state only after a successful save.
                existing_embeddings = combined_embeddings
                existing_metadata = combined_metadata

        except Exception as e:
            logging.error(f"Failed to save data for {person}: {e}")
            # The file is not checkpointed, so a later run retries it.
            continue

    # Record the file as done (also reached when it had no usable articles).
    processed_files.add(file_path.name)
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'processed_files': list(processed_files)}, f)

logging.info("\n All processing complete!")
if existing_embeddings is not None:
    logging.info(f"Final embeddings shape: {existing_embeddings.shape}")
    logging.info(f"Total articles: {len(existing_metadata)}")

# The run finished, so the resume checkpoint is no longer needed.
if CHECKPOINT_FILE.exists():
    CHECKPOINT_FILE.unlink()
    logging.info("Checkpoint file removed.")