In [1]:
# Mount Google Drive at /content/drive; later cells read the dataset archive
# from, and write their outputs to, paths under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the third-party packages imported below (transformers, tqdm).
!pip install transformers tqdm



In [3]:
# Extract the scraped dataset archive from Drive into /content/
# (-q: quiet output, -o: overwrite existing files without prompting).
!unzip -q -o "/content/drive/My Drive/ML_team_project_final/guardian_top100_scraping.zip" -d "/content/"

Cell 1: 라이브러리 임포트 및 기본 설정

In [4]:
import os
import json
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from datetime import datetime
from tqdm import tqdm
import logging

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 0. Device selection (uncomment the next line to pin a specific GPU index)
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {DEVICE}")

# 1. Settings
MODEL_NAME = "BAAI/bge-large-en-v1.5"  # same model as Headline/Chunking
DATA_DIR = Path("/content/guardian_top100_scraping")
OUTPUT_DIR = Path("/content/drive/My Drive/ML_team_project_final/vector_bodyText")
BATCH_SIZE = 32  # tune to the available GPU memory
CHECKPOINT_FILE = OUTPUT_DIR / "checkpoint.json"

# 2. bodyText chunking settings
# 510 = 512 minus the two special tokens ([CLS], [SEP])
CHUNK_LENGTH = 510
# Consecutive chunks share 50 tokens
OVERLAP = 50

# parents=True also creates any missing parent folders; without it mkdir()
# raises FileNotFoundError when the project folder does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# 3. Load the model and tokenizer
logging.info("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
model.eval()  # inference mode: disables dropout etc.
logging.info("Model loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Cell 2: 헬퍼 함수 정의 (핵심 로직)

In [5]:
def extract_person_name(filename):
    """Return the person name encoded in a data file's name.

    The name is the final path component without its last suffix,
    e.g. ``alex_morgan.jsonl`` -> ``alex_morgan``.

    Args:
        filename: Path of the file, either a ``pathlib.Path`` or a plain
            string.

    Returns:
        The file's stem as a string.
    """
    # Wrapping in Path() keeps Path inputs working and lets str paths work too.
    return Path(filename).stem

def parse_pub_date(web_pub_date):
    """Convert a ``webPublicationDate`` value to a ``YYYY_MM_DD`` string.

    Every ``Z`` in the input is rewritten as ``+00:00`` before the value is
    handed to ``datetime.fromisoformat``.

    Args:
        web_pub_date: Timestamp string, e.g. ``2017-12-31T10:00:00Z``.

    Returns:
        The date formatted as ``YYYY_MM_DD``, or ``None`` if any step of the
        conversion raises (for example when the input is ``None`` or not a
        parseable timestamp).
    """
    formatted = None
    try:
        iso_text = web_pub_date.replace('Z', '+00:00')
        formatted = datetime.fromisoformat(iso_text).strftime("%Y_%m_%d")
    except Exception:
        # Best effort: any failure is reported to the caller as None.
        pass
    return formatted

def create_token_chunks(text, tokenizer, chunk_length=510, overlap=50):
    """Split a long text into overlapping chunks measured in tokens.

    The text is tokenized once without special tokens, the token list is cut
    into windows of ``chunk_length`` tokens that start every
    ``chunk_length - overlap`` tokens, and each window is decoded back to a
    string so it can be re-tokenized (with special tokens) by the embedding
    step.

    Args:
        text: The text to split. ``None``, empty or whitespace-only text
            yields no chunks.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``decode(token_ids)``.
        chunk_length: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_length``.

    Returns:
        A list of decoded chunk strings, in document order.

    Raises:
        ValueError: If ``chunk_length`` or ``overlap`` is out of range. Without
            this check a non-positive window step would silently produce no
            chunks (or a cryptic ``range()`` error), and a negative overlap
            would skip tokens between chunks.
    """
    if chunk_length <= 0:
        raise ValueError(f"chunk_length must be positive, got {chunk_length}")
    if not 0 <= overlap < chunk_length:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_length, "
            f"got overlap={overlap}, chunk_length={chunk_length}"
        )

    if not text or text.strip() == '':
        return []

    # 1. Tokenize the whole text at once, without special tokens. The
    #    tokenizer may warn that the sequence exceeds the model maximum; the
    #    ids are only sliced and decoded here, never passed to the model.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if not tokens:
        return []

    # 2. Slide a window of chunk_length tokens forward by `step` tokens.
    step = chunk_length - overlap
    token_chunks = []
    for start in range(0, len(tokens), step):
        token_chunks.append(tokens[start:start + chunk_length])
        # Stop once a window reaches the end of the text; a further window
        # would only repeat tokens that are already covered.
        if start + chunk_length >= len(tokens):
            break

    # 3. Decode each window back to text; the embedding step re-encodes it
    #    and adds the special tokens.
    return [tokenizer.decode(chunk) for chunk in token_chunks]

@torch.no_grad()
def generate_embeddings(texts, model, tokenizer, batch_size=32):
    """Embed a list of texts in batches.

    Each batch is tokenized (padded, truncated to 512 tokens), run through
    ``model``, and represented by the hidden state at sequence position 0
    (the [CLS] position), L2-normalized per row.

    Args:
        texts: List of strings to embed.
        model: Model called as ``model(**encoded)`` whose output exposes
            ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A NumPy array with one row per input text, or an empty array
        (``size == 0``) when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Send inputs to wherever the model's weights actually live, instead of
    # depending on a module-level constant that may be out of sync with it.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize (inputs are truncated to 512 tokens)
        encoded = tokenizer(batch, padding=True, truncation=True,
                            max_length=512, return_tensors='pt')
        encoded = {k: v.to(device) for k, v in encoded.items()}

        outputs = model(**encoded)

        # Take the position-0 ([CLS]) hidden state and L2-normalize each row.
        batch_embeddings = outputs.last_hidden_state[:, 0, :]
        batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)

        embeddings.append(batch_embeddings.cpu().numpy())

    if not embeddings:
        return np.array([])

    return np.vstack(embeddings)

def mean_pool_embeddings(embeddings):
    """Average several chunk embeddings into a single vector.

    Args:
        embeddings: NumPy array of chunk embeddings; the mean is taken over
            axis 0.

    Returns:
        The mean over axis 0, or ``None`` when the array has no elements.
    """
    has_values = embeddings.size != 0
    return np.mean(embeddings, axis=0) if has_values else None

Cell 3: 이어달리기(Checkpoint) 설정

In [6]:
# Resume support: determine which input files are already done and load any
# previously saved embeddings/metadata so new results can be appended.

# 1. Read the checkpoint (names of the files that were fully processed).
processed_files = set()
if CHECKPOINT_FILE.exists():
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            checkpoint = json.load(f)
            processed_files = set(checkpoint.get('processed_files', []))
            logging.info(f"Checkpoint found: {len(processed_files)} files already processed")
    except json.JSONDecodeError:
        logging.warning("Checkpoint file is corrupted. Starting fresh.")
        processed_files = set()

# 2. Load previously saved outputs. The "existing_*" names are only bound to
#    loaded data once it has been validated; otherwise they stay empty.
embeddings_file = OUTPUT_DIR / "embeddings.npy"
metadata_file = OUTPUT_DIR / "metadata.jsonl"

existing_embeddings = None
existing_metadata = []

if embeddings_file.exists() and metadata_file.exists():
    logging.info("Loading existing data...")
    try:
        loaded_embeddings = np.load(embeddings_file)
        loaded_metadata = []
        with open(metadata_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():  # tolerate blank lines
                    loaded_metadata.append(json.loads(line))

        # Row i of the embeddings must describe metadata entry i.
        if len(loaded_embeddings) == len(loaded_metadata):
            existing_embeddings = loaded_embeddings
            existing_metadata = loaded_metadata
            logging.info(f"Loaded {len(existing_metadata)} existing entries")
        else:
            logging.error("Data mismatch! embeddings and metadata have different lengths. Starting fresh.")

    except Exception as e:
        logging.error(f"Error loading existing data: {e}. Starting fresh.")
else:
    logging.info("Starting fresh (no existing data found)")

# 3. Reconcile the checkpoint with the data that was actually loaded.
if existing_embeddings is None:
    # No usable stored data: files listed in the checkpoint must be processed
    # again, otherwise they would be skipped and their embeddings lost.
    if processed_files:
        logging.warning(
            f"Ignoring checkpoint ({len(processed_files)} files): no usable stored data to resume from."
        )
    processed_files = set()
else:
    # Every stored row names its source file through the 'person' field (the
    # file stem). Treat those files as done even when the checkpoint is missing
    # or stale (the main cell deletes it after a complete run), so a re-run
    # does not append the same articles a second time.
    recovered_files = {
        f"{meta['person']}.jsonl"
        for meta in existing_metadata
        if isinstance(meta, dict) and meta.get('person')
    }
    newly_recovered = recovered_files - processed_files
    if newly_recovered:
        logging.info(f"Recovered {len(newly_recovered)} processed files from stored metadata")
    processed_files |= recovered_files

# Collect the .jsonl files that still need processing.
jsonl_files = sorted([f for f in DATA_DIR.glob("*.jsonl") if f.name not in processed_files])
logging.info(f"Total files to process: {len(jsonl_files)}")

Cell 4: 메인 루프 (데이터 처리 및 저장)

In [7]:
# === Smoke-test main loop (embeds 5 articles, then stops) ===
#
# Runs the chunk -> embed -> mean-pool pipeline on the first few articles of
# the first pending file and prints intermediate results. Nothing is saved.

logging.info("테스트 모드를 시작합니다. (기사 5개만 처리)")

test_count = 0
target_count = 5  # number of articles to embed successfully before stopping

# Only the first pending file is used for the test.
test_files = jsonl_files[:1]

for file_path in test_files:
    print(f"\n 파일 열기: {file_path.name}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Stream the file line by line instead of loading all of it.
            for line in f:
                if test_count >= target_count:
                    break  # enough articles embedded
                if not line.strip():
                    continue  # skip blank lines rather than failing in json.loads

                article = json.loads(line)
                # `or ''` also covers an explicit null bodyText, which would
                # otherwise raise on slicing and abort the whole test.
                full_text = article.get('bodyText') or ''
                body_text = full_text[:200] + "..."  # short preview for the log only
                article_id = article.get('id')

                print(f"\n[{test_count + 1}/{target_count}] 기사 처리 중: {article_id}")
                print(f"   - 본문 미리보기: {body_text}")

                # 1. Split into token chunks
                text_chunks = create_token_chunks(full_text, tokenizer, CHUNK_LENGTH, OVERLAP)
                print(f"   - 생성된 Chunk 개수: {len(text_chunks)}")

                if not text_chunks:
                    print("   - Chunk 생성 실패 (본문이 너무 짧거나 없음)")
                    continue

                # 2. Embed every chunk
                chunk_embeddings = generate_embeddings(text_chunks, model, tokenizer, batch_size=BATCH_SIZE)

                # 3. Mean-pool the chunk vectors into one article vector
                final_vector = mean_pool_embeddings(chunk_embeddings)

                # Report the outcome
                if final_vector is not None:
                    print(f"   - 임베딩 성공! 벡터 모양(Shape): {final_vector.shape}")
                    # Show the first 5 components as a sanity check.
                    print(f"   - 벡터 값 예시: {final_vector[:5]} ...")
                    test_count += 1
                else:
                    print("   - 임베딩 실패")

    except Exception as e:
        logging.error(f"테스트 중 오류 발생: {e}")
        break

print("\n" + "="*50)
if test_count > 0:
    print(f"테스트 완료! {test_count}개의 기사가 정상적으로 임베딩 되었습니다.")
    print("이제 이 코드는 지우고, 원래의 [Cell 4]를 실행해서 전체를 돌리셔도 됩니다.")
else:
    print("테스트 실패. 데이터를 읽지 못했거나 오류가 발생했습니다.")
print("="*50)

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors



 파일 열기: alex_morgan.jsonl

[1/5] 기사 처리 중: uk-news/2017/dec/31/new-years-eve-celebrations-to-go-ahead-despite-storm-dylan
   - 생성된 Chunk 개수: 2
   - 임베딩 성공! 벡터 모양(Shape): (1024,)
   - 벡터 값 예시: [-0.03895908  0.01207704  0.00474441 -0.01309364 -0.02077837] ...

[2/5] 기사 처리 중: sport/2017/dec/31/alastair-cook-david-warner-test-maestros-ashes-jason-gillespie
   - 본문 미리보기: Cricket is a sport of wonderful contrasts and during the Melbourne Test we saw the beauty of this in Alastair Cook and David Warner. Here are two champions who are performing the same job for their te...
   - 생성된 Chunk 개수: 3
   - 임베딩 성공! 벡터 모양(Shape): (1024,)
   - 벡터 값 예시: [ 0.03043569 -0.0053199  -0.01531123  0.00610584 -0.01735093] ...

[3/5] 기사 처리 중: world/2017/dec/31/eight-big-ideas-for-2018-politics-culture-space-science-food
   - 본문 미리보기: Art: Stefan Kalmar AN AGE OF CRISIS: WHAT A GREAT OPPORTUNITY… 2018 is all about reclaiming reality, opposing governmental and corporate paradoxes, and dissecting lies, before they b

In [None]:
# === [Final main cell 4] Run over the full dataset ===
#
# For every pending person file: embed each article body
# (chunk -> embed -> mean-pool), append the new rows to the stored
# embeddings/metadata, persist both files, and only then record the file in
# the checkpoint.

logging.info("전체 데이터 임베딩 작업을 시작합니다.")


def _save_outputs(embeddings_array, metadata_rows):
    """Write embeddings and metadata to temp files, then rename them into place.

    An interruption while writing therefore leaves the previous output files
    untouched instead of truncated.
    """
    tmp_embeddings = embeddings_file.with_name(embeddings_file.name + ".tmp")
    tmp_metadata = metadata_file.with_name(metadata_file.name + ".tmp")
    # Pass an open file object: np.save would append ".npy" to a path that
    # does not already end with it.
    with open(tmp_embeddings, 'wb') as f:
        np.save(f, embeddings_array)
    with open(tmp_metadata, 'w', encoding='utf-8') as f:
        for meta in metadata_rows:
            f.write(json.dumps(meta, ensure_ascii=False) + '\n')
    os.replace(tmp_embeddings, embeddings_file)
    os.replace(tmp_metadata, metadata_file)


def _save_checkpoint(done_files):
    """Write the checkpoint via a temp file so it is never left half-written."""
    tmp_checkpoint = CHECKPOINT_FILE.with_name(CHECKPOINT_FILE.name + ".tmp")
    with open(tmp_checkpoint, 'w') as f:
        json.dump({'processed_files': sorted(done_files)}, f)
    os.replace(tmp_checkpoint, CHECKPOINT_FILE)


# (person, article_id) pairs that are already stored. Articles matching one of
# these are skipped, so running this cell again never appends duplicates.
stored_keys = {(meta.get('person'), meta.get('article_id')) for meta in existing_metadata}
failed_files = []  # names of files that could not be read or saved

# Process one person file at a time and save right after each one.
for file_path in tqdm(jsonl_files, desc="Overall Progress"):
    person = extract_person_name(file_path)

    # Read the articles of this file (one JSON object per non-blank line).
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            articles = [json.loads(line) for line in f if line.strip()]
    except Exception as e:
        logging.error(f"Failed to read {file_path}: {e}")
        failed_files.append(file_path.name)
        continue

    # Rows produced for the current person
    person_embeddings = []
    person_metadata = []

    for article in articles:
        body_text = article.get('bodyText', '')
        article_id = article.get('id')
        pub_date_raw = article.get('webPublicationDate')

        if not all([body_text, article_id, pub_date_raw]):
            continue  # a required field is missing

        if (person, article_id) in stored_keys:
            continue  # already stored by an earlier save

        # 1. Split into token chunks
        text_chunks = create_token_chunks(body_text, tokenizer, CHUNK_LENGTH, OVERLAP)
        if not text_chunks:
            continue

        # 2. Embed the chunks in batches
        chunk_embeddings = generate_embeddings(text_chunks, model, tokenizer, batch_size=BATCH_SIZE)

        # 3. Mean-pool the chunk vectors into one article vector
        article_embedding = mean_pool_embeddings(chunk_embeddings)

        if article_embedding is not None:
            person_embeddings.append(article_embedding)
            person_metadata.append({
                'person': person,
                'article_id': article_id,
                'pub_date': parse_pub_date(pub_date_raw)
            })

    # --- Persist as soon as one person is finished ---
    if person_embeddings:
        person_embeddings_array = np.array(person_embeddings)

        # Append to what is already stored
        if existing_embeddings is not None:
            combined_embeddings = np.vstack([existing_embeddings, person_embeddings_array])
        else:
            combined_embeddings = person_embeddings_array

        combined_metadata = existing_metadata + person_metadata

        try:
            _save_outputs(combined_embeddings, combined_metadata)
        except Exception as e:
            logging.error(f"Failed to save data for {person}: {e}")
            failed_files.append(file_path.name)
            continue  # not checkpointed, so a later run retries this file

        # Adopt the saved state for the next iteration
        existing_embeddings = combined_embeddings
        existing_metadata = combined_metadata
        stored_keys.update((meta['person'], meta['article_id']) for meta in person_metadata)

    # Update the checkpoint (reached only when nothing above failed)
    processed_files.add(file_path.name)
    _save_checkpoint(processed_files)

# --- All files visited ---
logging.info("\n All processing complete!")
if existing_embeddings is not None:
    logging.info(f"Final embeddings shape: {existing_embeddings.shape}")
    logging.info(f"Total articles: {len(existing_metadata)}")

# Remove the checkpoint only after a fully successful run; if any file failed,
# keep it so a re-run retries just the failed files.
if failed_files:
    logging.warning(f"{len(failed_files)} file(s) failed; checkpoint kept for retry: {failed_files}")
elif CHECKPOINT_FILE.exists():
    CHECKPOINT_FILE.unlink()
    logging.info("Checkpoint file removed.")

Overall Progress:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# 혹시 속도 안 나오면 Cell 2의 함수를 이걸로 교체해서 FP16 적용.

# @torch.no_grad()
# def generate_embeddings(texts, model, tokenizer, batch_size=32):
#     embeddings = []
#     for i in range(0, len(texts), batch_size):
#         batch = texts[i:i+batch_size]
#         encoded = tokenizer(batch, padding=True, truncation=True, 
#                             max_length=512, return_tensors='pt')
#         encoded = {k: v.to(DEVICE) for k, v in encoded.items()}
        
#         # FP16 적용 (속도 향상 핵심)
#         with torch.cuda.amp.autocast():
#             outputs = model(**encoded)
#             batch_embeddings = outputs.last_hidden_state[:, 0, :]
#             batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
            
#         embeddings.append(batch_embeddings.cpu().numpy().astype(np.float32))
    
#     if not embeddings: return np.array([])
#     return np.vstack(embeddings)