# In [None]:  (Jupyter cell prompt left over from the notebook export)
import json
import os
from pathlib import Path
import pandas as pd
import numpy as np
from collections import defaultdict

def count_valid_fields(data, fields=('headline', 'webPublicationDate', 'bodyText')):
    """Count how many records carry a usable value for each tracked field.

    A field is valid for a record when the record is a mapping and the
    field's value is a string that is non-empty after stripping whitespace.

    Args:
        data: Iterable of decoded records (normally dicts parsed from JSONL).
            Records that are not mappings (e.g. a line holding JSON ``null``)
            still count toward ``'total'`` but contribute no valid fields.
        fields: Names of the fields to check. Defaults to the three article
            fields this script reports on.

    Returns:
        A dict with ``'total'`` (number of records seen) followed by one
        ``'valid_<field>'`` counter per entry in ``fields``.
    """
    from collections.abc import Mapping

    counts = {'total': 0}
    for field in fields:
        counts[f'valid_{field}'] = 0

    for item in data:
        counts['total'] += 1

        # A non-mapping record cannot hold any of the fields; skip it instead
        # of failing on the membership test.
        if not isinstance(item, Mapping):
            continue

        for field in fields:
            value = item.get(field)
            # Missing keys, None, non-strings and whitespace-only strings are
            # all treated as "not valid".
            if isinstance(value, str) and value.strip():
                counts[f'valid_{field}'] += 1

    return counts

def count_metadata_by_person(metadata_path):
    """Count the entries per person in a ``metadata.jsonl`` file.

    Each non-blank line is parsed as a JSON object; its ``'person'`` value
    (or ``'unknown'`` when the key is absent) is used as the grouping key.

    Args:
        metadata_path: Path to the JSON Lines metadata file.

    Returns:
        A plain dict mapping person -> number of entries. An empty dict is
        returned when ``metadata_path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Always a plain dict, so the missing-file and normal paths return the
    # same type and a caller's ``counts[key]`` never silently inserts a key.
    person_counts = {}

    if not os.path.exists(metadata_path):
        return person_counts

    with open(metadata_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            person = json.loads(line).get('person', 'unknown')
            person_counts[person] = person_counts.get(person, 0) + 1

    return person_counts

def count_embeddings(npy_path):
    """Return the number of embeddings stored in an ``.npy`` file.

    The count is the size of the array's first dimension.

    Args:
        npy_path: Path to the ``.npy`` file.

    Returns:
        The first-dimension length of the stored array, or 0 when the file
        does not exist or cannot be read (the error is printed, not raised).
    """
    if not os.path.exists(npy_path):
        return 0

    try:
        # Only the shape is needed, so memory-map the file rather than
        # reading the whole array into memory.
        embeddings = np.load(npy_path, mmap_mode='r')
        return embeddings.shape[0]  # first dimension = number of embeddings
    except Exception as e:
        # Best-effort: any load or shape failure is reported and counted as 0.
        print(f"Error loading {npy_path}: {e}")
        return 0

def _read_jsonl(path):
    """Return the decoded records of a JSON Lines file, skipping blank lines."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _print_consistency(label, metadata_total, embeddings_count):
    """Print one line comparing a metadata entry total with an embeddings count."""
    if metadata_total == embeddings_count:
        verdict = " ✓ 일치"
    else:
        verdict = f" ✗ 불일치 (차이: {abs(metadata_total - embeddings_count)})"
    print(f"{label} - metadata entries vs embeddings: {metadata_total} vs {embeddings_count}{verdict}")


def main():
    """Summarise scraping and vector-store counts per person into a CSV.

    Reads every ``*.jsonl`` file in ``guardian_top100_scraping`` (the file
    stem is used as the person name), the ``metadata.jsonl`` and
    ``embeddings.npy`` files under ``vector_headlines`` and
    ``vector_chunking``, writes one row per person to ``data_summary.csv``,
    and prints a preview, totals, and a metadata-vs-embeddings consistency
    check.
    """
    column_order = [
        'person',
        'scraping_total',
        'scraping_valid_headline',
        'scraping_valid_webPublicationDate',
        'scraping_valid_bodyText',
        'vector_headlines_count',
        'vector_chunking_count'
    ]

    # Per-person scraping counts, keyed by the JSONL file stem.
    scraped = {}
    scraping_dir = Path('guardian_top100_scraping')

    if scraping_dir.exists():
        for jsonl_file in sorted(scraping_dir.glob('*.jsonl')):
            counts = count_valid_fields(_read_jsonl(jsonl_file))
            scraped[jsonl_file.stem] = {
                'scraping_total': counts['total'],
                'scraping_valid_headline': counts['valid_headline'],
                'scraping_valid_webPublicationDate': counts['valid_webPublicationDate'],
                'scraping_valid_bodyText': counts['valid_bodyText']
            }

    # Per-person entry counts from the two vector stores' metadata.
    headlines_counts = count_metadata_by_person('vector_headlines/metadata.jsonl')
    chunking_counts = count_metadata_by_person('vector_chunking/metadata.jsonl')

    # Number of stored embeddings in each vector store.
    headlines_embeddings_count = count_embeddings('vector_headlines/embeddings.npy')
    chunking_embeddings_count = count_embeddings('vector_chunking/embeddings.npy')

    # A person appears in the output if any of the three sources mentions them.
    all_persons = set(scraped) | set(headlines_counts) | set(chunking_counts)

    # Used for persons that only appear in the vector metadata.
    no_scraping = {
        'scraping_total': 0,
        'scraping_valid_headline': 0,
        'scraping_valid_webPublicationDate': 0,
        'scraping_valid_bodyText': 0
    }

    final_results = []
    for person in sorted(all_persons):
        # Build a fresh row so the scraped counts are not mutated in place.
        row = {'person': person, **scraped.get(person, no_scraping)}
        row['vector_headlines_count'] = headlines_counts.get(person, 0)
        row['vector_chunking_count'] = chunking_counts.get(person, 0)
        final_results.append(row)

    # Passing columns= fixes the column order and keeps the frame well-formed
    # when there are no rows; selecting the columns from an empty,
    # column-less frame would raise KeyError.
    df = pd.DataFrame(final_results, columns=column_order)

    # utf-8-sig writes a BOM at the start of the CSV.
    output_file = 'data_summary.csv'
    df.to_csv(output_file, index=False, encoding='utf-8-sig')

    print(f"분석 완료! 결과가 '{output_file}'에 저장되었다.")
    print(f"\n총 {len(final_results)}명의 person data를 처리했다.")
    print(f"\nPreview:")
    print(df.head(10))

    # Totals
    headlines_metadata_total = df['vector_headlines_count'].sum()
    chunking_metadata_total = df['vector_chunking_count'].sum()

    print(f"\n=== 통계 요약 ===")
    print(f"총 scraping articles: {df['scraping_total'].sum()}")
    print(f"총 vector_headlines entries: {headlines_metadata_total}")
    print(f"총 vector_chunking entries: {chunking_metadata_total}")

    # Embedding counts
    print(f"\n=== Embeddings.npy 정보 ===")
    print(f"vector_headlines/embeddings.npy: {headlines_embeddings_count}개의 embeddings")
    print(f"vector_chunking/embeddings.npy: {chunking_embeddings_count}개의 embeddings")

    # Consistency: metadata entry totals should equal the embedding counts.
    print(f"\n=== 일관성 확인 ===")
    _print_consistency('vector_headlines', headlines_metadata_total, headlines_embeddings_count)
    _print_consistency('vector_chunking', chunking_metadata_total, chunking_embeddings_count)

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()