In [None]:
import os
# Placeholder credential for the Guardian API. This assignment overwrites any
# GUARDIAN_API_KEY already present in the process environment, and because the
# placeholder is a non-empty string the `assert API_KEY` check in the setup
# cell still passes with it. Replace it locally; never commit a real key.
os.environ['GUARDIAN_API_KEY'] = "YOUR_API_KEY_HERE"

## 1. Setup & Config

In [None]:
import re, json, time, pathlib
from datetime import datetime, date
from typing import Dict, List, Tuple, Iterable
import requests
import pandas as pd

# The API key is read from the environment and whitespace-trimmed; an unset
# variable yields "" and trips the assert below.
API_KEY = os.getenv("GUARDIAN_API_KEY", "").strip()
assert API_KEY, "Set GUARDIAN_API_KEY environment variable."

# Search endpoint requested by guardian_get().
BASE_SEARCH = "https://content.guardianapis.com/search"
# Output schema: the same names, in the same order, as the keys of the dict
# built by to_row().
# NOTE(review): not referenced anywhere else in the visible notebook — confirm
# whether it is still needed.
OUTPUT_COLS = [
    "id","webPublicationDate","headline","trailText","bodyText",
    "webTitle","webUrl","apiUrl","wordcount"
]

def slugify(s: str) -> str:
    """Turn arbitrary text into a slug that is safe to use as a file name.

    The text is lower-cased, every run of characters outside ``0-9A-Za-z`` is
    collapsed into a single underscore, and leading/trailing underscores are
    removed. Returns ``"query"`` when nothing is left after that.
    """
    lowered = s.lower()
    collapsed = re.sub(r'[^0-9A-Za-z]+', '_', lowered)
    trimmed = collapsed.strip('_')
    if not trimmed:
        return "query"
    return trimmed

def year_slices(start_date: str, end_date: str) -> List[Tuple[int, str, str]]:
    """Split an inclusive date range into one slice per calendar year.

    Args:
        start_date: First day of the range, formatted ``YYYY-MM-DD``.
        end_date: Last day of the range, formatted ``YYYY-MM-DD``.

    Returns:
        A list of ``(year, slice_start, slice_end)`` tuples in ascending year
        order. Both dates are ISO strings, clipped to the requested range.

    Raises:
        AssertionError: If ``start_date`` is later than ``end_date``.
        ValueError: If either date does not match ``YYYY-MM-DD``.
    """
    first_day = datetime.strptime(start_date, "%Y-%m-%d").date()
    last_day = datetime.strptime(end_date, "%Y-%m-%d").date()
    assert first_day <= last_day, "start_date must be <= end_date"
    return [
        (
            year,
            max(first_day, date(year, 1, 1)).isoformat(),
            min(last_day, date(year, 12, 31)).isoformat(),
        )
        for year in range(first_day.year, last_day.year + 1)
    ]

def guardian_get(params: Dict, max_retries: int = 6) -> Dict:
    """Call the Guardian search endpoint, retrying transient failures.

    A copy of ``params`` is sent with the API key added, so the caller's dict
    is never modified. Retries use exponential backoff starting at 1.5 seconds
    and are triggered by HTTP 429/502/503/504 as well as by connection errors
    and timeouts raised by ``requests``.

    Args:
        params: Query parameters for the search endpoint (without the API key).
        max_retries: Maximum number of attempts. Values below 1 are treated as
            1 so that at least one request is always made.

    Returns:
        The decoded JSON body of the first HTTP 200 response.

    Raises:
        requests.HTTPError: Immediately, for a non-retryable error status.
        requests.ConnectionError / requests.Timeout: If the final attempt
            still fails at the network level.
        RuntimeError: If every attempt ended with a retryable HTTP status.
    """
    query = dict(params)
    query["api-key"] = API_KEY
    attempts = max(1, max_retries)
    delay = 1.5
    last_error = "no response"
    for attempt in range(1, attempts + 1):
        is_last = attempt == attempts
        try:
            r = requests.get(BASE_SEARCH, params=query, timeout=30)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as exc:
            if is_last:
                raise
            # Keep only the type name: the exception text can embed the full
            # request URL, which includes the API key.
            last_error = type(exc).__name__
        else:
            if r.status_code == 200:
                return r.json()
            if r.status_code not in (429, 502, 503, 504):
                r.raise_for_status()
            last_error = f"{r.status_code} {r.text[:200]}"
        # Back off only when another attempt follows; sleeping after the last
        # failure would just delay the error.
        if not is_last:
            time.sleep(delay)
            delay *= 2
    raise RuntimeError(f"Guardian API failed: {last_error}")

## 2. Fetch & Save Functions


In [None]:
def iter_results(q: str, from_date: str, to_date: str,
                 query_fields=("headline","body"), page_size: int = 200) -> Iterable[Dict]:
    """Yield every search hit for a query, walking all result pages in order.

    Args:
        q: Search text; it is lower-cased before being sent.
        from_date: Inclusive lower bound passed as ``from-date``.
        to_date: Inclusive upper bound passed as ``to-date``.
        query_fields: Field names joined with commas into ``query-fields``.
        page_size: Value sent as ``page-size``.

    Yields:
        The raw result dicts from ``response.results``, page after page. The
        number of pages is taken from the first response only.
    """
    search_params = {
        "q": q.lower(),
        "from-date": from_date,
        "to-date": to_date,
        "page-size": page_size,
        "order-by": "newest",
        "use-date": "published",
        "query-fields": ",".join(query_fields),
        "show-fields": "headline,trailText,bodyText,thumbnail,wordcount",
        "show-tags": "all",
    }

    # The first request carries no "page" parameter and reports the page count.
    first_page = guardian_get(search_params).get("response", {})
    total_pages = int(first_page.get("pages", 0)) or 0
    yield from first_page.get("results", [])

    # Remaining pages reuse the same parameter dict with "page" updated.
    page_no = 2
    while page_no <= total_pages:
        search_params["page"] = page_no
        payload = guardian_get(search_params)
        yield from payload.get("response", {}).get("results", [])
        page_no += 1

def to_row(it: Dict) -> Dict:
    """Flatten one search result into the output row schema.

    Top-level attributes are read from the result itself and the rest from its
    ``fields`` sub-dict (a missing or null ``fields`` is treated as empty).
    Absent values become ``None``.
    """
    fields = it.get("fields") or {}
    top_level = ("id", "webPublicationDate", "webTitle", "webUrl", "apiUrl")
    column_order = (
        "id", "webPublicationDate", "headline", "trailText", "bodyText",
        "webTitle", "webUrl", "apiUrl", "wordcount",
    )
    row = {}
    for column in column_order:
        source = it if column in top_level else fields
        row[column] = source.get(column)
    return row

def crawl_and_save(query: str, start_date: str, end_date: str,
                   out_dir: str = "guardian_raw_scraping",
                   query_fields=("headline","body")) -> None:
    """Crawl one query year by year and save the de-duplicated rows as JSONL.

    The output is ``<out_dir>/<slug>.jsonl`` with one JSON object per line
    (only JSONL is produced; no CSV is written). The file is first written
    under a temporary name and then renamed, so an error during writing never
    leaves a truncated ``.jsonl`` behind that a later run would take for a
    finished crawl.

    Args:
        query: Search text; also used (slugified) as the output file name.
        start_date: First day to crawl, ``YYYY-MM-DD``.
        end_date: Last day to crawl, ``YYYY-MM-DD``.
        out_dir: Directory for the output file; created if missing.
        query_fields: Fields forwarded to ``iter_results``.
    """
    slug = slugify(query)
    base = pathlib.Path(out_dir)
    base.mkdir(parents=True, exist_ok=True)

    # Collect results across all years, keeping the first hit per article id.
    seen, rows = set(), []
    for y, y_start, y_end in year_slices(start_date, end_date):
        print(f"  [{query}] Crawling {y}: {y_start} ~ {y_end}")
        for it in iter_results(query, y_start, y_end, query_fields=query_fields):
            _id = it.get("id")
            if _id in seen:
                continue
            seen.add(_id)
            rows.append(to_row(it))

    # Write to a sibling temp file, then rename it into place.
    jsonl_path = base / f"{slug}.jsonl"
    tmp_path = base / f"{slug}.jsonl.tmp"
    try:
        with tmp_path.open("w", encoding="utf-8") as jf:
            for r in rows:
                jf.write(json.dumps(r, ensure_ascii=False) + "\n")
        tmp_path.replace(jsonl_path)
    finally:
        # Only present here if writing or renaming failed.
        if tmp_path.exists():
            tmp_path.unlink()

    print(f"  [{query}] Total rows={len(rows)}  JSONL={jsonl_path.name}")

## 3. Batch Crawl & Summary

In [None]:

def _read_jsonl_rows(jsonl_path: pathlib.Path) -> List[Dict]:
    """Load every non-blank line of a JSONL file as one JSON value."""
    with jsonl_path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _count_rows_by_year(rows: List[Dict]) -> Dict[int, int]:
    """Map publication year -> number of rows, using 'webPublicationDate'."""
    if not rows:
        return {}
    published = pd.to_datetime(pd.Series([r.get("webPublicationDate") for r in rows]))
    # value_counts() drops rows whose date is missing.
    return {int(y): int(n) for y, n in published.dt.year.value_counts().items()}


def batch_crawl_and_summary(
    people: List[str],
    start_date: str = "2017-01-01",
    end_date: str = "2019-12-31",
    out_dir: str = "guardian_scraping",
    query_fields=("headline","body"),
    skip_existing: bool = True,
) -> Dict[str, str]:
    """Crawl several people and write a per-person, per-year summary CSV.

    For each name, ``<out_dir>/<slug>.jsonl`` is produced via
    ``crawl_and_save`` unless it already exists and ``skip_existing`` is set.
    The JSONL file is then read back and its rows are counted per year. The
    summary has one column per calendar year between ``start_date`` and
    ``end_date`` (``2017``, ``2018``, ``2019`` with the defaults) plus
    ``total``, which counts every row in the file.

    Args:
        people: Names to crawl; each is used as the search query.
        start_date: First day of the crawl range, ``YYYY-MM-DD``.
        end_date: Last day of the crawl range, ``YYYY-MM-DD``.
        out_dir: Directory holding the per-person JSONL files.
        query_fields: Fields forwarded to ``crawl_and_save``.
        skip_existing: When true, names whose JSONL file exists are not re-crawled.

    Returns:
        ``{"summary": <path>}`` where the path is the string
        ``"guardian_raw_scraping_summary.csv"``, relative to the working directory.
    """
    out_base = pathlib.Path(out_dir)
    out_base.mkdir(parents=True, exist_ok=True)

    # Year columns follow the requested range instead of a fixed 2017-2019.
    first_year = datetime.strptime(start_date, "%Y-%m-%d").year
    last_year = datetime.strptime(end_date, "%Y-%m-%d").year
    years = list(range(first_year, last_year + 1))

    summary_data = []

    for i, p in enumerate(people, 1):
        slug = slugify(p)
        jsonl_path = out_base / f"{slug}.jsonl"

        # Skip the crawl when a file for this person already exists.
        if skip_existing and jsonl_path.exists():
            print(f"\n[{i}/{len(people)}] SKIP (exists): {p}")
        else:
            print(f"\n[{i}/{len(people)}] Crawling: {p}")
            crawl_and_save(
                query=p,
                start_date=start_date,
                end_date=end_date,
                out_dir=out_dir,
                query_fields=query_fields,
            )

        # Count articles per year from the JSONL file.
        total_count, per_year = 0, {}
        if jsonl_path.exists():
            try:
                rows = _read_jsonl_rows(jsonl_path)
                # Set the total first so it survives a failure in date parsing.
                total_count = len(rows)
                per_year = _count_rows_by_year(rows)
            except Exception as e:
                print(f"  Warning: Could not read {jsonl_path.name}: {e}")

        record = {"person": p, "slug": slug}
        for year in years:
            record[str(year)] = per_year.get(year, 0)
        record["total"] = total_count
        summary_data.append(record)

    # Explicit columns keep the CSV header (and the sums below) valid even
    # when `people` is empty.
    year_cols = [str(year) for year in years]
    df_summary = pd.DataFrame(summary_data, columns=["person", "slug", *year_cols, "total"])
    summary_path = "guardian_raw_scraping_summary.csv"
    df_summary.to_csv(summary_path, index=False, encoding="utf-8-sig")

    print(f"\n\nSaved summary: {summary_path}")
    print(f"Total people crawled: {len(people)}")
    print(f"Total articles: {df_summary['total'].sum()}")
    for col in year_cols:
        print(f"  - {col}: {df_summary[col].sum()}")

    return {"summary": summary_path}

## 4. Run

In [None]:
# Read people_list.txt: one name per line; blank lines are ignored.
with open('people_list.txt', 'r', encoding='utf-8') as f:
    content = f.read()

people_list = content.split('\n')
people_list = [name.strip() for name in people_list if name.strip()]

# Count occurrences in a single pass. A dict keeps first-seen order, so the
# crawl order is the file order on every run (a set would reshuffle names
# between kernel restarts).
name_counts = {}
for name in people_list:
    name_counts[name] = name_counts.get(name, 0) + 1

duplicates = [name for name, count in name_counts.items() if count > 1]
people = list(name_counts)

print(f"Total: {len(people_list)} names")
print(f"Unique: {len(people)} people")
print(f"Duplicates: {len(people_list) - len(people)}")
if duplicates:
    print(f"Duplicate names: {duplicates}")
print(f"\nCrawling list: {people}")
print()

# For a quick smoke test, pass a short explicit list instead,
# e.g. people = ["Samantha Bee", "Constance Wu"].
# Run the crawl.
paths = batch_crawl_and_summary(
    people=people,
    start_date="2017-01-01",
    end_date="2019-12-31",
    out_dir="../guardian_raw_scraping",
)

## 5. Extract Top 100 & Copy Files

In [None]:
import shutil

# Load the per-person summary written by the batch crawl.
df = pd.read_csv('guardian_raw_scraping_summary.csv')

# Sort by total article count (descending) and keep the first 100 rows.
df_sorted = df.sort_values('total', ascending=False).head(100)
top100_people = df_sorted['person'].tolist()

print(f"Top 100 people by total articles: {len(top100_people)}")
print(f"Total articles: {df_sorted['total'].sum()}")
print(f"\nTop 10:")
print(df_sorted[['person', 'total']].head(10).to_string(index=False))

# Save the selected names, one per line.
with open('people_top100_list.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(top100_people))
print(f"\nSaved: people_top100_list.txt")

# Destination folder for the selected JSONL files.
new_dir = '../guardian_top100_scraping'
os.makedirs(new_dir, exist_ok=True)

# Copy each selected person's JSONL file. Slugs are taken straight from the
# selected rows rather than re-filtering the frame once per person.
copied_count = 0
missing_slugs = []
for slug in df_sorted['slug']:
    src_file = f'../guardian_raw_scraping/{slug}.jsonl'
    dst_file = f'{new_dir}/{slug}.jsonl'

    if os.path.exists(src_file):
        shutil.copy2(src_file, dst_file)
        copied_count += 1
    else:
        missing_slugs.append(slug)

print(f"\nCopied {copied_count} jsonl files to {new_dir}/")
# Report files that were expected but absent instead of skipping them silently.
if missing_slugs:
    print(f"Missing source files ({len(missing_slugs)}): {missing_slugs}")