In [16]:
import os
# os.environ['GUARDIAN_API_KEY'] = "YOUR_API_KEY_HERE"
os.environ['GUARDIAN_API_KEY'] = 'cfc29433-1765-41ac-8726-14d5ce438b9d'

In [17]:
# =========================
# Cell 2) Setup · config · small utils
# =========================

import os, re, json, time, pathlib, unicodedata
from datetime import datetime, date
from typing import Dict, List, Tuple, Iterable, Optional
import requests
import pandas as pd

API_KEY = os.getenv("GUARDIAN_API_KEY", "").strip()
assert API_KEY, "Set GUARDIAN_API_KEY environment variable."

BASE_SEARCH = "https://content.guardianapis.com/search"
OUTPUT_COLS = [
    "id","webPublicationDate","headline","trailText","bodyText",
    "webTitle","webUrl","apiUrl","wordcount","tags_titles","tags_types"
]

def slugify(s: str) -> str:
    """Simple, filesystem-safe slug."""
    s = re.sub(r'[^0-9A-Za-z]+', '_', s.lower()).strip('_')
    return s or "query"

def year_slices(start_date: str, end_date: str) -> List[Tuple[int, str, str]]:
    """Split [start_date, end_date] into per-year [start, end] strings."""
    sd = datetime.strptime(start_date, "%Y-%m-%d").date()
    ed = datetime.strptime(end_date, "%Y-%m-%d").date()
    assert sd <= ed, "start_date must be <= end_date"
    out = []
    y = sd.year
    while y <= ed.year:
        s = max(sd, date(y,1,1))
        e = min(ed, date(y,12,31))
        out.append((y, s.isoformat(), e.isoformat()))
        y += 1
    return out

def guardian_get(params: Dict, max_retries: int = 6) -> Dict:
    """GET /search with backoff for transient errors."""
    p = dict(params); p["api-key"] = API_KEY
    sleep = 1.5
    for _ in range(max_retries):
        r = requests.get(BASE_SEARCH, params=p, timeout=30)
        if r.status_code == 200:
            return r.json()
        if r.status_code in (429, 502, 503, 504):
            time.sleep(sleep); sleep *= 2; continue
        r.raise_for_status()
    raise RuntimeError(f"Guardian API failed: {r.status_code} {r.text[:200]}")

In [18]:
# =========================
# Cell 3) Fetch, normalize, and save per-year
# =========================

def iter_results(q: str, from_date: str, to_date: str,
                 query_fields=("headline","body"), page_size: int = 200) -> Iterable[Dict]:
    """Yield /search items over all pages."""
    params = {
        "q": q.lower(),
        "from-date": from_date,
        "to-date": to_date,
        "page-size": page_size,   # max 200
        "order-by": "newest",
        "use-date": "published",
        "query-fields": ",".join(query_fields),
        "show-fields": "headline,trailText,bodyText,thumbnail,wordcount",
        "show-tags": "all",
    }
    js = guardian_get(params)
    resp = js.get("response", {})
    pages = int(resp.get("pages", 0)) or 0
    for it in resp.get("results", []):
        yield it
    for p in range(2, pages + 1):
        params["page"] = p
        js = guardian_get(params)
        for it in js.get("response", {}).get("results", []):
            yield it

def to_row(it: Dict) -> Dict:
    """Map a search item into the required output schema."""
    f = it.get("fields") or {}
    tags = it.get("tags") or []
    return {
        "id": it.get("id"),
        "webPublicationDate": it.get("webPublicationDate"),
        "headline": f.get("headline"),
        "trailText": f.get("trailText"),
        "bodyText": f.get("bodyText"),
        "webTitle": it.get("webTitle"),
        "webUrl": it.get("webUrl"),
        "apiUrl": it.get("apiUrl"),
        "wordcount": f.get("wordcount"),
        "tags_titles": "|".join([t.get("webTitle") for t in tags if t.get("webTitle")]) or None,
        "tags_types":  "|".join([t.get("type") for t in tags if t.get("type")]) or None,
    }

def crawl_and_save(query: str, start_date: str, end_date: str,
                   out_dir: str = "guardian_dump",
                   query_fields=("headline","body"),
                   extra_aliases: Optional[List[str]] = None) -> None:
    """
    Orchestrates the whole pipeline:
      - builds boolean query
      - loops per-year
      - writes {slug}/{slug}_YEAR.jsonl and .csv with the fixed column order
    """
    slug = slugify(query)
    base = pathlib.Path(out_dir) / slug
    base.mkdir(parents=True, exist_ok=True)
    # boolean_q = build_boolean_query(query, extra_aliases)
    boolean_q = query # Just for test
    for y, y_start, y_end in year_slices(start_date, end_date):
        # print(f"[{query}] Starting {y} data crawling: {y_start} ~ {y_end}")
        seen, rows = set(), []
        for it in iter_results(boolean_q, y_start, y_end, query_fields=query_fields):
            _id = it.get("id")
            if _id in seen: 
                continue
            seen.add(_id)
            rows.append(to_row(it))

        # write JSONL
        jsonl_path = base / f"{slug}_{y}.jsonl"
        with jsonl_path.open("w", encoding="utf-8") as jf:
            for r in rows:
                jf.write(json.dumps(r, ensure_ascii=False) + "\n")

        # write CSV with exact column order
        df = pd.DataFrame(rows).reindex(columns=OUTPUT_COLS)
        csv_path = base / f"{slug}_{y}.csv"
        df.to_csv(csv_path, index=False, encoding="utf-8-sig")

        print(f"[{y}] rows={len(rows)}  JSONL={jsonl_path.name}  CSV={csv_path.name}")


In [19]:
# =========================
# Cell 3) Query augmentation with automatic accent handling (simplified)
# =========================

def remove_accents(text: str) -> str:
    """Remove accents from text (é->e, í->i, á->a, etc.)"""
    nfd = unicodedata.normalize('NFD', text)
    return ''.join(char for char in nfd if unicodedata.category(char) != 'Mn')

def _tokens(name: str) -> List[str]:
    name = re.sub(r"[_/]", " ", name)
    name = re.sub(r"[\"'""']", "", name)
    name = re.sub(r"\s+", " ", name).strip().lower()
    return name.split()

# def _variants(tokens: List[str]) -> List[str]:
#     # Query augmentation disabled - only use exact names
#     if not tokens: return []
#     t = [x.strip(".") for x in tokens if x]
#     base = {" ".join(t), "-".join(t), "_".join(t)}
#     if len(t) >= 3:
#         base.add(f"{t[0]} {t[-1]}")
#     # initials for all but last
#     initials = [tok[0] for tok in t[:-1] if tok and tok[0].isalpha()]
#     if initials and len(t) >= 2:
#         last = t[-1]
#         base.add(" ".join(initials + [last]))        # j j watt
#         base.add(".".join(initials) + f". {last}")   # j.j. watt
#         base.add(". ".join([i+"." for i in initials]) + f" {last}")  # j. j. watt
#     return sorted(base)

def build_boolean_query(query: str, extra_aliases: Optional[List[str]] = None) -> str:
    """Build query using only exact name + extra aliases + accent-free version"""
    query_lower = query.lower().strip()
    
    aliases = set([query_lower])  # Start with exact query
    
    # Add extra aliases if provided
    if extra_aliases: 
        aliases.update([a.lower().strip() for a in extra_aliases])
    
    # Auto-add accent-free version if original has accents
    if query != remove_accents(query):
        no_accent = remove_accents(query).lower().strip()
        aliases.add(no_accent)

    # Build OR query with all variants
    phrases = [f'"{p}"' for p in sorted(aliases)]
    return " OR ".join(phrases)

In [20]:
# =========================
# Cell 4) Fetch, normalize, and save (combined years)
# =========================

def iter_results(q: str, from_date: str, to_date: str,
                 query_fields=("headline","body"), page_size: int = 200) -> Iterable[Dict]:
    """Yield /search items over all pages."""
    params = {
        "q": q.lower(),
        "from-date": from_date,
        "to-date": to_date,
        "page-size": page_size,
        "order-by": "newest",
        "use-date": "published",
        "query-fields": ",".join(query_fields),
        "show-fields": "headline,trailText,bodyText,thumbnail,wordcount",
        "show-tags": "all",
    }
    js = guardian_get(params)
    resp = js.get("response", {})
    pages = int(resp.get("pages", 0)) or 0
    for it in resp.get("results", []):
        yield it
    for p in range(2, pages + 1):
        params["page"] = p
        js = guardian_get(params)
        for it in js.get("response", {}).get("results", []):
            yield it

def to_row(it: Dict) -> Dict:
    """Map a search item into the required output schema."""
    f = it.get("fields") or {}
    tags = it.get("tags") or []
    return {
        "id": it.get("id"),
        "webPublicationDate": it.get("webPublicationDate"),
        "headline": f.get("headline"),
        "trailText": f.get("trailText"),
        "bodyText": f.get("bodyText"),
        "webTitle": it.get("webTitle"),
        "webUrl": it.get("webUrl"),
        "apiUrl": it.get("apiUrl"),
        "wordcount": f.get("wordcount"),
        "tags_titles": "|".join([t.get("webTitle") for t in tags if t.get("webTitle")]) or None,
        "tags_types":  "|".join([t.get("type") for t in tags if t.get("type")]) or None,
    }

def crawl_and_save(query: str, start_date: str, end_date: str,
                   out_dir: str = "guardian_scrapping",
                   query_fields=("headline","body"),
                   extra_aliases: Optional[List[str]] = None) -> None:
    """
    Orchestrates the whole pipeline:
      - builds boolean query
      - loops through all dates
      - writes {person_name}.jsonl in guardian_scrapping/
      - writes {person_name}.csv in guardian_scrapping/scrapping_csv/
    """
    slug = slugify(query)
    base = pathlib.Path(out_dir)
    base.mkdir(parents=True, exist_ok=True)
    csv_dir = base / "scrapping_csv"
    csv_dir.mkdir(parents=True, exist_ok=True)
    
    # boolean_q = build_boolean_query(query, extra_aliases)
    boolean_q = query
    # Collect all results across all years
    seen, rows = set(), []
    for y, y_start, y_end in year_slices(start_date, end_date):
        print(f"  [{query}] Crawling {y}: {y_start} ~ {y_end}")
        for it in iter_results(boolean_q, y_start, y_end, query_fields=query_fields):
            _id = it.get("id")
            if _id in seen: continue
            seen.add(_id)
            rows.append(to_row(it))

    # write JSONL (in guardian_scrapping/)
    jsonl_path = base / f"{slug}.jsonl"
    with jsonl_path.open("w", encoding="utf-8") as jf:
        for r in rows:
            jf.write(json.dumps(r, ensure_ascii=False) + "\n")

    # write CSV (in guardian_scrapping/scrapping_csv/)
    df = pd.DataFrame(rows).reindex(columns=OUTPUT_COLS)
    csv_path = csv_dir / f"{slug}.csv"
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")

    print(f"  [{query}] Total rows={len(rows)}  JSONL={jsonl_path.name}  CSV={csv_path.name}")

In [21]:
# =========================
# Cell 5) Batch runner + summary CSV (with skip existing files + yearly counts)
# =========================

def batch_crawl_and_summary(
    people: List[str],
    start_date: str = "2017-01-01",
    end_date: str = "2019-12-31",
    out_dir: str = "guardian_scrapping",
    query_fields=("headline","body"),
    aliases_map: Optional[Dict[str, List[str]]] = None,
    skip_existing: bool = True,
) -> Dict[str, pathlib.Path]:
    """Crawl Guardian per person, save combined JSONL/CSV, then build summary CSV."""
    out_base = pathlib.Path(out_dir)
    out_base.mkdir(parents=True, exist_ok=True)
    csv_dir = out_base / "scrapping_csv"
    csv_dir.mkdir(parents=True, exist_ok=True)
    
    if aliases_map is None:
        aliases_map = {}

    summary_data = []
    for i, p in enumerate(people, 1):
        slug = slugify(p)
        jsonl_path = out_base / f"{slug}.jsonl"
        csv_path = csv_dir / f"{slug}.csv"
        
        # Skip if JSONL file already exists
        if skip_existing and jsonl_path.exists():
            print(f"\n[{i}/{len(people)}] SKIP (exists): {p}")
        else:
            print(f"\n[{i}/{len(people)}] Crawling: {p}")
            crawl_and_save(
                query=p,
                start_date=start_date,
                end_date=end_date,
                out_dir=out_dir,
                query_fields=query_fields,
                extra_aliases=aliases_map.get(p, None),
            )
        
        # Count articles by year
        count_2017, count_2018, count_2019, total_count = 0, 0, 0, 0
        if csv_path.exists():
            try:
                df = pd.read_csv(csv_path)
                total_count = len(df)
                # Extract year from webPublicationDate
                df['year'] = pd.to_datetime(df['webPublicationDate']).dt.year
                count_2017 = len(df[df['year'] == 2017])
                count_2018 = len(df[df['year'] == 2018])
                count_2019 = len(df[df['year'] == 2019])
            except Exception as e:
                print(f"  Warning: Could not read {csv_path.name}: {e}")
        
        summary_data.append({
            "person": p,
            "slug": slug,
            "2017": count_2017,
            "2018": count_2018,
            "2019": count_2019,
            "total": total_count,
        })

    # Save summary
    df_summary = pd.DataFrame(summary_data)
    summary_path = "summary_counts.csv"
    df_summary.to_csv(summary_path, index=False, encoding="utf-8-sig")

    print(f"\n\nSaved summary: {summary_path}")
    print(f"Total people crawled: {len(people)}")
    print(f"Total articles: {df_summary['total'].sum()}")
    print(f"  - 2017: {df_summary['2017'].sum()}")
    print(f"  - 2018: {df_summary['2018'].sum()}")
    print(f"  - 2019: {df_summary['2019'].sum()}")

    return {"summary": summary_path}

In [None]:
# ===== Run =====
people = [
    "Tiffany Haddish", "Cameron Kasky", "Jaclyn Corin", "David Hogg", "Emma Gonzalez",
    "Alex Wind", "Kumail Nanjiani", "Cardi B", "Nice Nailantei Lengete", "Chloe Kim",
    "Carl June", "Jan Rader", "Peggy Whitson", "Issa Rae", "Bhavish Aggarwal",
    "Jesmyn Ward", "Ruth Davidson", "Whitney Wolfe Herd", "Marica Branchesi", "Ann McKee",
    "Trevor Noah", "Jian Wei Pan", "Nicole Kidman", "Hugh Jackman", "Gal Gadot",
    "Ryan Coogler", "Sterling Brown", "Millie Bobby Brown", "Kehinde Wiley", "Christian Siriano",
    "Lena Waithe", "Greta Gerwig", "Roseanne Barr", "Shawn Mendes", "Guillermo del Toro",
    "Deepika Padukone", "JR artist", "Jimmy Kimmel", "Judy Chicago", "John Krasinski",
    "Satya Nadella", "Donald Trump", "Prince Harry", "Meghan Markle", "Carmen Yulin Cruz",
    "Mohammed bin Salman", "Sadiq Khan", "Justin Trudeau", "Xi Jinping", "Sean Hannity",
    "Justin James Watt", "Robert Mueller", "Kenneth Frazier", "Nancy Pelosi", "Kim Jong Un",
    "Leo Varadkar", "Emmerson Mnangagwa", "Jacinda Ardern", "Savannah Guthrie", "Hoda Kotb",
    "Shinzo Abe", "Sheikh Hasina", "Jeff Sessions", "Moon Jae in", "Emmanuel Macron",
    "Mauricio Macri", "Scott Pruitt", "Haider al Abadi", "Jennifer Lopez", "Chadwick Boseman",
    "Rihanna", "Adam Rippon", "Tarana Burke", "Cristina Jimenez", "Janet Mock",
    "Kesha", "Kevin Kwan", "Ronan Farrow", "Jodi Kantor", "Megan Twohey",
    "Maxine Waters", "Sinta Nuriyah", "Rachael Denhollander", "Daniela Vega", "Virgil Abloh",
    "Christopher Wylie", "Roger Federer", "Oprah Winfrey", "Jeff Bezos", "Cindy Holland",
    "Kevin Durant", "Elon Musk", "Sonia Friedman", "Giuliano Testa", "Masayoshi Son",
    "Elizabeth Diller", "Virat Kohli", "Adam Neumann", "Pony Ma", "Jose Andres",
]
'''
aliases_map = {
    "Emma Gonzalez": ["Emma González"],
    "Nice Nailantei Lengete": ["Nice Nailantei Leng'ete"],
    "Sterling Brown": ["Sterling K. Brown"],
    "Carmen Yulin Cruz": ["Carmen Yulín Cruz"],
    "Justin James Watt": ["J.J. Watt"],
    "Kenneth Frazier": ["Kenneth C. Frazier"],
    "Cristina Jimenez": ["Cristina Jiménez"],
    "Jose Andres": ["José Andrés"],
}
'''

paths = batch_crawl_and_summary(
    people=people,
    start_date="2017-01-01",
    end_date="2019-12-31",
    out_dir="guardian_scrapping",
    aliases_map=None,
)



[1/100] SKIP (exists): Tiffany Haddish

[2/100] SKIP (exists): Cameron Kasky

[3/100] SKIP (exists): Jaclyn Corin

[4/100] SKIP (exists): David Hogg

[5/100] SKIP (exists): Emma Gonzalez

[6/100] SKIP (exists): Alex Wind

[7/100] SKIP (exists): Kumail Nanjiani

[8/100] SKIP (exists): Cardi B

[9/100] SKIP (exists): Nice Nailantei Lengete

[10/100] SKIP (exists): Chloe Kim

[11/100] SKIP (exists): Carl June

[12/100] SKIP (exists): Jan Rader

[13/100] SKIP (exists): Peggy Whitson

[14/100] SKIP (exists): Issa Rae

[15/100] SKIP (exists): Bhavish Aggarwal

[16/100] SKIP (exists): Jesmyn Ward

[17/100] SKIP (exists): Ruth Davidson

[18/100] SKIP (exists): Whitney Wolfe Herd

[19/100] SKIP (exists): Marica Branchesi

[20/100] SKIP (exists): Ann McKee

[21/100] SKIP (exists): Trevor Noah

[22/100] SKIP (exists): Jian Wei Pan

[23/100] SKIP (exists): Nicole Kidman

[24/100] SKIP (exists): Hugh Jackman

[25/100] SKIP (exists): Gal Gadot

[26/100] SKIP (exists): Ryan Coogler

[27/100] SKIP (e

Unnamed: 0,person,slug,2017,2018,2019,total
0,Tiffany Haddish,tiffany_haddish,152,153,178,483
1,Cameron Kasky,cameron_kasky,55,59,55,169
2,Jaclyn Corin,jaclyn_corin,9,30,24,63
3,David Hogg,david_hogg,890,755,629,2274
4,Emma Gonzalez,emma_gonzalez,1711,1664,1857,5232
...,...,...,...,...,...,...
95,Elizabeth Diller,elizabeth_diller,1612,1524,2318,5454
96,Virat Kohli,virat_kohli,149,246,198,593
97,Adam Neumann,adam_neumann,124,111,150,385
98,Pony Ma,pony_ma,639,508,477,1624
