In [2]:
import pandas as pd
import numpy as np
import requests
from owid.catalog import Dataset
from etl.paths import BASE_DIR, DAG_DIR, DATA_DIR

from typing import List, Dict, Any

## 0. What are the available country names?

## 1. Get all the tags for a country

In [23]:
# Example: list the tag IDs returned for the title "Singapore".
# NOTE(review): get_tags_for_title is defined in the next cell, so on a fresh
# kernel this call only works after that cell has been executed.
get_tags_for_title("Singapore")

['first-stop-singapore/first-stop-singapore',
 'media-network/activate-singapore',
 'singapore-invites/singapore-invites',
 'travel/series/singapore-city-guide',
 'travel/singapore',
 'visit-singapore/visit-singapore',
 'weather/republicofsingapore',
 'weather/singapore',
 'world/singapore',
 'world/singapore-changi-airport',
 'your-singapore/your-singapore']

In [24]:
# Get all tags for a given title
def get_tags_for_title(title: str, verbose: bool = False):
    """Return the IDs of the Guardian tags matching a web title.

    Sends one request to the Guardian tags endpoint, filtered by `web-title`,
    and collects the `id` of every returned tag whose `type` is not
    "contributor".

    Parameters
    ----------
    title : str
        Value sent as the `web-title` query parameter (e.g. a country name).
    verbose : bool, default False
        If True, print the tags found as a bulleted list.

    Returns
    -------
    list of str
        Tag IDs, in the order returned by the API.

    Raises
    ------
    KeyError
        If the JSON payload has no "response" key, or the response has no
        "results" key.
    AssertionError
        If the reported total number of tags is not below the requested page
        size, i.e. a single page may not contain all of them.
    """
    api_tags_url = "https://content.guardianapis.com/tags"
    LIM_PAGE_SIZE = 1000

    data = requests.get(
        api_tags_url,
        params={"api-key": API_KEY, "web-title": title, "page-size": LIM_PAGE_SIZE},
    ).json()

    if "response" not in data:
        raise KeyError("No response!")
    if "results" not in data["response"]:
        raise KeyError("No results!")

    response = data["response"]
    results = response["results"]

    # Get num tags and tags (no contributor tags)
    num_tags = response["total"]
    tags = [t["id"] for t in results if t["type"] not in {"contributor"}]

    # Only the first page is read, so make sure it can hold every tag.
    assert num_tags < LIM_PAGE_SIZE, "Too many tags!"

    ## Display
    if verbose:
        # The original message referenced an undefined name (`country`);
        # the value being searched for is `title`.
        tags_blist = "\n- ".join(tags)
        print(f"There are {len(tags)} tags for title {title}:\n- {tags_blist}")

    return tags

def get_pages_from_tags(tags: List[str], lazy: bool = True):
    """Count (and optionally list) the Guardian content items carrying any of `tags`.

    The tags are OR-combined with "|" and sent as the `tag` parameter of the
    Guardian search endpoint.

    Parameters
    ----------
    tags : list of str
        Tag IDs to search for.
    lazy : bool, default True
        If True, only the first result page is requested and no IDs are
        collected. If False, every result page is fetched and the content IDs
        are gathered.

    Returns
    -------
    tuple
        `(num_pages, page_ids)`, where `num_pages` is the `total` field
        reported by the API and `page_ids` is a set of content IDs (empty
        when `lazy` is True or when `tags` is empty).

    Raises
    ------
    KeyError
        If the JSON payload has no "response" key.
    AssertionError
        If the response has no "results" key.
    """
    # Nothing to search for: avoid sending a request with an empty `tag` filter.
    if not tags:
        return 0, set()

    api_content_url = "https://content.guardianapis.com/search"

    # Get list of articles for each tag
    tag_or = "|".join(tags)
    params = {"api-key": API_KEY, "tag": tag_or, "page-size": 200}
    data = requests.get(api_content_url, params=params).json()

    # Sanity check
    if "response" not in data:
        raise KeyError("No response!")

    response = data["response"]
    assert "results" in response

    # Number of pages with given tags
    num_pages = response["total"]

    # Get page IDs (always a set, so both branches return the same type)
    if not lazy:
        page_ids = _get_page_ids(api_content_url, params, response)
    else:
        page_ids = set()
    return num_pages, page_ids


def _get_page_ids(api_url: str, params: Dict[str, Any], response: Dict["str", Any]):
    # Sanity check
    assert "results" in response, "'results' not found in response"
    results = response["results"]

    # Initialise set of IDs
    page_ids = {r["id"] for r in results}

    # Get IDs for remaining pages
    if response["pages"] > 1:
        for page in range(2, response["pages"] + 1):
            data = requests.get(api_url, params=params | {"page": page}).json()
            assert "response" in data, "'response' missing"
            page_ids |= {r["id"] for r in data["response"]["results"]}
    return page_ids

In [10]:
# Example: tags for one country, then the number of content items with those tags.
tags = get_tags_for_title("North Macedonia")
# get_pages_from_tags returns a (num_pages, page_ids) tuple; unpack it so that
# `num_pages` holds the count rather than the whole tuple.
num_pages, page_ids = get_pages_from_tags(tags)

## Get list of country names

In [5]:
# Read the "regions" table of the garden regions dataset from disk
tb_regions = Dataset(DATA_DIR / "garden/regions/2023-01-01/regions")["regions"]
# Keep only the rows typed as "country" that are not flagged as historical
tb_regions = tb_regions[
    ~tb_regions["is_historical"] & (tb_regions["region_type"] == "country")
]
# Build the sorted list of unique country names
tb = tb_regions.reset_index()
countries = list(set(tb.name))
countries.sort()

In [35]:
def get_tags_for_countries(countries):
    """Look up the tags of each country name in `countries`.

    Returns a dict keyed by country name. The value is the list returned by
    `get_tags_for_title`, or the string "Error" when that call raised KeyError.
    Every tenth country name is printed as a progress marker.
    """
    result = {}
    for position, name in enumerate(countries):
        # Progress marker: one name out of every ten
        if position % 10 == 0:
            print(name)
        try:
            result[name] = get_tags_for_title(name)
        except KeyError:
            # Flag the failure so the country can be retried later
            print(f"> Error for {name}")
            result[name] = "Error"
    return result

# Fetch the tags of every country in `countries` (one tags request per country).
tags_by_country = get_tags_for_countries(countries)

Afghanistan
Armenia
Belgium
Brazil
Cape Verde
Congo
Denmark
Estonia
French Southern Territories
Grenada
Honduras
Israel
Kuwait
Luxembourg
Mauritania
Morocco
Niger
Palau
Puerto Rico
Saint Martin (French part)
Sierra Leone
South Sudan
Tanzania
Tuvalu
Vatican


In [37]:
# Countries whose tag lookup failed are stored with the sentinel value "Error".
countries_err = [k for k, v in tags_by_country.items() if v == "Error"]
print(f"{len(countries_err)} countries with error!")

# Re-run for errors
for country in countries_err:
    print(country)
    try:
        tags = get_tags_for_title(country)
    except KeyError:
        print("> Error")
        tags_by_country[country] = "Error"
    else:
        # Store the result of a successful retry; previously it was discarded
        # and the entry stayed "Error".
        tags_by_country[country] = tags

0 countries with error!


In [38]:
# Display the country -> tags mapping built by get_tags_for_countries.
tags_by_country

{'Afghanistan': ['global-development/series/afghanistan-london-conference',
  'global-development/series/women-report-afghanistan',
  'sport/afghanistan-cricket-team',
  'travel/afghanistan',
  'weather/afghanistan',
  'world/afghanistan',
  'world/afghanistantimeline',
  'world/series/afghanistan-other-voices',
  'world/series/afghanistan-the-left-behind',
  'world/series/john-d-mchugh-afghanistan',
  'world/the-war-logs'],
 'Aland Islands': [],
 'Albania': ['football/albania',
  'travel/albania',
  'weather/albania',
  'world/albania'],
 'Algeria': ['football/algeria',
  'travel/algeria',
  'weather/algeria',
  'world/algeria',
  'world/algerian-hostage-crisis'],
 'American Samoa': ['travel/americansamoa'],
 'Andorra': ['football/andorra',
  'travel/andorra',
  'weather/andorra',
  'weather/andorraandorra',
  'world/andorra'],
 'Angola': ['football/angola',
  'travel/angola',
  'weather/angola',
  'world/angola'],
 'Anguilla': ['travel/anguilla'],
 'Antigua and Barbuda': ['travel/ant