# Setup

In [8]:
# Navigate file system
import os
import json

# Access API secrets in .env file
%load_ext dotenv
%dotenv

# Accessing API
import requests

# Datetime utilities
import time
from datetime import date, timedelta
import dateutil
from dateutil.relativedelta import relativedelta

# Dataset exploration
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Output formatting
from pprint import pprint

# Notebook settings
import warnings
warnings.filterwarnings('ignore')

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [9]:

GUARDIAN_API_KEY=os.getenv('GUARDIAN_API_KEY')
API_ENDPOINT = 'https://content.guardianapis.com/search'
ARTICLES_DIR = os.path.join('../data/raw/', 'guardian')
os.makedirs(ARTICLES_DIR, exist_ok=True)

In [15]:
params = {
    'from-date': '',
    'to-date': '',
    'show-fields': 'all',
    'show-tags': 'all',
    'page-size': '50',
    'api-key': GUARDIAN_API_KEY
}

In [16]:
start_date = date(2021, 9, 27)
end_date = date(2022, 9, 27)

In [17]:
date_ranges = []

def create_date_ranges(start_date, end_date):
    num_of_months = (end_date.year - start_date.year) * 12 +  (end_date.month - start_date.month)
    print(f"Number of months: {num_of_months}")
    for month in range(1, num_of_months):
        new_end_date = start_date + relativedelta(months=1) - timedelta(days=1)
        date_ranges.append((start_date.strftime('%Y-%m-%d'), new_end_date.strftime('%Y-%m-%d')))
        start_date = start_date + relativedelta(months=1)
    last_month_start = new_end_date + relativedelta(days=1)
    last_month_end = end_date
    date_ranges.append((last_month_start.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')))
    print(f"Last months in date range: {last_month_start}, {last_month_end}")

In [18]:
create_date_ranges(start_date=start_date, end_date=end_date)

Number of months: 12
Last months in date range: 2022-08-27, 2022-09-27


In [19]:
date_ranges

[('2021-09-27', '2021-10-26'),
 ('2021-10-27', '2021-11-26'),
 ('2021-11-27', '2021-12-26'),
 ('2021-12-27', '2022-01-26'),
 ('2022-01-27', '2022-02-26'),
 ('2022-02-27', '2022-03-26'),
 ('2022-03-27', '2022-04-26'),
 ('2022-04-27', '2022-05-26'),
 ('2022-05-27', '2022-06-26'),
 ('2022-06-27', '2022-07-26'),
 ('2022-07-27', '2022-08-26'),
 ('2022-08-27', '2022-09-27')]

In [20]:
test = requests.get(f"https://content.guardianapis.com/search?&from-date=2021-09-27&to-date=2022-09-27&show-fields=all&show-tags=all&page-size=50&api-key={GUARDIAN_API_KEY}").json()['response']

In [22]:
print('Total items in date range: ',test['total'])
print('Current page: ',test['currentPage'])
print('Page size: ',test['pageSize'])
print('Number of pages: ',test['pages'])

Total items in date range:  78849
Current page:  1
Page size:  50
Number of pages:  1577


In [26]:
test['results'][0]

{'id': 'australia-news/2022/sep/28/high-risk-of-dam-burst-in-south-australian-town-of-echunga',
 'type': 'article',
 'sectionId': 'australia-news',
 'sectionName': 'Australia news',
 'webPublicationDate': '2022-09-27T23:59:52Z',
 'webTitle': '‘High risk’ of dam burst in South Australian town of Echunga',
 'webUrl': 'https://www.theguardian.com/australia-news/2022/sep/28/high-risk-of-dam-burst-in-south-australian-town-of-echunga',
 'apiUrl': 'https://content.guardianapis.com/australia-news/2022/sep/28/high-risk-of-dam-burst-in-south-australian-town-of-echunga',
 'fields': {'headline': '‘High risk’ of dam burst in South Australian town of Echunga',
  'byline': 'Natasha May',
  'wordcount': '687',
  'firstPublicationDate': '2022-09-27T23:59:52Z',
  'isInappropriateForSponsorship': 'false',
  'isPremoderated': 'false',
  'lastModified': '2022-09-28T03:47:52Z',
  'productionOffice': 'AUS',
  'publication': 'theguardian.com',
  'shortUrl': 'https://www.theguardian.com/p/mbep3',
  'shouldHide

In [192]:

for dates in date_ranges:
    # set date parameters for request    
    params['from-date'] = dates[0]
    params['to-date'] = dates[1]
    # create filename with current date range
    file_name = os.path.join(ARTICLES_DIR, f"ga_{params['from-date']}_{params['to-date']}.json")
    # ensure that filename doesn't exist already
    if not os.path.exists(file_name):
        print(f"Downloading {dates}")
        all_results = []
        current_page = 1
        total_pages = 1

        while current_page <= total_pages:
            print(f"...page {current_page}")
            # update current page parameter
            params['page'] = current_page
            # get number of total pages from response and update parameters
            res = requests.get(API_ENDPOINT, params)
            data = res.json()['response']
            total_pages = data['pages']
            print('Total items in date range: ',data['total'])
            #print('Current page: ',data['currentPage'])
            #print('Page size: ',data['pageSize'])
            #print('Number of pages: ',data['pages'])
            # get results and add them to results list
            all_results.extend(data['results'])
            # update page counter!
            current_page += 1

        # export results to file
        with open(file_name, 'w') as f:
            print(f"Writing to ... {file_name}")
            f.write(json.dumps(all_results, indent=2))


Downloading ('2021-09-27', '2021-10-26')
...page 1
Total items in date range:  6456
...page 2
Total items in date range:  6456
...page 3
Total items in date range:  6456
...page 4
Total items in date range:  6456
...page 5
Total items in date range:  6456
...page 6
Total items in date range:  6456
...page 7
Total items in date range:  6456
...page 8
Total items in date range:  6456
...page 9
Total items in date range:  6456
...page 10
Total items in date range:  6456
...page 11
Total items in date range:  6456
...page 12
Total items in date range:  6456
...page 13
Total items in date range:  6456
...page 14
Total items in date range:  6456
...page 15
Total items in date range:  6456
...page 16
Total items in date range:  6456
...page 17
Total items in date range:  6456
...page 18
Total items in date range:  6456
...page 19
Total items in date range:  6456
...page 20
Total items in date range:  6456
...page 21
Total items in date range:  6456
...page 22
Total items in date range:  6456
