In [1]:
import pandas as pd
from datetime import datetime, timezone
import os
import requests
import time
import random
from urllib.parse import quote

In [2]:
def convert_datetime(pub_date_str):
    # Step 1: Parse the datetime string
    pub_date = datetime.strptime(pub_date_str, "%Y-%m-%dT%H:%M:%SZ")

    # Step 2: Convert to UTC (if not already in UTC)
    pub_date_utc = pub_date.replace(tzinfo=timezone.utc)

    # Step 3: Format to only include the date
    pub_date_utc_only_date = datetime.strftime(pub_date_utc, "%Y-%m-%d")
    return pub_date_utc_only_date

In [3]:
def fetch_guardian_news_for_commodities(subjects, start_date, end_date, api_key):
    news_data = []
    titles_set = set()  # Set to keep track of titles
    for subject in subjects:
        base_url = "https://content.guardianapis.com/search?"
        page = 1
        page_size = 1
        start_date = pd.to_datetime(start_date).strftime("%Y-%m-%d")
        end_date = pd.to_datetime(end_date).strftime("%Y-%m-%d")
        while page <= page_size:
            time.sleep(random.uniform(1, 3))
            response = requests.get(base_url + f'page={page}&q={quote(subject)}&from-date={start_date}&api-key={api_key}&order-by=oldest')
            if response.status_code != 200:
                print(f"Error: {response.status_code}")
                break  # Exit if there's an error
        
            data = response.json()
            
            # Check if there are no results
            if not data['response']['results']:
                print("No more results found, exiting loop.")
                break  # Break out of the loop if no results are found
            
            page_size = data['response']['pages']
            print(f"Page size: {page_size}")
            
            for article in data['response']['results']:
                title = article['webTitle']
                pub_date = article['webPublicationDate']
                pub_date = convert_datetime(pub_date)
                if start_date <= pub_date <= end_date:
                    print(pub_date)
                    if title not in titles_set:
                        news_data.append({
                            'title': title,
                            'published_date': pub_date
                        })
                        titles_set.add(title)  # Add title to the set
                    else:
                        continue
                else:
                    page_size = page
                    print('Breaking out of loop for this subject...')
                    break  # Exit the loop if the article is outside the date range

            page += 1
    
    # Sort and save only if there's news data
    if news_data:
        sorted_data = sorted(news_data, key=lambda x: x['published_date'])
        df = pd.DataFrame(sorted_data)
        
        # Save DataFrame to a CSV file
        csv_filename = f'../data/news/{subjects[0]}.csv'
        df.to_csv(csv_filename, index=False)
        print(f"Data saved to {csv_filename}")
    else:
        print("No news data to save.")

    return news_data


In [4]:
# insert api key
api_key = ''
start_date = '2015-05-08 00:00:00+00:00'
end_date = '2021-06-29 00:00:00+00:00'

# change accordingly based on which commodities to retrieve news for
commods = ["Wheat"]
fetch_guardian_news_for_commodities(commods, start_date, end_date, api_key)

Page size: 328
2015-05-08
2015-05-10
2015-05-11
2015-05-12
2015-05-12
2015-05-13
2015-05-14
2015-05-17
2015-05-18
2015-05-19
Page size: 328
2015-05-19
2015-05-20
2015-05-20
2015-05-22
2015-05-22
2015-05-22
2015-05-24
2015-05-25
2015-05-26
2015-05-26
Page size: 328
2015-05-26
2015-05-29
2015-05-29
2015-05-30
2015-06-02
2015-06-04
2015-06-04
2015-06-05
2015-06-06
2015-06-06
Page size: 328
2015-06-07
2015-06-07
2015-06-09
2015-06-09
2015-06-09
2015-06-10
2015-06-11
2015-06-11
2015-06-12
2015-06-12
Page size: 328
2015-06-16
2015-06-17
2015-06-18
2015-06-19
2015-06-23
2015-06-24
2015-06-24
2015-06-25
2015-06-25
2015-06-25
Page size: 328
2015-06-26
2015-06-26
2015-06-27
2015-06-27
2015-07-01
2015-07-03
2015-07-03
2015-07-05
2015-07-05
2015-07-06
Error: 429
Data saved to ../data/news/Wheat.csv


[{'title': 'Lebanese leave-taking: Greg Malouf’s last meal',
  'published_date': '2015-05-08'},
 {'title': 'Media Monkey’s Election Diary: Kay Burley, Jeremy Paxman and Andrew Neil',
  'published_date': '2015-05-10'},
 {'title': 'Home business innovation 2014: Designed2Eat',
  'published_date': '2015-05-11'},
 {'title': "Indian land bill: 'We’re losing not just land, but a whole generation of farmers'",
  'published_date': '2015-05-12'},
 {'title': 'The Fiver | Totally irrelevant perambulations involving, perhaps, Nana Mouskouri',
  'published_date': '2015-05-12'},
 {'title': 'Lukewarmers – the third stage of climate denial, gambling on snake eyes | Dana Nuccitelli',
  'published_date': '2015-05-13'},
 {'title': 'Supermarket lagers: the best and worst – taste test',
  'published_date': '2015-05-14'},
 {'title': 'Peter Kennard: Unofficial War Artist review – the king of political montage',
  'published_date': '2015-05-17'},
 {'title': "You had me at 'vagina rejuvenation': my weird day o