In [3]:
import os
import pandas as pd
import requests
import json
import time
import dateutil
import datetime
from dateutil.relativedelta import relativedelta
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Environment setup: install Google Chrome plus a ChromeDriver binary, then run a
# headless Selenium smoke test. Writes under /usr/bin, so it needs root privileges.
# Download the current stable Chrome .deb (quietly) to ./chrome.deb.
!wget -q -O chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# Install it with dpkg; if dpkg reports failure, fall back to `apt-get -fy install`
# (its stdout is discarded).
!dpkg -i chrome.deb || apt-get -fy install > /dev/null

# Modules used by the ChromeDriver download below.
import os, re, requests, zipfile, io

# Capture the output of `google-chrome --version` and pull out the first run of
# digits that is followed by a dot.
# NOTE(review): `major_version` is not referenced again in this cell, so the driver
# chosen below is whatever the JSON lists for the "Stable" channel rather than one
# matched against the installed browser - confirm this is intended.
version_output = !google-chrome --version
major_version = re.search(r'(\d+)\.', version_output[0]).group(1)

# Look up the ChromeDriver download list for the "Stable" channel and take the
# URL of the linux64 build.
resp = requests.get('https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json')
driver_info = resp.json()['channels']['Stable']['downloads']['chromedriver']
driver_url = next(d['url'] for d in driver_info if d['platform'] == 'linux64')

# Download the zip into memory, unpack it under /usr/bin/chromedriver-linux64 and
# mark the binary executable.
# NOTE(review): the chmod path assumes the archive places `chromedriver` directly
# at the root of the extraction directory - confirm against the archive layout.
driver_zip = requests.get(driver_url)
z = zipfile.ZipFile(io.BytesIO(driver_zip.content))
extract_path = "/usr/bin/chromedriver-linux64"
z.extractall(extract_path)
os.chmod(f"{extract_path}/chromedriver", 0o755)

# Replace whatever currently sits at /usr/bin/chromedriver with a symlink to the
# freshly extracted binary. islink() is checked as well because exists() is False
# for a dangling symlink.
if os.path.islink("/usr/bin/chromedriver") or os.path.exists("/usr/bin/chromedriver"):
    os.remove("/usr/bin/chromedriver")
os.symlink(f"{extract_path}/chromedriver", "/usr/bin/chromedriver")

# Install the Python packages (quiet mode).
!pip install -q selenium beautifulsoup4

# Smoke test: start headless Chrome through the symlinked driver, load a page,
# print its title and shut the browser down.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=chrome_options)
driver.get("https://example.com")
print("✅ Success! Page title:", driver.title)
driver.quit()

(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 126897 files and directories currently installed.)
Preparing to unpack chrome.deb ...
Unpacking google-chrome-stable (138.0.7204.183-1) over (138.0.7204.183-1) ...
Setting up google-chrome-stable (138.0.7204.183-1) ...
Processing triggers for mailcap (3.70+nmu1ubuntu1) ...
Processing triggers for man-db (2.10.2-1) ...
✅ Success! Page title: Example Domain


In [4]:
# Read the New York Times API key from Colab's `userdata` store (entry name
# 'NY_TIMES_KEY') so that the key itself never appears in the notebook.
from google.colab import userdata
API_KEY = userdata.get('NY_TIMES_KEY')

In [None]:
def send_request(date, delay=6):
    """Fetch one month from the NYT Archive API and return the decoded JSON.

    Parameters
    ----------
    date : sequence of str
        ``[year, month]`` as strings, e.g. ``['2000', '1']``.
    delay : float, optional
        Seconds to sleep after every attempt, successful or not, so that
        repeated calls stay spaced out. Defaults to 6.

    Returns
    -------
    dict or None
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an HTTP error status, or the body is not valid JSON.

    Notes
    -----
    The TLS certificate is verified (the ``requests`` default) because the API
    key travels with the request. Failure messages print only the exception
    type and HTTP status: ``requests`` error messages embed the full URL,
    which would leak the key into the notebook output.
    """
    url = 'https://api.nytimes.com/svc/archive/v1/{}/{}.json'.format(date[0], date[1])
    try:
        reply = requests.get(url, params={'api-key': API_KEY}, timeout=60)
        # Turn 4xx/5xx answers into exceptions so an error payload is never
        # handed to the parser as if it were article data.
        reply.raise_for_status()
        return reply.json()
    except (requests.exceptions.RequestException, ValueError) as err:
        status = getattr(getattr(err, 'response', None), 'status_code', None)
        detail = type(err).__name__
        if status is not None:
            detail += ' (HTTP ' + str(status) + ')'
        print('Request for ' + str(date[0]) + '-' + str(date[1]) + ' failed: ' + detail)
        return None
    finally:
        # Pause on failures too, otherwise a run of errors hits the API
        # back-to-back.
        time.sleep(delay)


def is_valid(article, date):
    """Return True when an article falls in the collection window and has a headline.

    Parameters
    ----------
    article : dict
        One entry of the API's ``docs`` list.
    date : datetime.date
        Publication date of the article.

    Returns
    -------
    bool
        True when ``start <= date <= end`` (module-level globals) and
        ``article['headline']`` is a dict containing a ``'main'`` entry.

    Notes
    -----
    Both bounds are inclusive, matching the inclusive ``pd.date_range(start, end)``
    used to build the list of months; strict comparisons would drop every
    article published exactly on ``start`` or ``end``.
    """
    # .get() so an article without a 'headline' key is rejected, not a KeyError.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return start <= date <= end and has_headline


def parse_response(response):
    """Convert one Archive API response into a DataFrame of article metadata.

    Parameters
    ----------
    response : dict
        Decoded JSON with the article list under ``response['response']['docs']``.

    Returns
    -------
    pandas.DataFrame
        One row per article accepted by ``is_valid``, with the columns
        ``headline, date, web_url, doc_type, lead_paragraph, material_type,
        author, section, subsection, keywords``. Missing optional fields are
        stored as ``None``; ``keywords`` holds the list of values of the
        keyword entries whose ``name`` is ``'subject'``.

    Raises
    ------
    KeyError
        If the payload has no ``response``/``docs`` entry or an article has no
        ``pub_date``.
    """
    # `import dateutil` alone does not guarantee the `parser` submodule is loaded.
    import dateutil.parser

    columns = ['headline', 'date', 'web_url', 'doc_type', 'lead_paragraph',
               'material_type', 'author', 'section', 'subsection', 'keywords']

    records = []
    for article in response['response']['docs']:
        date = dateutil.parser.parse(article['pub_date']).date()
        if not is_valid(article, date):
            continue

        # 'byline' may be absent, None, or a dict without 'original'.
        byline = article.get('byline')
        author = byline.get('original') if isinstance(byline, dict) else None

        # 'keywords' may be absent or None; keep only the subject keywords.
        subjects = [
            keyword.get('value')
            for keyword in (article.get('keywords') or [])
            if isinstance(keyword, dict) and keyword.get('name') == 'subject'
        ]

        records.append({
            'headline': article['headline']['main'],  # guaranteed by is_valid
            'date': date,
            'web_url': article.get('web_url'),
            'doc_type': article.get('document_type'),
            'lead_paragraph': article.get('lead_paragraph'),
            'material_type': article.get('type_of_material'),
            'author': author,
            'section': article.get('section_name'),
            'subsection': article.get('subsection_name'),
            'keywords': subjects,
        })

    # Passing `columns` keeps the column order and yields the same (empty)
    # schema when no article qualifies.
    return pd.DataFrame(records, columns=columns)


def get_data(dates, output_dir='headlines'):
    """Download and save one CSV of article metadata per requested month.

    Parameters
    ----------
    dates : sequence of [year, month] string pairs
        Months to collect, e.g. ``[['2000', '1'], ['2000', '2']]``.
    output_dir : str, optional
        Directory receiving the ``<year>-<month>.csv`` files; created when
        missing. Defaults to ``'headlines'``.

    Returns
    -------
    None
        Progress and the number of newly collected articles are printed.

    Notes
    -----
    A month whose CSV already exists is skipped, so an interrupted run can be
    resumed. A month whose request fails is reported and left without a file,
    so a later run retries it.
    """
    if len(dates) == 0:
        # Nothing to index with dates[0] / dates[-1] below.
        print('No dates given; nothing to collect.')
        return

    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    os.makedirs(output_dir, exist_ok=True)

    for date in dates:
        print('Working on ' + str(date) + '...')
        csv_path = os.path.join(output_dir, date[0] + '-' + date[1] + '.csv')
        if os.path.exists(csv_path):
            # This month is already on disk.
            continue

        response = send_request(date)
        if response is None:
            print('No data received for ' + str(date) + '; skipping.')
            continue

        df = parse_response(response)
        print('Saving ' + csv_path + '...')
        df.to_csv(csv_path, index=False)
        total += len(df)

    print('Number of articles collected: ' + str(total))

In [None]:
# Bounds of the collection window: read as globals by is_valid() and used to
# build the list of months to download.
start = datetime.date(year=2000, month=1, day=1)
end = datetime.date(year=2020, month=12, day=31)

In [None]:
# One [year, month] pair of strings (month without a leading zero) for every
# month start between `start` and `end`. Built from the Timestamp attributes
# because the "%-m" strftime directive is a platform-specific extension.
months = [[str(ts.year), str(ts.month)] for ts in pd.date_range(start, end, freq='MS')]

In [None]:
# Download every month in `months`, writing one CSV per month; months whose CSV
# already exists are skipped, so this cell can be re-run to resume.
get_data(months)

In [None]:
import os
import glob
import pandas as pd
# Switch the working directory to the folder holding the per-month CSVs so the
# relative glob pattern and output path in the following cells resolve there.
# The absolute path is the Google Colab location.
# NOTE(review): get_data() writes to the relative folder 'headlines', so this
# presumes the notebook was running from /content - confirm.
os.chdir("/content/headlines")

In [None]:
extension = 'csv'
# Collect the per-month CSVs in the working directory. The merged output file
# 'combined_csv.csv' is written to this same directory, so it is excluded here;
# otherwise a re-run would fold the previous merge back in and duplicate every
# row. Sorting makes the concatenation order reproducible.
all_filenames = sorted(
    name
    for name in glob.glob('*.{}'.format(extension))
    if name != 'combined_csv.csv'
)

In [None]:
# Load each listed CSV into its own frame, then stack them into one table.
monthly_frames = []
for csv_name in all_filenames:
    monthly_frames.append(pd.read_csv(csv_name))
combined_csv = pd.concat(monthly_frames)
# Write the merged table without the index, using a BOM-prefixed UTF-8 encoding.
combined_csv.to_csv("combined_csv.csv", encoding='utf-8-sig', index=False)