# Setup

In [24]:
# Navigate file system
import os

# Access API secrets in .env file
%load_ext dotenv
%dotenv

# Accessing API
import requests

# Datetime utilities
import time
import datetime
import dateutil
from dateutil.relativedelta import relativedelta

# Dataset exploration
import pandas as pd

# Output formatting
from pprint import pprint

# Notebook settings
import warnings
warnings.filterwarnings('ignore')

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


# Archive API

In [2]:
# Constants shared by every API call below.
# The key is looked up in the process environment; it is None when the
# NYT_API_KEY variable is not set.
API_KEY = os.environ.get('NYT_API_KEY')
BASE_URL = 'https://api.nytimes.com/svc/'

In [3]:
# Request a single month (May 2022) from the Archive endpoint so the shape of
# the payload can be explored before downloading the full range.
year = 2022
month = 5

response = requests.get(
    BASE_URL + f"archive/v1/{year}/{month}.json",
    params={'api-key': API_KEY},  # let requests build and encode the query string
    timeout=60,                   # fail instead of hanging the kernel indefinitely
)
# Surface HTTP errors here rather than when the payload is indexed in later cells.
response.raise_for_status()
data_archive = response.json()

In [23]:
# Number of documents returned for the requested month
len(data_archive['response']['docs'])

4179

In [4]:
# List the field names present on the first document in `docs`
data_archive['response']['docs'][0].keys()

dict_keys(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri'])

In [5]:
# Pretty-print the first document to see how the nested fields
# (byline, headline, keywords, ...) are structured
pprint(data_archive['response']['docs'][0])

{'_id': 'nyt://article/465764cc-8719-5012-bbd2-1d122689e24a',
 'abstract': 'A police chief said that victims’ bodies were discovered in a '
             'forest, and showed signs of mutilation.',
 'byline': {'organization': None,
            'original': 'By Esha Ray',
            'person': [{'firstname': 'Esha',
                        'lastname': 'Ray',
                        'middlename': None,
                        'organization': '',
                        'qualifier': None,
                        'rank': 1,
                        'role': 'reported',
                        'title': None}]},
 'document_type': 'article',
 'headline': {'content_kicker': None,
              'kicker': None,
              'main': 'Ukrainian officials said they found the bodies of three '
                      'men in Bucha, with signs they had been ‘tortured.’',
              'name': None,
              'print_headline': '',
              'seo': None,
              'sub': None},
 'keywords': [{'ma

In [29]:
# Create date range: the year leading up to today
end = datetime.date.today()
start = end - relativedelta(years=1)

# One [year, month] pair of strings per month start inside the window, with the
# month not zero-padded (e.g. ['2022', '5']). freq='MS' only yields first-of-month
# dates, so a month whose first day falls before `start` is not included.
# The pairs are built from the .year/.month attributes instead of
# strftime("%-m"): the "%-m" directive is a platform extension that is not
# available everywhere (it raises on Windows).
months_in_range = [
    [str(month_start.year), str(month_start.month)]
    for month_start in pd.date_range(start, end, freq='MS')
]

In [30]:
# Show the [year, month] string pairs built above
months_in_range

[['2021', '10'],
 ['2021', '11'],
 ['2021', '12'],
 ['2022', '1'],
 ['2022', '2'],
 ['2022', '3'],
 ['2022', '4'],
 ['2022', '5'],
 ['2022', '6'],
 ['2022', '7'],
 ['2022', '8'],
 ['2022', '9']]

In [41]:

def send_request(date):
    """Fetch one month from the Archive endpoint and return the decoded JSON.

    Parameters
    ----------
    date : sequence
        ``(year, month)`` pair; both items are interpolated into the endpoint
        path as given (e.g. ``['2022', '5']``).

    Returns
    -------
    dict
        The JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status, instead of handing an error
        payload to the caller as if it were data.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    response = requests.get(
        BASE_URL + f"archive/v1/{date[0]}/{date[1]}.json",
        params={'api-key': API_KEY},  # let requests build and encode the query string
        timeout=60,                   # never block the notebook indefinitely
    )
    # Pause after every call before the next one can be issued
    # (presumably to stay under the API rate limit -- confirm against the quota).
    time.sleep(6)
    response.raise_for_status()
    return response.json()

def is_valid(article, date, start_date=None, end_date=None):
    """Return True when `article` should be kept in the dataset.

    An article is kept when `date` lies strictly between the two bounds and
    its 'headline' entry is a dict that contains a 'main' key.

    Parameters
    ----------
    article : dict
        One document from the API response.
    date : datetime.date
        Publication date of the article.
    start_date, end_date : datetime.date, optional
        Exclusive lower/upper bounds. When omitted, the notebook-level
        `start` and `end` variables are used, as before.

    Returns
    -------
    bool
    """
    # Fall back to the notebook globals only when no explicit bound is given,
    # so the function can also be used without that hidden state.
    lower = start if start_date is None else start_date
    upper = end if end_date is None else end_date

    # .get() so a document without a 'headline' field is rejected instead of
    # raising KeyError.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return lower < date < upper and has_headline

def parse_response(response):
    """Turn one Archive API payload into a DataFrame of valid articles.

    Parameters
    ----------
    response : dict
        Decoded JSON payload; documents are read from
        ``response['response']['docs']``.

    Returns
    -------
    pandas.DataFrame
        One row per document accepted by `is_valid`, with the columns
        headline, date, doc_type, material_type, section, keywords and
        word_count. The columns are present even when no document qualifies.
        `keywords` holds the list of 'value' entries of the keywords whose
        'name' is 'subject'.

    Raises
    ------
    KeyError
        If the payload has no ``response``/``docs`` entry or a document has
        no ``pub_date``.
    """
    # Import the submodule explicitly: `import dateutil` alone does not
    # guarantee that `dateutil.parser` is loaded.
    from dateutil import parser as date_parser

    columns = ['headline', 'date', 'doc_type', 'material_type',
               'section', 'keywords', 'word_count']

    records = []
    for article in response['response']['docs']:
        date = date_parser.parse(article['pub_date']).date()
        if not is_valid(article, date):
            continue

        # Optional fields are read with .get() so a document that lacks one of
        # them yields a missing value instead of aborting the whole month
        # (the same treatment 'type_of_material' already received).
        subjects = [
            keyword['value']
            for keyword in (article.get('keywords') or [])
            if keyword.get('name') == 'subject'
        ]
        records.append({
            'headline': article['headline']['main'],
            'date': date,
            'doc_type': article.get('document_type'),
            'material_type': article.get('type_of_material'),
            'section': article.get('section_name'),
            'keywords': subjects,
            'word_count': article.get('word_count'),
        })

    # Passing `columns` keeps the column set and order stable for empty results.
    return pd.DataFrame(records, columns=columns)

def get_data(dates):
    """Download, parse and save one CSV of headlines per requested month.

    For every entry of `dates` the month is fetched with `send_request`,
    converted with `parse_response` and written to
    ``../data/raw/headlines/<year>-<month>.csv`` (without the index column).
    Progress and the total number of collected rows are printed.

    Parameters
    ----------
    dates : sequence
        ``(year, month)`` pairs, e.g. ``[['2021', '10'], ['2021', '11']]``.

    Returns
    -------
    None
    """
    output_dir = '../data/raw/headlines'
    total = 0

    # Nothing to download: report it instead of failing on dates[0].
    if len(dates) == 0:
        print('Number of articles collected: ', total)
        return

    print(f"Date range: {str(dates[0])} to {str(dates[-1])}")

    # makedirs also creates missing parent folders (os.mkdir would raise if
    # '../data/raw' did not exist yet) and is a no-op when the folder exists.
    os.makedirs(output_dir, exist_ok=True)

    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        path = f"{output_dir}/{date[0]}-{date[1]}.csv"
        df.to_csv(path, index=False)
        # Report the path that was actually written.
        print(f"Saving {path}")
    print('Number of articles collected: ', total)

In [42]:
# Download and save every month in the range; send_request sleeps 6 seconds
# after each call, so this cell takes a while to finish
get_data(months_in_range)

Date range: ['2021', '10'] to ['2022', '9']
Saving headlines/headlines/2021-10.csv
Saving headlines/headlines/2021-11.csv
Saving headlines/headlines/2021-12.csv
Saving headlines/headlines/2022-1.csv
Saving headlines/headlines/2022-2.csv
Saving headlines/headlines/2022-3.csv
Saving headlines/headlines/2022-4.csv
Saving headlines/headlines/2022-5.csv
Saving headlines/headlines/2022-6.csv
Saving headlines/headlines/2022-7.csv
Saving headlines/headlines/2022-8.csv
Saving headlines/headlines/2022-9.csv
Number of articles collected:  48126


In [43]:
import glob

# Collect the paths of all CSV files in the raw headlines folder
csv_pattern = '../data/raw/headlines/*.csv'
files = glob.glob(csv_pattern)

In [44]:
# Show the matched paths (note they are not returned in chronological order)
files

['../data/raw/headlines/2022-7.csv',
 '../data/raw/headlines/2022-6.csv',
 '../data/raw/headlines/2022-4.csv',
 '../data/raw/headlines/2022-5.csv',
 '../data/raw/headlines/2022-1.csv',
 '../data/raw/headlines/2022-2.csv',
 '../data/raw/headlines/2022-3.csv',
 '../data/raw/headlines/2021-12.csv',
 '../data/raw/headlines/2021-11.csv',
 '../data/raw/headlines/2021-10.csv',
 '../data/raw/headlines/2022-8.csv',
 '../data/raw/headlines/2022-9.csv']

In [45]:
# Read every monthly CSV and stack them into a single frame.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each
# iteration, so the frames are collected first and concatenated once.
monthly_frames = [pd.read_csv(f) for f in files]

# Each file's own row index is kept (no ignore_index), exactly as the
# append-based loop did; an empty file list still yields an empty frame.
nyt_df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

In [46]:
# Display the combined frame (pandas abbreviates the middle rows when rendering)
nyt_df

Unnamed: 0,headline,date,doc_type,material_type,section,keywords,word_count
0,How to Dispute Surprise Medical Bills,2022-07-01,article,News,Well,"['Content Type: Service', 'Emergency Medical T...",1620
1,One Dead and 22 Have Been Hospitalized in List...,2022-07-01,article,News,Science,"['Food Contamination and Poisoning', 'Listerio...",366
2,California Wildfire Burns More Than 900 Acres ...,2022-07-01,article,News,U.S.,"['Wildfires', 'Fires and Firefighters', 'Evacu...",525
3,Columbia Won’t Participate in the Next U.S. Ne...,2022-07-01,article,News,U.S.,"['Colleges and Universities', 'Falsification o...",780
4,"Nine People Are Injured in Newark Shooting, Of...",2022-07-01,article,News,New York,['Mass Shootings'],214
...,...,...,...,...,...,...,...
3455,Trump White House Called Capitol Rioter on Jan...,2022-09-26,article,News,U.S.,"['United States Politics and Government', 'Sto...",723
3456,Trump’s Heartless QAnon Embrace,2022-09-26,article,Op-Ed,Opinion,"['Right-Wing Extremism and Alt-Right', 'QAnon'...",891
3457,"NASA Smashes Into an Asteroid, Completing a Mi...",2022-09-26,article,News,Science,"['Asteroids', 'Rocket Science and Propulsion',...",1298
3458,Hurricane Ian’s Uncertain Path Keeps Much of F...,2022-09-26,article,News,U.S.,"['Hurricane Ian (2022)', 'Hurricanes and Tropi...",1051


In [48]:
# Number of rows per material type, most frequent first
nyt_df['material_type'].value_counts()

News                   34764
Op-Ed                   3141
Review                  2390
Interactive Feature     1731
briefing                1524
Obituary (Obit)         1126
Video                    710
Letter                   609
Slideshow                430
Quote                    287
Correction               212
Editorial                 92
List                      91
News Analysis             63
An Appraisal               1
Editors' Note              1
Biography                  1
Name: material_type, dtype: int64

In [50]:
# Number of rows per document type, most frequent first
nyt_df['doc_type'].value_counts()

article       44302
multimedia     3815
audio             9
Name: doc_type, dtype: int64

In [51]:
# Summary statistics of the numeric columns, restricted to rows whose
# doc_type is 'article'
nyt_df[nyt_df['doc_type'] == 'article'].describe()

Unnamed: 0,word_count
count,44302.0
mean,1023.895806
std,816.650391
min,0.0
25%,558.0
50%,973.0
75%,1316.0
max,20573.0
