In [None]:
#############
# Libraries #
#############

import numpy as np
import pandas as pd
import csv

#Time vars manipulations
import time
import datetime
from dateutil.rrule import rrule, MONTHLY

#for JSON requests and manipulations
import requests
import pyjq
from itertools import chain
import json

#web scraper
from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

In [None]:
#NYTimes developer key
# NOTE: YOUR_KEY is a placeholder, not a name defined in this notebook. Replace
# it with your own API key string (preferably loaded from an environment
# variable rather than typed into the notebook) before running; unless
# YOUR_KEY is defined elsewhere, this line raises NameError.
# `key` is appended to the `api-key` query parameter of the request URLs below.
key = YOUR_KEY

### Single query

In [None]:
#sends query for a single month (October 2006) of the Archive API
url = 'https://api.nytimes.com/svc/archive/v1/2006/10.json?api-key='+key
# A timeout keeps the cell from hanging forever if the server never answers.
req = requests.get(url, timeout=60)
# Stop on HTTP errors (bad key, rate limit, ...) instead of parsing the error
# payload as if it were article data. The message deliberately leaves out the
# URL because it contains the API key.
if not req.ok:
    raise RuntimeError(f'Archive request failed with HTTP status {req.status_code}')
json_data = req.json()

In [None]:
# Write the raw API response to json_data.json so it can be inspected later
with open("json_data.json", mode="w") as out_file:
    out_file.write(json.dumps(json_data))

In [None]:
# How many articles the queried month contains: the jq program yields a single
# number (the length of .response.docs), so the first result is the count.
num_docs = pyjq.first('.response .docs | length', json_data)

In [None]:
# jq program that projects each article in .response.docs onto the variables
# of interest. Written as adjacent string literals (one output key per line);
# Python joins them into a single query string.
# NOTE(review): `mult: .multimedia[] | .url` appears to emit one result per
# multimedia entry (and none for articles without multimedia) -- confirm that
# this fan-out is intended.
# NOTE(review): the `.subsectoinName` spelling is kept exactly as it was;
# verify it against the field name the API actually returns.
jq_query = (
    '.response .docs [] | {'
    'n_url: .web_url, '
    'snippet: .snippet, '
    'paragraph: .lead_paragraph, '
    'mult: .multimedia[] | .url, '
    'headline: .headline .main, '
    'keyword: .keywords, '
    'date: .pub_date, '
    'doc_type: .document_type, '
    'news_desk: .news_desk, '
    'section: .section_name, '
    'subsectoin: .subsectoinName, '
    'author: .byline .original, '
    'id: ._id, '
    'word_count: .word_count'
    '}'
)


In [None]:
# Compile the jq program, then run it over the response; the result is a list
# with one dict per emitted record.
output = pyjq.compile(jq_query).all(json_data)

In [None]:
# Tabulate the extracted records: one row per record, one column per key
df = pd.DataFrame(data=output)

### Loop for automated process 

In [None]:
# Build the list of (year, month) pairs to download, one per calendar month
# from start_dt through end_dt inclusive.
# datetime.date arguments are (year, month, day)

start_dt = datetime.date(year=2006, month=1, day=1)
end_dt = datetime.date(year=2020, month=7, day=1)

# rrule yields one datetime per month between the two bounds
month_starts = rrule(MONTHLY, dtstart=start_dt, until=end_dt)
dates = [(stamp.year, stamp.month) for stamp in month_starts]

In [None]:
# Downloads every (year, month) in `dates` from the NYT Archive API and builds
# `df` with one row per article. Uses `dates` and `key` from earlier cells.

# Column layout of the final dataset; the names match the keys produced by the
# jq projection below.
columns = ['n_url', 'snippet', 'lead_paragraph', 'image', 'headline', 'date',
       'doc_type', 'news_desk', 'section', 'author', 'id', 'word_count']

# jq projection applied to every monthly response. It has no placeholders, so
# it is a plain string built once instead of an f-string rebuilt per month.
jq_q = '.response .docs [] | {n_url: .web_url, snippet: .snippet, lead_paragraph: .lead_paragraph, image: .multimedia[1].url, headline: .headline .main, date: .pub_date, doc_type: .document_type, news_desk: .news_desk, section: .section_name, author: .byline, id: ._id, word_count: .word_count}'

# Article records are collected in a plain list and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0 and re-copied the
# whole frame on every month. If the loop stops early, `records` still holds
# everything downloaded so far.
records = []

#loop for extracting data for month/year pairs
for year, month in tqdm(dates):
    time.sleep(20) # pause between requests to stay within the API rate limit
    url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={key}'
    r = requests.get(url, timeout=60)
    if not r.ok:
        # Stop with a clear message instead of failing later inside jq on an
        # error payload. The URL is left out because it contains the API key.
        raise RuntimeError(f'Archive request for {year}-{month:02d} failed with HTTP status {r.status_code}')
    js_data = r.json()

    #extract required variables (one dict per article)
    out = pyjq.all(jq_q, js_data)
    num_docs = len(out)
    print(f'For month {month} in {year} there were {num_docs} articles')

    records.extend(out)

# Single construction; `columns` fixes the column order and also yields an
# empty, correctly shaped frame when nothing was downloaded.
df = pd.DataFrame(records, columns=columns)

In [None]:
#makes image urls clickable
#and keeps a missing value (NaN/None) for articles without pictures
image_prefix = 'https://static01.nyt.com/'
image_suffix = '?quality=90&auto=webp'

raw_image = df['image']
# Only rows that hold an image path and were not already expanded get the
# prefix/suffix. Missing entries would otherwise be stringified by astype(str)
# into bogus URLs such as 'https://static01.nyt.com/None?...', and re-running
# the cell would prefix the same URL twice.
needs_prefix = raw_image.notna() & ~raw_image.astype(str).str.startswith(image_prefix)
df['image'] = (image_prefix + raw_image.astype(str) + image_suffix).where(needs_prefix, raw_image)

In [None]:
# Save the article metadata; the row index is not written to the file
df.to_csv(path_or_buf='NYT_data.csv', index=False)

### Web scraping (full article texts)

In [None]:
# Reload the saved article metadata from disk
df = pd.read_csv(filepath_or_buffer='NYT_data.csv')

In [None]:
def get_full_text(df, timeout=30):
    """Download every article listed in ``df['n_url']`` and store its text.

    The frame is modified in place: a ``full_text`` column is added (or
    overwritten) and filled row by row, so partial results survive an
    interrupted run. The same frame is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``n_url`` column with the article URLs. Rows are
        addressed by position, so any index works.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up on that row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows whose page could not be fetched or parsed
        keep the placeholder string ``'NaN'``.
    """
    df['full_text'] = 'NaN'
    text_col = df.columns.get_loc('full_text')

    # The context manager closes the session's connections when the loop ends.
    with requests.Session() as session:
        for pos, url in enumerate(tqdm(df['n_url'].tolist())):
            try:
                response = session.get(url, timeout=timeout)
                soup = BeautifulSoup(response.text, 'lxml')
            except Exception as e:
                # Skip this row. Falling through would reuse the previous
                # row's page (or hit an unbound `soup` on the first row).
                print(f'row {pos}: {e}')
                continue

            #Extracting all HTML text under tag 'p'
            paragraphs = soup.find_all('p')
            if not paragraphs:
                # NOTE(review): this filtered lookup cannot match anything the
                # unfiltered one missed; the two searches were probably meant
                # to run in the opposite order -- confirm before changing.
                paragraphs = soup.find_all('p', itemprop = 'articleBody')

            # Join the paragraphs and collapse all whitespace runs once
            article = ' '.join(' '.join(p.get_text() for p in paragraphs).split())

            # Positional write; chained `df['full_text'][j] = ...` is not
            # guaranteed to modify `df` itself.
            df.iat[pos, text_col] = article

    return df

In [None]:
# Scrape the text of every article in df. get_full_text fills df['full_text']
# in place and returns the same frame, which is displayed as this cell's output.
get_full_text(df)

In [None]:
# Save the metadata together with the scraped article texts; the row index is
# not written to the file
df.to_csv(path_or_buf='NYT_data_text.csv', index=False)