In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 100
from bs4 import BeautifulSoup
import requests
import time
import datetime
nyt_api = 'key goes here'

Gathering full text from the new york times is a bit of a roundabout process. The NYT API is reasonably straightforward, but it won't give you the full text of the article. See below for an example of the JSON file you get from a query to the API: down at the bottom a 'meta' portion telling you how many hits there were for the particular query, and above, a 'docs' section with info for up to ten of the articles found. You only get a snippet, a lead paragraph and an abstract, though, not a full text (and some times, depending on what sort of article it is, the lead paragraph and the abstract can be very short - "These were the key takeaways from the meeting", or something like that).

In [22]:
fq= 'fq=netflix&begin_date=20190717&end_date=20190717&page=1'
nyt_url = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={fq}&api-key={nyt_api}'
resp2 = requests.get(nyt_url)
resp2.json()['response']

{'docs': [{'web_url': 'https://www.nytimes.com/2019/07/17/briefing/heat-wave-congress-national-parks-your-wednesday-evening-briefing.html',
   'snippet': 'Here’s what you need to know at the end of the day.',
   'lead_paragraph': '(Want to get this briefing by email? Here’s the sign-up.) ',
   'abstract': 'Here’s what you need to know at the end of the day.',
   'blog': {},
   'source': 'The New York Times',
   'multimedia': [{'rank': 0,
     'subtype': 'xlarge',
     'caption': None,
     'credit': None,
     'type': 'image',
     'url': 'images/2019/07/17/briefing/c17PMbriefing-us-ss-promo/17PMbriefing-us-ss-slide-Q6HA-articleLarge.jpg',
     'height': 400,
     'width': 600,
     'legacy': {'xlarge': 'images/2019/07/17/briefing/c17PMbriefing-us-ss-promo/17PMbriefing-us-ss-slide-Q6HA-articleLarge.jpg',
      'xlargewidth': 600,
      'xlargeheight': 400},
     'subType': 'xlarge',
     'crop_name': 'articleLarge'},
    {'rank': 0,
     'subtype': 'popup',
     'caption': None,
     '

Luckily, included in the NYT API response is the full url for the article, so we can scrape the full text in a different process.

So, this is a two step process. First step is compiling a list of all the urls, and then step two is going through and scraping each article individually.

In [3]:
def article_url_getter(company, date):
    time.sleep(6) #The NYT API only allows 10 calls a minute. It's important to have a pause up front
    #because if it hits a couple of days in a row with no articles, 
    #it'll make those requests one right after eachother
    fq= f'fq={company}&begin_date={date}&end_date={date}'
    nyt_url = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={fq}&api-key={nyt_api}'
    resp = requests.get(nyt_url)
    urls = [] #ultimately will return a list of urls
    if resp.json()['response']['meta']['hits']==0:
        return [] #if there are no articles, just return an empty list (I take care of this later)
    if resp.json()['response']['meta']['hits']<11: #need to check if we'll need more than one call
        for doc in resp.json()['response']['docs']:
            urls.append(doc['web_url'])
    if resp.json()['response']['meta']['hits']>11: #Each call can only return ten articles
        if resp.json()['response']['meta']['hits'] % 10 ==0:
            num_pages= int(resp.json()['response']['meta']['hits']/10)
        else:
            num_pages= int(resp.json()['response']['meta']['hits']//10) + 1
        for doc in resp.json()['response']['docs']:
            urls.append(doc['web_url'])
        for page in range(0,num_pages):
            time.sleep(4)
            fq= f'fq={company}&begin_date={date}&end_date={date}&page={page}'
            nyt_url = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={fq}&api-key={nyt_api}'
            resp = requests.get(nyt_url)
            for doc in resp.json()['response']['docs']:
                urls.append(doc['web_url'])
    return urls
    

In [31]:
test_article_list = article_url_getter('netflix','20190717')
test_article_list

['https://www.nytimes.com/2019/07/17/arts/television/whats-on-tv-wednesday-suits-and-babylon.html',
 'https://www.nytimes.com/reuters/2019/07/17/business/17reuters-netflix-results.html',
 'https://www.nytimes.com/aponline/2019/07/17/business/ap-us-netflix-results.html',
 'https://www.nytimes.com/2019/07/17/business/media/netflix-earnings-subscribers.html',
 'https://www.nytimes.com/reuters/2019/07/17/arts/17reuters-television-gossip-girl.html',
 'https://www.nytimes.com/2019/07/17/arts/dance/thom-yorke-anima-dance.html',
 'https://www.nytimes.com/aponline/2019/07/17/business/ap-us-business-highlights.html',
 'https://www.nytimes.com/2019/07/17/nyregion/el-chapo-sentencing.html',
 'https://www.nytimes.com/reuters/2019/07/17/technology/17reuters-usa-stocks-fang.html',
 'https://www.nytimes.com/reuters/2019/07/17/business/17reuters-talktalk-outlook.html',
 'https://www.nytimes.com/2019/07/17/arts/television/whats-on-tv-wednesday-suits-and-babylon.html',
 'https://www.nytimes.com/reuters/2

And now to scrape the articles themselves, cleaned up a little to return all the paragraphs together as one string

In [2]:
def scrape_articles_text(url):

    session = requests.Session()

    req = session.get(url)
    soup = BeautifulSoup(req.text, 'lxml')

    paragraph_tags = soup.find_all('p', class_= 'css-exrw3m evys1bk0')
    if paragraph_tags == []:
        paragraph_tags = soup.find_all('p', itemprop = 'articleBody')

    article = ''
    for p in paragraph_tags:
        article = article + ' ' + p.get_text()

    # Clean article replacing unicode characters
    article = article.replace(u'\u2018', u"'").replace(u'\u2019', u"'").replace(u'\u201c', u'"').replace(u'\u201d', u'"')

    return article

In [4]:
#this just iterates through a list and returns all of the urls scraped, with the text in a list
def article_scraper(url_list):
    texts = []
    for url in url_list:
        texts.append(scrape_articles_text(url))
    return texts

In [49]:
article_list = article_scraper(test_article_list)
article_list

[' SUITS 9 p.m. on USA Network. This long-running legal drama begins its ninth and final season. Since the departure of Mike Ross (Patrick J. Adams), his wife, Rachel Zane (Meghan Markle), and the law firm\'s head honcho, Jessica Pearson (Gina Torres) in Season 7, fans have watched the series mainstays, Harvey Specter (Gabriel Macht), Louis Litt (Rick Hoffman) and Donna Paulsen (Sarah Rafferty) vie for control of the firm. And while the Duchess of Sussex will in all likelihood not be back, fans can expect to see Adams return as a guest star in the show\'s final 10 episodes. PEARSON 10 p.m. on USA Network. As "Suits" embarks on its final season, one of the show\'s characters finds footing in her own spinoff. Ms. Torres reprises her role as Jessica Pearson, a former top lawyer who\'s now working as a political fixer for the new mayor of Chicago, played by the actor Morgan Spector. In an interview in The New York Times, Ms. Torres opened up about her character\'s new chapter. "What I love

And finally, to actually build up a dataframe of news articles, indexed by date. First, a date formatting function which takes a datetime object (which can be easily incremented by one day as you iterate through the year) and returns a string in the proper format for the NYT API. Then the final df_maker function, which iterates through every day, generates a dataframe of texts and the compiles them into one master, which it returns.

In [5]:
def date_formatter(datetime_obj):
    year = str(datetime_obj.year)
    if len(str(datetime_obj.month))==1:
        month = '0'+str(datetime_obj.month)
    else:
        month = str(datetime_obj.month)
        
    if len(str(datetime_obj.day))==1:
        day = '0'+str(datetime_obj.day)
    else:
        day = str(datetime_obj.day)
    return year+month+day

In [6]:
def company_df_maker(company, start, num_days):

    df = pd.DataFrame(columns=['date','text'])
    for n in range(0,num_days):
        date = start + datetime.timedelta(n)
        text_date = date_formatter(start + datetime.timedelta(n))
        url_list = article_url_getter(company,text_date)
        if len(url_list) != 0: #If we had no articles, it just returns an empty list
            articles = article_scraper(url_list)
            temp = pd.DataFrame(columns=['date','text'])
            temp['date'] = [date]*len(articles)
            temp['text'] = articles
            df = pd.concat([df,temp])

    return df
        

A test with just three days and then a full year, saved to csv. When compiling the full dataframes, I did not run it in the notebook! I had a seperate .py file that I run in the terminal, this is just an example. Between the sleeping in between API calls and the time it takes to scrape the articles, one year of Netflix stories took more than an hour to run (And some companies with more stores take even longer!).

In [None]:
start = datetime.date(2017,1,1)

In [72]:
netflix_df = company_df_maker('netflix',start, 3)
netflix_df

Unnamed: 0,date,text
0,2017-01-01,"Rejoice: After three years away, the hit BBC ..."
0,2017-01-02,"UTSUNOMIYA, Japan — In the sofa-appointed lis..."
0,2017-01-03,"Ladies and gentlemen, please, please take you..."
1,2017-01-03,


In [73]:
netflix_df = company_df_maker('netflix',start, 365)

In [77]:
netflix_df.to_csv('netflix.csv')

In [94]:
netflix_df

Unnamed: 0,date,text
0,2017-01-01,"Rejoice: After three years away, the hit BBC ..."
0,2017-01-02,"UTSUNOMIYA, Japan — In the sofa-appointed lis..."
0,2017-01-03,"Ladies and gentlemen, please, please take you..."
1,2017-01-03,
0,2017-01-04,"In the first episode of ""One Day at a Time,"" ..."
1,2017-01-04,"Hulu, one of several companies vying to creat..."
2,2017-01-04,A few cliffhangers left viewers dangling in t...
3,2017-01-04,Sam Sifton emails readers of Cooking five day...
0,2017-01-05,When the director Davis Guggenheim was comple...
1,2017-01-05,
