Use the pynytimes library to retrieve New York Times news headlines for the WPI Machine Learning course final project.

Links to referenced package and API:
* https://pypi.org/project/pynytimes/
* https://developer.nytimes.com/apis


In [None]:
pip install --upgrade pynytimes

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pynytimes
  Downloading pynytimes-0.8.0-py3-none-any.whl (21 kB)
Installing collected packages: pynytimes
Successfully installed pynytimes-0.8.0


In [None]:
import time
from pynytimes import NYTAPI
import datetime
from datetime import timedelta
import pandas as pd

In [None]:
# Set the number of desired query results.
# This value is passed as `results` to each article search below, and it also
# determines how many "News N" column names are generated for the dataframe.
NUM_RESULTS = 30

In [None]:
# Map positional dataframe column indices to readable labels: index 0 is the
# date, and indices 1..NUM_RESULTS are the headline slots "News 0", "News 1", ...
column_names = {
  0: 'Date',
  **{slot + 1: f'News {slot}' for slot in range(NUM_RESULTS)},
}

print(column_names)

{0: 'Date', 1: 'News 0', 2: 'News 1', 3: 'News 2', 4: 'News 3', 5: 'News 4', 6: 'News 5', 7: 'News 6', 8: 'News 7', 9: 'News 8', 10: 'News 9', 11: 'News 10', 12: 'News 11', 13: 'News 12', 14: 'News 13', 15: 'News 14', 16: 'News 15', 17: 'News 16', 18: 'News 17', 19: 'News 18', 20: 'News 19', 21: 'News 20', 22: 'News 21', 23: 'News 22', 24: 'News 23', 25: 'News 24', 26: 'News 25', 27: 'News 26', 28: 'News 27', 29: 'News 28', 30: 'News 29'}


Sign up for an API key at https://developer.nytimes.com/apis

In [None]:

# Placeholder only: replace with your own NYT developer API key before running.
apikey = "YOUR KEY HERE"

# Client object used for every article search in this notebook.
# NOTE(review): parse_dates=True presumably makes pynytimes return date fields
# as datetime objects rather than strings -- confirm against the pynytimes docs.
nyt = NYTAPI(apikey, parse_dates=True)

In [None]:
# News desk values passed as the "news_desk" search option, chosen to narrow
# the articles to data more relevant to the equities markets.
news_desk = [
  "Business Day",
  "Business",
  "Financial",
  "National",
  "Personal Investing",
  "Politics",
  "U.S.",
  "World",
]

In [None]:
# Section name is another filter the API offers, but it appeared to work only
# intermittently, so the news desk filter is used instead. Kept for reference.
section_name = [
  "U.S.",
  "World",
]

In [None]:
def extract_headlines(articles, date):
  """Build one row of headline data from the results of a NYT query.

  Args:
    articles: iterable of article records; each record is indexed as
      record["headline"]["main"] to obtain its headline.
    date: value stored as the first element of the returned row.

  Returns:
    A list that starts with `date`, followed by the main headline of every
    article in the order the articles were supplied.
  """
  return [date] + [item["headline"]["main"] for item in articles]


In [None]:
# This is a single search which I have commented out so it isn't run during a 
# "run all". I've retained it for reference 

# articles = nyt.article_search(
#     # query = "",
#     results = 30,
#     dates = {
#         "begin": datetime.datetime(2018, 1, 31),
#         "end": datetime.datetime(2022, 3, 31)
#     },
#     options = {
#         # "sort": "oldest",
#         "sources": [
#             "New York Times",
#             "AP",
#             "Reuters"
#         ],
#         "section_name": section_name
#         # "news_desk": news_desk
#         # "type_of_material": types_of_material
#     }
# )
# len(articles)

# Intended to run overnight


In [None]:
# Query the NYT article search one day at a time over an inclusive date range,
# collect each day's headlines as one row, then export the table to CSV.

# initialize the data list; each entry is one row: [date, headline, headline, ...]
archive_news = []

# set a start date and end date for the query (both days are included)
start_date = datetime.datetime(2018, 1, 1)
end_date = datetime.datetime(2018, 1, 3)

# create a timedelta object to increment the loop 1 day at a time
delta = timedelta(days=1)

# set progress to 0 and calculate the number of day-steps in the range.
# Used for convenience to let us know how long the query still has to run
progress = 0
tot_days = (end_date - start_date).days

# Advance a separate cursor so start_date still holds the configured start
# of the range after the loop has finished.
current_date = start_date

while current_date <= end_date:
  # print status of these requests. A single-day range gives tot_days == 0,
  # so guard the division instead of raising ZeroDivisionError.
  percent_done = progress / tot_days * 100 if tot_days else 100.0
  print(current_date.strftime("%Y-%m-%d"), percent_done, "%")
  progress += 1

  # insert a 6 second delay to stay under the API Max requests (10/minute)
  time.sleep(6)

  # Build and execute the query, restricted to the single day current_date
  articles = nyt.article_search(
    # query = "",
    results = NUM_RESULTS,
    dates = {
        "begin": current_date,
        "end": current_date
    },
    options = {
        # "sort": "oldest",
        "sources": [
            "New York Times",
            "AP",
            "Reuters"
        ],
        # "section_name": section_name,
        "news_desk": news_desk
        # "type_of_material": types_of_material
    }
    )

  # append the results from this iteration before starting the next
  archive_news.append(extract_headlines(articles, current_date))
  current_date += delta

# convert the list to a pandas dataframe
# use the column names we created earlier to improve readability
df = pd.DataFrame(archive_news)
df=df.rename(columns=column_names)

# Use Google Colab capabilities to save the dataframe to the VM and download it
from google.colab import files
filename = "NYTNews test.csv"

df.to_csv(filename, encoding = 'utf-8-sig', index=False)
files.download(filename)


2018-01-01 0.0 %
2018-01-02 50.0 %
2018-01-03 100.0 %


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# from google.colab import files

# infiles = ["NYTNews 1H2020.csv", 
#          "NYTNews 1H2020.csv",
#          "NYTNews 1H2020.csv",
#          "NYTNews 1H2020.csv",
#          "NYTNews 1H2020.csv",
#          "NYTNews 1H2020.csv",  
#          ]

# outfile = "NYTNews.csv"

# df = pd.DataFrame()

# for file in infiles:
#   # tempDf = pd.read_csv(file, index_col='Date')
#   tempDf = pd.read_csv(file)

#   frames = [df, tempDf]
#   df = pd.concat(frames)
#   print(len(df))


# df.to_csv(outfile, encoding = 'utf-8-sig', index=False) 
# files.download(outfile)