# Retrieving and Writing New York Times article into a JSON file 

### NYT API: Article search (the only section we need to extract all titles)
* **RESTRICTION-** the maximum number of articles that can be returned at once: 10
* Use the Article Search API to look up articles by keyword. You can refine your search using filters and facets.
* reference: https://ashar180.medium.com/crawling-ny-times-api-for-relevant-articles-b6134b651054

In [1]:
# !pip install requests

In [27]:
import json
import requests
import time #for delaying retrieval
from urllib.parse import quote

In [28]:
#need to get your own in New York Times API
api_key1 = 'your_own_api_key1'
api_key2 = 'your_friend_api_key2'
# api_key3 = 'own_key'
# api_key4 = 'own_key'
# api_key5 = 'own_key'
# api_key6 = 'own_key'
# api_key7 = 'own_key'
# api_key8 = 'own_key'
# api_key9 = 'own_key'
# api_key10 = 'own_key'
# api_key11 = 'own_key'
# api_key12 = 'own_key'
# api_key13 = 'own_key'
# api_key14 = 'own_key'
# api_key15 = 'own_key'
# api_key16 = 'own_key'
# api_key17 = 'own_key'
# api_key18 = 'own_key'
# api_key19 = 'own_key'

In [29]:
api_keys = [api_key1, api_key2] #, api_key3, api_key4, api_key5, api_key6, api_key7, api_key8, api_key9, api_key10, api_key11, api_key12, api_key13, api_key14, api_key15, api_key16, api_key17, api_key18, api_key19]

In [30]:
url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'

In [31]:
#import titles from imdb
import pandas as pd
query_list = pd.read_csv('unique_titles.csv')

Querying only for movie titles with start year >2020 or null because we only want recent information since audience preferences may change over time:

In [32]:
#query only movies start year> 2000 or is null
default_year = 2000  # Replace '\\N' with this default value
query_list['startYear'] = query_list['startYear'].replace('\\N', 0)
query_list['startYear'] = query_list['startYear'].astype(int)

In [33]:
query_list = query_list.loc[query_list['startYear'] > 2020, 'primaryTitle']

In [34]:
len(query_list)/4000*3

32.16675

In [37]:

query_list = query_list

all_articles = []

def fetch_articles_with_api_key(api_key, movie_title):
    page_number = 0 #resets page number for each movie_title (define page by each movie instead of the whole list of movie titles)

    while True:
        encoded_title = quote(movie_title) 
      
        query_params = {
            "api-key": api_key,
            "q": encoded_title,
            "fq": "news_desk:Movies",
            "begin_date": 18510101, #start date of articles set to 1851 because articles before 1851 are arhchived, however, since movie titles are focused on >2020, the returned dates shouldn't be too long before 2020 (since there probably wouldn't be articles for the movie until planned production)
            "sort": "newest",
            "page": page_number  # Pagination, page 0 will return the first page of results
        }

        response = requests.get(url, params=query_params)

        # Process the data as per your requirements
        if response.status_code == 200: #request code 200 means the call was successful
            data = response.json()
            total_hits = data['response']['meta']['hits']
            print(f"Total results found for '{movie_title}':", total_hits)

            articles = data['response']['docs']

            #check if there's any articles left to loop through
            if not articles: #if there's no article returned aka no more articles for the movie, = true
                break #stop the loop 

            #loop to load article in json 
            for article in articles:
                # Initialize author to an empty string
                author = ""
                
                # Extract relevant fields from the article
                headline = article.get('headline', {}).get('main', '')
                abstract = article.get('abstract', '')
                lead_paragraph = article.get('lead_paragraph', '')
                snippet = article.get('snippet', '')
                pub_date = article.get('pub_date', '')
                news_desk = article.get('news_desk', '')
                web_url = article.get('web_url', '')
                source = article.get('source', '')
                
                persons = article.get('byline', {}).get('person', [])
                for person in persons:
                    first_name = person.get('firstname', '')
                    last_name = person.get('lastname', '')
                    if first_name and last_name:
                        author = f"{first_name} {last_name}"
                        break
                    elif first_name:
                        author = first_name
                        break
                    elif last_name:
                        author = last_name
                        break

                if not author:
                    author = "N/A"

                article_info = {
                    "Headline": headline,
                    "Abstract": abstract,
                    "Lead_Paragraph": lead_paragraph,
                    "Snippet": snippet,
                    "Published_date": pub_date,
                    "Author": author,
                    "News_desk": news_desk,
                    "URL": web_url,
                    "Source": source
                }

                all_articles.append(article_info)
           
            page_number += 1

            #delay in abstracting bc there's a limit on how many articles you can retrieve based on time
            time.sleep(1)  

        elif response.status_code == 429:
            # Rate limit exceeded, switch to the next API key
            print("Rate limit exceeded for current API key. Switching to the next one.")
            if api_keys:
                next_api_key = api_keys.pop(0)
                print("Using API key:", next_api_key)
                fetch_articles_with_api_key(next_api_key, movie_title)
            else:
                print("All API keys have been used. Exiting.")
                break

        else:
            print("Error:", response.status_code)
            break

for movie_title in query_list:
    for api_key in api_keys:
        fetch_articles_with_api_key(api_key, movie_title)


# Save the final version of all_articles to a JSON file after all movie titles are processed
with open("nyt_articles_all.json", "w") as json_file:
    json.dump(all_articles, json_file, indent=4)

Rate limit exceeded for current API key. Switching to the next one.
Using API key: Mq4LOxs0B3o0oHjxNH4WVXupXaHqCswB
Rate limit exceeded for current API key. Switching to the next one.
Using API key: Oe5z2TqFkBykOO9AKUmN7eIpfULuHTtl
Rate limit exceeded for current API key. Switching to the next one.
Using API key: 0HEa6jjsrRLPa6yYsQo1XdsYlHfhBwqg
Rate limit exceeded for current API key. Switching to the next one.
Using API key: dktNmBFJQdto0OA06CPQUdxWgwHrMsE9
Rate limit exceeded for current API key. Switching to the next one.
Using API key: btEcbjWBtm0lxibRaoQBJb0e4lT113iN
Rate limit exceeded for current API key. Switching to the next one.
Using API key: UwbYWW5yDf0r4o0xmK3qTwHUTlN4MdYg
Rate limit exceeded for current API key. Switching to the next one.
Using API key: ePx5Dc90Wz6cj8Lk3AqAskbCVz0d0nNu
Rate limit exceeded for current API key. Switching to the next one.
Using API key: ycFL8N0HJNIZslsdmknU5slzgnPmy0sJ
Rate limit exceeded for current API key. Switching to the next one.
Usin