# Retrieving and Writing New York Times article into a JSON file 
* Reference: https://medium.com/@danalindquist/using-new-york-times-api-and-jq-to-collect-news-data-a5f386c7237b

In [25]:
# !pip install requests

In [26]:
import requests

In [None]:
#need to get your own in New York Times API
api_key = your_own_key

### NYT API: Article search (the only section we need to extract all titles)
* **RESTRICTION-** the maximum number of articles that can be returned at once: 10
* Use the Article Search API to look up articles by keyword. You can refine your search using filters and facets.
* reference: https://ashar180.medium.com/crawling-ny-times-api-for-relevant-articles-b6134b651054
* did not use movie review search bc it doesn't include context or URLs

In [68]:
import json
import requests
import time #for delaying retrieval
from urllib.parse import quote

In [69]:
# url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?q=' + query + '&api-key=' + api_key

In [70]:
#need to get your own in New York Times API
api_key = 'your_own_api'

In [71]:
url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'

In [72]:
#import titles from imdb
import pandas as pd
query_list = pd.read_csv('unique_titles.csv')

In [73]:
#query only movies start year> 2000 or is null
default_year = 2000  # Replace '\\N' with this default value
query_list['startYear'] = query_list['startYear'].replace('\\N', 0)
query_list['startYear'] = query_list['startYear'].astype(int)

Querying only for movie titles with start year >2020 because we only want recent information since audience preferences may change over time:

In [74]:
query_list = query_list.loc[query_list['startYear'] > 2020, 'primaryTitle']

In [75]:
len(query_list)

42889

In [76]:
query_list.head(1000)

6549               Istoriya grazhdanskoy voyny
52028        Histórias de Combóios em Portugal
64136                           Loading Ludwig
67526                    Bell of Purity Temple
68272                       Neues in Wittstock
                          ...                 
222746                      A Strippers Prayer
222749                      Mission Concepción
222774                         Beyond the Neon
222779    Casanova B - All you need i$ Lov(e)a
222786                                Rankolla
Name: primaryTitle, Length: 1000, dtype: object

In [77]:
query_list = query_list

all_articles = []

for movie_title in query_list: 
    
    page_number = 0

    while True: 
    
       # encoded_title = quote(movie_title)
        
        query_params = {
            "api-key": api_key,
            "q": movie_title,
            "fq": "news_desk:Movies",
            "begin_date": 18510101, #start date of articles set to 1851 because articles before 1851 are arhchived, however, since movie titles are focused on >2020, the returned dates shouldn't be too long before 2020 (since there probably wouldn't be articles for the movie until planned production)
            "sort": "newest",
            "page": page_number,  # Pagination, page 0 will return the first page of results
            "news_desk": "arts"
            #can set "fq": field("what to serach") to restrict fields to have specific values in the query 
        }

        response = requests.get(url, params=query_params)

        # Process the data as per your requirements
        if response.status_code == 200: #request code 200 means the call was successful
            data = response.json()
            total_hits = data['response']['meta']['hits']
            print(total_hits)

            articles = data['response']['docs']

            #check if there's any articles left to loop through
            if not articles: #if article returned is NOT empty aka if there's no articles (if empty or is zero, returns false)
                break #stop the loop 

            #loop to load article in json 
            for article in articles:
                # Initialize author to an empty string
                author = ""
                
                # Extract relevant fields from the article
                headline = article.get('headline', {}).get('main', '')
                abstract = article.get('abstract', '')
                lead_paragraph = article.get('lead_paragraph', '')
                snippet = article.get('snippet', '')
                pub_date = article.get('pub_date', '')
                news_desk = article.get('news_desk', '')
                web_url = article.get('web_url', '')
                source = article.get('source', '')
                
                persons = article.get('byline', {}).get('person', [])
                for person in persons:
                    first_name = person.get('firstname', '')
                    last_name = person.get('lastname', '')
                    if first_name and last_name:
                        author = f"{first_name} {last_name}"
                        break
                    elif first_name:
                        author = first_name
                        break
                    elif last_name:
                        author = last_name
                        break

                if not author:
                    author = "N/A"

                article_info = {
                    "Headline": headline,
                    "Abstract": abstract,
                    "Lead_Paragraph": lead_paragraph,
                    "Snippet": snippet,
                    "Published_date": pub_date,
                    "Author": author,
                    "News_desk": news_desk,
                    "URL": web_url,
                    "Source": source
                }

                all_articles.append(article_info)
           
            page_number += 1

            #delay in abstracting bc there's a limit on how many articles you can retrieve based on time
            time.sleep(1)  

        elif response.status_code == 429:
            # Rate limit exceeded, retry after suggested duration (if provided) or with an increased delay
            retry_after = response.headers.get('Retry-After')
            if retry_after:
                time.sleep(int(retry_after))
            else:
                time.sleep(5)  # delay by 5 seconds if no retry duration is specified by the server 

        else:
            print("Error:", response.status_code)
            break

#save articles in a json file 
with open("nyt_articles.json", "w") as json_file:
    json.dump(all_articles, json_file, indent=4)

0
0
0
0
0
1
1
20
20
20
2
2
3
3
0
1
1
0
0
1
1
0
1
1
0
0
0
0
0
0
0
2
2
1
1
1
1
2
2
0
4
4
0
23
23
23
23
0
0
0
1
1
4
4
8
8
0
0
0
1
1
0
1
1
3
3
0
2
2
5
5
0
0
1
1
0
7
7
4
4
9
9
1
1
63
63
63
63
63
63
63
63
1
1
2
2
0
0
42
42
42
42
42
42
2
2
1
1
0
5
5
0
7
7
0
21
21
21
21
0
2
2
3
3
0
0
6
6
12
12
12
0
0
0
0
0
0
0
0
9
9
0
25
25
25
25
0
0
0
0
0
0
3
3
0
0
0
6
6
7
7
1
1
0
0
3
3
0
0
3
3
0
111
111
111
111
111
111
111
111
111
111
111
111
111
0
5
5
2
2
0
0
1
1
0
0
2
2
0
8
8
16
16
16
0
0
2
2
6
6
0
0
105
105
105
105
105
105
105
105
105
105
105
105
0
1
1
0
34
34
34
34
34
0
17
17
17
49
49
49
49
49
49
0
0
0
1
1
4
4
1
1
1
1
1
1
50
50
50
50
50
50
13
13
13
0
34
34
34
34
34
1
1
0
0
0
0
2
2
0
0
14
14
14
0
0
2
2
2
2
0
0
2
2
93
93
93
93
93
93
93
93
93
93
93
4
4
1
1
12
12
12
21
21
21
21
0
0
0
1
1
0
1
1
0
0
0
56
56
56
56
56
56
56
10
10
0
0
0
5
5
4
4
0
1
1
0
0
2
2
1
1
11
11
11
4
4
0
0
6
6
0
11
11
11
15
15
15
3
3
20
20
20
0
2
2
0
7
7
2
2
0
1
1
14
14
14
30
30
30
30
0
0
0
0
0
0
0


KeyboardInterrupt: 

In [78]:
with open("nyt_articles2.json", "w") as json_file:
    json.dump(all_articles, json_file, indent=4)

To view the data in the first page (10 headlines) based on one title and not load in json:

In [25]:
#type in what to query in articles
text = "mission_impossible"
query = quote(text)

In [27]:
query_params = {
    "api-key": api_key,
    "q": query,
    "sort": "newest",
    "page": 0,  # Pagination, page 0 will return the first page of results
}

response = requests.get(url, params=query_params)

# Process the data as per your requirements
if response.status_code == 200: #request code 200 means the call was successful
    data = response.json()
    
    total_hits = data['response']['meta']['hits']
    print(total_hits)
    
    articles = data['response']['docs']
    for article in articles:
        
        print("-----")
        headline = article.get('headline', {}).get('main', '')
        snippet = article.get('snippet', '')
        web_url = article.get('web_url', '')

        if headline:
            print("Headline:", headline)

        if snippet:
            print("Snippet:", snippet)

        if web_url:
            print("URL:", web_url)

        #print(article['abstract'])
        #print("Author:", article['byline']['person'][0]['firstname'] + " " + article['byline']['person'][0]['lastname'])
        
        #print(article)
            #print("Title:", article[i]['display_title'])
    #print(data)
else:
    print("Error:", response.status_code)

9759
-----
Headline: On Iran, Reagan's Options Don't Seem So Good, Either; No Support for New Sanctions An Array of Unsatisfying Choices
URL: https://www.nytimes.com/1980/12/28/archives/on-iran-reagans-options-dont-seem-so-good-either-no-support-for-new.html
-----
Headline: Saturday; Cable/Subscription TV Of Special Interest
URL: https://www.nytimes.com/1980/12/28/archives/saturday-cablesubscription-tv-of-special-interest.html
-----
Headline: Television
URL: https://www.nytimes.com/1980/12/27/archives/television.html
-----
Headline: Churches Sound Ancient Message of Bethlehem; 'A Greater Awareness' 'Hear the Cries of the Poor' Churches Affirm Message of Peace on Earth and Good Will to All 'We Need Christmas' 'God Comes From Within' 'Think at Once of Family' 'You Will Respond' 'Go On With Your Life' 'Hints of Eternity' 'Time of Celebration' 'Love Is Visionary' 
URL: https://www.nytimes.com/1980/12/25/archives/churches-sound-ancient-message-of-bethlehem-a-greater-awareness.html
-----
Hea

In [None]:
type(data)

In [None]:
data['response']['docs'][1]