### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
from pandas import json_normalize
import json

In [2]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_KEY")
tmdb_api_key = os.getenv("TMDB_KEY")

### Access the New York Times API

In [3]:
# Set the base URL
nyt_base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
nyt_query_url = (
    f"{nyt_base_url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
    + f'&fq={filter_query}&sort={sort}&fl={field_list}'
)

In [24]:
# Create an empty list to store the reviews
reviews_list = []

# loop through pages 0-19
for page_of_reviews in range(0,1):
    # create query with a page number
    # API results show 10 articles at a time
    nyt_query_url = nyt_query_url + f"&page={page_of_reviews + 1}"
    
    # Make a "GET" request and retrieve the JSON
    response = requests.get(nyt_query_url)
    page_of_reviews = response.json()
    # Add a twelve second interval between queries to stay within API query limits
    # placing this after the other for loop
    # Print the page number that had no results then break from the loop
    if len(page_of_reviews["response"]["docs"]) == 0:
        print(f"Page {page} had no reviews")
        break
    # Try and save the reviews to the reviews_list
    for review in page_of_reviews["response"]["docs"]:
        # loop through the reviews["response"]["docs"] and append each review to the list
        reviews_list.append(review)
        # Print the page that was just retrieved
        print(json.dumps(page_of_reviews, indent=4))

        # Print the page number that had no results then break from the loop
        # This makes no sense to put in here, placing that above.
    time.sleep(12)

{
    "status": "OK",
    "copyright": "Copyright (c) 2024 The New York Times Company. All Rights Reserved.",
    "response": {
        "docs": [
            {
                "web_url": "https://www.nytimes.com/2023/01/31/movies/pamela-a-love-story-review.html",
                "snippet": "This documentary from Ryan White rewinds, to powerful effect, on Pamela Anderson\u2019s life and fame.",
                "source": "The New York Times",
                "headline": {
                    "main": "\u2018Pamela, a Love Story\u2019 Review: A Frank Look Back",
                    "kicker": null,
                    "content_kicker": null,
                    "print_headline": "Pamela, a Love Story",
                    "name": null,
                    "seo": null,
                    "sub": null
                },
                "keywords": [
                    {
                        "name": "subject",
                        "value": "Documentary Films and Programs",
             

In [25]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
    # Why am I previewing data after saving it :(
print(json.dumps(reviews_list[:5], indent=4))

[
    {
        "web_url": "https://www.nytimes.com/2023/01/31/movies/pamela-a-love-story-review.html",
        "snippet": "This documentary from Ryan White rewinds, to powerful effect, on Pamela Anderson\u2019s life and fame.",
        "source": "The New York Times",
        "headline": {
            "main": "\u2018Pamela, a Love Story\u2019 Review: A Frank Look Back",
            "kicker": null,
            "content_kicker": null,
            "print_headline": "Pamela, a Love Story",
            "name": null,
            "seo": null,
            "sub": null
        },
        "keywords": [
            {
                "name": "subject",
                "value": "Documentary Films and Programs",
                "rank": 1,
                "major": "N"
            },
            {
                "name": "persons",
                "value": "Anderson, Pamela (1967- )",
                "rank": 2,
                "major": "N"
            },
            {
                "name": "persons",

In [6]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
df = json_normalize(reviews_list)
df.head()

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2023/01/31/movies/pame...,"This documentary from Ryan White rewinds, to p...",The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2023-01-31T12:00:05+0000,295,"‘Pamela, a Love Story’ Review: A Frank Look Back",,,"Pamela, a Love Story",,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",
1,https://www.nytimes.com/2023/01/19/movies/in-f...,"In Matt Carter’s gay rugby film, sports and ro...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-19T17:50:16+0000,281,‘In From the Side’ Review: Love and Rugby Play...,,,In From the Side,,,,By Kyle Turner,"[{'firstname': 'Kyle', 'middlename': None, 'la...",
2,https://www.nytimes.com/2023/01/19/movies/afte...,In this intelligent melodrama by the director ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-19T12:00:06+0000,359,‘After Love’ Review: The Other Woman,Critic’s Pick,,After Love,,,,By Beatrice Loayza,"[{'firstname': 'Beatrice', 'middlename': None,...",
3,https://www.nytimes.com/2023/01/05/movies/alca...,"In this naturalistic drama from Spain, a famil...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-05T12:00:03+0000,306,‘Alcarràs’ Review: Labor of Love,,,Alcarràs,,,,By Devika Girish,"[{'firstname': 'Devika', 'middlename': None, '...",
4,https://www.nytimes.com/2022/12/15/movies/nell...,A family archive provides intimate records of ...,The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2022-12-15T12:00:04+0000,308,"‘Nelly & Nadine’ Review: An Unlikely Love, an ...",,,Nelly &amp; Nadine,,,,By Teo Bugbee,"[{'firstname': 'Teo', 'middlename': None, 'las...",


In [7]:
# Extract the title from the "headline.main" column and
# df.columns
# save it to a new column "title"
df["title"] = df["headline.main"]
# df.head()
# Title is between unicode characters \u2018 and \u2019. 
def extract_content(input_string):
    start_index = input_string.find('\u2018')
    end_index = input_string.find(' Review') + len(' Review')
    
    if start_index != -1 and end_index != -1:
        return input_string[start_index:end_index]
    else:
        return "Pattern not found."
df["title"] = df["title"].apply(extract_content)
df.head()
# End string should include " Review" to avoid cutting title early

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,https://www.nytimes.com/2023/01/31/movies/pame...,"This documentary from Ryan White rewinds, to p...",The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2023-01-31T12:00:05+0000,295,"‘Pamela, a Love Story’ Review: A Frank Look Back",,,"Pamela, a Love Story",,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",,"‘Pamela, a Love Story’ Review"
1,https://www.nytimes.com/2023/01/19/movies/in-f...,"In Matt Carter’s gay rugby film, sports and ro...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-19T17:50:16+0000,281,‘In From the Side’ Review: Love and Rugby Play...,,,In From the Side,,,,By Kyle Turner,"[{'firstname': 'Kyle', 'middlename': None, 'la...",,‘In From the Side’ Review
2,https://www.nytimes.com/2023/01/19/movies/afte...,In this intelligent melodrama by the director ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-19T12:00:06+0000,359,‘After Love’ Review: The Other Woman,Critic’s Pick,,After Love,,,,By Beatrice Loayza,"[{'firstname': 'Beatrice', 'middlename': None,...",,‘After Love’ Review
3,https://www.nytimes.com/2023/01/05/movies/alca...,"In this naturalistic drama from Spain, a famil...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-05T12:00:03+0000,306,‘Alcarràs’ Review: Labor of Love,,,Alcarràs,,,,By Devika Girish,"[{'firstname': 'Devika', 'middlename': None, '...",,‘Alcarràs’ Review
4,https://www.nytimes.com/2022/12/15/movies/nell...,A family archive provides intimate records of ...,The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2022-12-15T12:00:04+0000,308,"‘Nelly & Nadine’ Review: An Unlikely Love, an ...",,,Nelly &amp; Nadine,,,,By Teo Bugbee,"[{'firstname': 'Teo', 'middlename': None, 'las...",,‘Nelly & Nadine’ Review


In [8]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string
df["keywords"] = df["keywords"].apply(extract_keywords)
df.head()

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,https://www.nytimes.com/2023/01/31/movies/pame...,"This documentary from Ryan White rewinds, to p...",The New York Times,subject: Documentary Films and Programs;person...,2023-01-31T12:00:05+0000,295,"‘Pamela, a Love Story’ Review: A Frank Look Back",,,"Pamela, a Love Story",,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",,"‘Pamela, a Love Story’ Review"
1,https://www.nytimes.com/2023/01/19/movies/in-f...,"In Matt Carter’s gay rugby film, sports and ro...",The New York Times,subject: Movies;creative_works: In From the Si...,2023-01-19T17:50:16+0000,281,‘In From the Side’ Review: Love and Rugby Play...,,,In From the Side,,,,By Kyle Turner,"[{'firstname': 'Kyle', 'middlename': None, 'la...",,‘In From the Side’ Review
2,https://www.nytimes.com/2023/01/19/movies/afte...,In this intelligent melodrama by the director ...,The New York Times,subject: Movies;creative_works: After Love (20...,2023-01-19T12:00:06+0000,359,‘After Love’ Review: The Other Woman,Critic’s Pick,,After Love,,,,By Beatrice Loayza,"[{'firstname': 'Beatrice', 'middlename': None,...",,‘After Love’ Review
3,https://www.nytimes.com/2023/01/05/movies/alca...,"In this naturalistic drama from Spain, a famil...",The New York Times,"subject: Movies;persons: Simon, Carla;creative...",2023-01-05T12:00:03+0000,306,‘Alcarràs’ Review: Labor of Love,,,Alcarràs,,,,By Devika Girish,"[{'firstname': 'Devika', 'middlename': None, '...",,‘Alcarràs’ Review
4,https://www.nytimes.com/2022/12/15/movies/nell...,A family archive provides intimate records of ...,The New York Times,subject: Documentary Films and Programs;person...,2022-12-15T12:00:04+0000,308,"‘Nelly & Nadine’ Review: An Unlikely Love, an ...",,,Nelly &amp; Nadine,,,,By Teo Bugbee,"[{'firstname': 'Teo', 'middlename': None, 'las...",,‘Nelly & Nadine’ Review


In [9]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
titles_list = df["title"].to_list()
print(titles_list[:5])

['‘Pamela, a Love Story’ Review', '‘In From the Side’ Review', '‘After Love’ Review', '‘Alcarràs’ Review', '‘Nelly & Nadine’ Review']


### Access The Movie Database API

In [21]:
# Prepare The Movie Database query
tmdb_movie_query_url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

In [29]:
# Create an empty list to store the results
tmdb_movies_list = []

# Create a request counter to sleep the requests after a multiple
# of 50 requests
request_counter = 0
def batch_requests(batch_size):
    if request_counter % batch_size == 0 and request_counter != 0:
        time.sleep(12)
    

# Loop through the titles
for index, title in enumerate(titles_list):
    # Check if we need to sleep before making a request
    batch_requests(index + 1)

    # Add 1 to the request counter
    request_counter+=1
    
    # Perform a "GET" request for The Movie Database
    tmdb_query_url = tmdb_movie_query_url+title+tmdb_key_string
    tmdb_query_response = requests.get(tmdb_query_url)
    tmdb_query_response = tmdb_query_response.json()

    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.
    try:
        # Get movie id
        movie_id = tmdb_query_response["results"][0]["id"]
        
        # Make a request for a the full movie details
        single_movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_api_key}"

        # Execute "GET" request with url
        movie_response = requests.get(single_movie_url)
        movie_response = movie_response.json()
        print(json.dumps(movie_response, indent=4))
        # Extract the genre names into a list
        genre_names = movie_response["genres"]

        # Extract the spoken_languages' English name into a list
        language_names = movie_response["spoken_languages"]["english_name"]

        # Extract the production_countries' name into a list
        production_country = movie_response["production_countries"]["name"]

        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list
        movie_data = {genre_names, language_names, production_country}
        tmdb_movies_list.append(movie_data)

    except KeyError as e:
    # Handling the KeyError
        print(f"KeyError: {e}. The key does not exist in the dictionary.")
    except IndexError as e:
        print(f"IndexError: {e}. The index does not exist.")
    except Exception as e:
        print("Movie not found")
        # Print out the title that was found
    
    



IndexError: list index out of range. The index does not exist.
IndexError: list index out of range. The index does not exist.
IndexError: list index out of range. The index does not exist.
IndexError: list index out of range. The index does not exist.
{
    "adult": false,
    "backdrop_path": "/evGlZtheDf2sDZZA5Vg8RzgES02.jpg",
    "belongs_to_collection": null,
    "budget": 0,
    "genres": [
        {
            "id": 18,
            "name": "Drama"
        },
        {
            "id": 10749,
            "name": "Romance"
        }
    ],
    "homepage": "",
    "id": 12652,
    "imdb_id": "tt0113947",
    "original_language": "fr",
    "original_title": "Nelly et Mr. Arnaud",
    "overview": "Nelly leaves her lazy, unemployed husband to work for retired judge Mr Arnaud, forty years her senior, after he offers to clear her bills for her. While she types his memoirs the two develop a close friendship, but Arnaud becomes jealous when Nelly begins dating his good-looking young publ

In [12]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [13]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [14]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [15]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [16]:
# Drop "byline.person" column


In [17]:
# Delete duplicate rows and reset index


In [18]:
# Export data to CSV without the index
