### Import Required Libraries and Set Up Environment Variables

In [3]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [4]:
# Set environment variables from the .env in the local environment
load_dotenv("example.env")

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [5]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL 
query_url = url + "api-key=" + nyt_api_key + "&fq=" + filter_query + "&sort=" + sort + "&begin_date=" + begin_date + "&end_date=" + end_date
response = requests.get(query_url).json()
review_list = response["response"]["docs"]
print(json.dumps(review_list, indent=4))

[
    {
        "abstract": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
        "web_url": "https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html",
        "snippet": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
        "lead_paragraph": "A trashy treat coated in a high-art gloss, \u201cThe Attachment Diaries\u201d gleefully kneads melodrama, noir, horror and sexual perversion into a pathological romance between two deeply damaged women.",
        "print_section": "C",
        "print_page": "7",
        "source": "The New York Times",
        "multimedia": [
            {
                "rank": 0,
                "subtype": "xlarge",
                "caption": null,
                "credit": null,
                "type": "image",
                "url": "images/2023/05/26/multimedia/attachment1-mbcw/attachment1-mbcw-articl

In [6]:
# Create an empty list to store the reviews
reviews = []
page = 0
# loop through pages 0-19
while page <= 19:
    # create query with a page number
    page_query_url = f"{query_url}&page={page}"
    
    # API results show 10 articles at a time
    
    
    
    # Make a "GET" request and retrieve the JSON
    response = requests.get(page_query_url).json()
    
    # Add a twelve second interval between queries to stay within API query limits
    reviews.append(response)
    time.sleep(12)
    
    # Try and save the reviews to the reviews_list
    try:
        # loop through the reviews["response"]["docs"] and append each review to the list
        for review in response["response"]["docs"]:
            reviews.append(review)
            # Print the page that was just retrieved
            print(f"Retrieved review for page {page}")
    except:
        # Print the page number that had no results then break from the loop
        print(f"No reviews found for page {page}")
        break
        
    page += 1

        


Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 0
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 1
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 2
Retrieved review for page 3
Retrieved review for page 3
Retrieved review for page 3
Retrieved review for page 3
Retrieved review for page 3
Retrieved review for

In [7]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(reviews[:5], indent=4))

[
    {
        "status": "OK",
        "copyright": "Copyright (c) 2024 The New York Times Company. All Rights Reserved.",
        "response": {
            "docs": [
                {
                    "abstract": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
                    "web_url": "https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html",
                    "snippet": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
                    "lead_paragraph": "A trashy treat coated in a high-art gloss, \u201cThe Attachment Diaries\u201d gleefully kneads melodrama, noir, horror and sexual perversion into a pathological romance between two deeply damaged women.",
                    "print_section": "C",
                    "print_page": "7",
                    "source": "The New York Times",
                    "multimedia

In [8]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
df = pd.json_normalize(reviews)
df = pd.DataFrame(df)
df.head()

Unnamed: 0,status,copyright,response.docs,response.meta.hits,response.meta.offset,response.meta.time,abstract,web_url,snippet,lead_paragraph,...,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,OK,Copyright (c) 2024 The New York Times Company....,[{'abstract': 'A gynecologist and her patient ...,344.0,0.0,18.0,,,,,...,,,,,,,,,,
1,,,,,,,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",...,"‘The Attachment Diaries’ Review: Love, Sick",,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
2,,,,,,,Two childhood friends navigate cultural differ...,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,A glossy lesson in how to pour nontraditional ...,...,Review: ‘What’s Love Got to Do With It?’ Proba...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
3,,,,,,,Religion comes between two girls falling in lo...,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,"In “You Can Live Forever,” Jaime and Marike do...",...,‘You Can Live Forever’ Review: Do You Love Me ...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",
4,,,,,,,Rachael Leigh Cook stars in this bland rom-com...,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The first thing we learn about Amanda (Rachael...,...,‘A Tourist’s Guide to Love’ Review: A Wearying...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",


In [84]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
df["title"] = df["headline.main"].rename()

# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early
def transform_title(title):
    #Start and end index
    start_index = title.find("\u2018")
    end_index = title.find("\u2019")
    #extract the title
    extracted_title = title[start_index+1:end_index]
    transform_title = extracted_title + " Review"
    return transform_title

df["title"] = df["title"].apply(transform_title)



AttributeError: 'float' object has no attribute 'find'

In [None]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string


In [None]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database


### Access The Movie Database API

In [None]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

In [None]:
# Create an empty list to store the results


# Create a request counter to sleep the requests after a multiple
# of 50 requests


# Loop through the titles

    # Check if we need to sleep before making a request


    # Add 1 to the request counter

    
    # Perform a "GET" request for The Movie Database


    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.

        # Get movie id


        # Make a request for a the full movie details


        # Execute "GET" request with url

        
        # Extract the genre names into a list


        # Extract the spoken_languages' English name into a list


        # Extract the production_countries' name into a list


        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list

        
        # Print out the title that was found



In [None]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [None]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [None]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [None]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [None]:
# Drop "byline.person" column


In [None]:
# Delete duplicate rows and reset index


In [None]:
# Export data to CSV without the index
