### Import Required Libraries and Set Up Environment Variables

In [1]:
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [2]:
# Set environment variables from the .env in the local environment
load_dotenv()

NYT_API_KEY= os.getenv('NYT_API_KEY')
TMDB_API_KEY= os.getenv('TMDB_API_KEY')



### Access the New York Times API

In [3]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "(headline,web_url,snippet,source,keywords,pub_date,byline,word_count)"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
# Build parameters dictionary
params = {
    'fq': filter_query,
    'sort': sort,
    'field-name': field_list,
    'begin_date': begin_date,
    'end_date': end_date,
    'api-key': NYT_API_KEY  
}

# Make the request
response = requests.get(url, params=params)
print(response.json())


{'status': 'OK', 'copyright': 'Copyright (c) 2024 The New York Times Company. All Rights Reserved.', 'response': {'docs': [{'abstract': 'A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.', 'web_url': 'https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html', 'snippet': 'A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.', 'lead_paragraph': 'A trashy treat coated in a high-art gloss, “The Attachment Diaries” gleefully kneads melodrama, noir, horror and sexual perversion into a pathological romance between two deeply damaged women.', 'print_section': 'C', 'print_page': '7', 'source': 'The New York Times', 'multimedia': [{'rank': 0, 'subtype': 'xlarge', 'caption': None, 'credit': None, 'type': 'image', 'url': 'images/2023/05/26/multimedia/attachment1-mbcw/attachment1-mbcw-articleLarge.jpg', 'height': 296, 'width': 600, 'legacy': {'xlarge': 'imag

In [4]:
# Create an empty list to store the reviews
reviews_list = []

# loop through pages 0-19
for page in range(20):
    # create query with a page number
    # API results show 10 articles at a time
    params['page'] = page
    response = requests.get(url, params=params)
    
    # Make a "GET" request and retrieve the JSON
    data = response.json()
    
    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)
    
    # Try and save the reviews to the reviews_list
    reviews_list.extend(data['response']['docs'])
        # loop through the reviews["response"]["docs"] and append each review to the list
    for review in data['response']['docs']:
        reviews_list.append(review)
        # Print the page that was just retrieved
    print(f"Page {page + 1} retrieved successfully.")

        # Print the page number that had no results then break from the loop
    if not data['response']['docs']:
        print(f"No results found on page {page + 1}. Exiting loop.")
        break    

Page 1 retrieved successfully.
Page 2 retrieved successfully.
Page 3 retrieved successfully.
Page 4 retrieved successfully.
Page 5 retrieved successfully.
Page 6 retrieved successfully.
Page 7 retrieved successfully.
Page 8 retrieved successfully.
Page 9 retrieved successfully.
Page 10 retrieved successfully.
Page 11 retrieved successfully.
Page 12 retrieved successfully.
Page 13 retrieved successfully.
Page 14 retrieved successfully.
Page 15 retrieved successfully.
Page 16 retrieved successfully.
Page 17 retrieved successfully.
Page 18 retrieved successfully.
Page 19 retrieved successfully.
Page 20 retrieved successfully.


In [11]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(reviews_list[:5], indent=4))

[
    {
        "abstract": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
        "web_url": "https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html",
        "snippet": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
        "lead_paragraph": "A trashy treat coated in a high-art gloss, \u201cThe Attachment Diaries\u201d gleefully kneads melodrama, noir, horror and sexual perversion into a pathological romance between two deeply damaged women.",
        "print_section": "C",
        "print_page": "7",
        "source": "The New York Times",
        "multimedia": [
            {
                "rank": 0,
                "subtype": "xlarge",
                "caption": null,
                "credit": null,
                "type": "image",
                "url": "images/2023/05/26/multimedia/attachment1-mbcw/attachment1-mbcw-articl

In [12]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
from pandas import json_normalize

df_reviews = json_normalize(reviews_list) 
#df_reviews = pd.DataFrame(reviews_list)
df_reviews.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,keywords,pub_date,...,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-25T11:00:03+0000,...,"‘The Attachment Diaries’ Review: Love, Sick",,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
1,Two childhood friends navigate cultural differ...,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,A glossy lesson in how to pour nontraditional ...,C,8,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T17:16:45+0000,...,Review: ‘What’s Love Got to Do With It?’ Proba...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
2,Religion comes between two girls falling in lo...,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,"In “You Can Live Forever,” Jaime and Marike do...",C,9,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T11:00:08+0000,...,‘You Can Live Forever’ Review: Do You Love Me ...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",
3,Rachael Leigh Cook stars in this bland rom-com...,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The first thing we learn about Amanda (Rachael...,C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-21T07:03:25+0000,...,‘A Tourist’s Guide to Love’ Review: A Wearying...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",
4,A radiant Virginie Efira stars as a Parisian t...,https://www.nytimes.com/2023/04/20/movies/othe...,A radiant Virginie Efira stars as a Parisian t...,When a woman falls in love in the sensitive Fr...,C,4,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-20T15:35:13+0000,...,‘Other People’s Children’ Review: True Romance,Critic’s pick,,Intoxicating Love With a Sobering Turn,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",


In [13]:
# Extract the title from the "headline.main" column and

# save it to a new column "title"

# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early

# Define a function to extract the title
def extract_title(text):
    start_index = text.find('\u2018')
    end_index = text.find('\u2019')
    if start_index != -1 and end_index != -1:
        title = text[start_index + 1:end_index]
        if not title.endswith(" Review"):
            title += " Review"
        return title

# Extract the title and save it to a new column "title" in df_reviews
df_reviews['title'] = df_reviews['headline.main'].apply(extract_title)

# Display the DataFrame to verify the changes
df_reviews.head()


Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,keywords,pub_date,...,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-25T11:00:03+0000,...,,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review
1,Two childhood friends navigate cultural differ...,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,A glossy lesson in how to pour nontraditional ...,C,8,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T17:16:45+0000,...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,What Review
2,Religion comes between two girls falling in lo...,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,"In “You Can Live Forever,” Jaime and Marike do...",C,9,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T11:00:08+0000,...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,You Can Live Forever Review
3,Rachael Leigh Cook stars in this bland rom-com...,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The first thing we learn about Amanda (Rachael...,C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-21T07:03:25+0000,...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,A Tourist Review
4,A radiant Virginie Efira stars as a Parisian t...,https://www.nytimes.com/2023/04/20/movies/othe...,A radiant Virginie Efira stars as a Parisian t...,When a woman falls in love in the sensitive Fr...,C,4,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-20T15:35:13+0000,...,Critic’s pick,,Intoxicating Love With a Sobering Turn,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",,Other People Review


In [14]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']}; " 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords.strip()

# Fix the "keywords" column by converting cells from a list to a string
df_reviews['keywords'] = df_reviews['keywords'].apply(extract_keywords)

# Display the DataFrame to verify the changes
df_reviews.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,keywords,pub_date,...,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review
1,Two childhood friends navigate cultural differ...,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,A glossy lesson in how to pour nontraditional ...,C,8,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","subject: Movies; persons: Kapur, Shekhar; pers...",2023-05-04T17:16:45+0000,...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,What Review
2,Religion comes between two girls falling in lo...,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,"In “You Can Live Forever,” Jaime and Marike do...",C,9,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: You Can Live ...,2023-05-04T11:00:08+0000,...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,You Can Live Forever Review
3,Rachael Leigh Cook stars in this bland rom-com...,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The first thing we learn about Amanda (Rachael...,C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: A Tourist's G...,2023-04-21T07:03:25+0000,...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,A Tourist Review
4,A radiant Virginie Efira stars as a Parisian t...,https://www.nytimes.com/2023/04/20/movies/othe...,A radiant Virginie Efira stars as a Parisian t...,When a woman falls in love in the sensitive Fr...,C,4,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","subject: Movies; persons: Zlotowski, Rebecca; ...",2023-04-20T15:35:13+0000,...,Critic’s pick,,Intoxicating Love With a Sobering Turn,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",,Other People Review


In [15]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
titles_list = df_reviews['title'].to_list()

# Display the list to verify the extracted titles
print(titles_list)

['The Attachment Diaries Review', 'What Review', 'You Can Live Forever Review', 'A Tourist Review', 'Other People Review', 'One True Loves Review', 'The Lost Weekend: A Love Story Review', 'A Thousand and One Review', 'Your Place or Mine Review', 'Love in the Time of Fentanyl Review', 'The Attachment Diaries Review', 'What Review', 'You Can Live Forever Review', 'A Tourist Review', 'Other People Review', 'One True Loves Review', 'The Lost Weekend: A Love Story Review', 'A Thousand and One Review', 'Your Place or Mine Review', 'Love in the Time of Fentanyl Review', 'Pamela, a Love Story Review', 'In From the Side Review', 'After Love Review', 'Alcarràs Review', 'Nelly & Nadine Review', 'Lady Chatterley Review', 'The Sound of Christmas Review', 'The Inspection Review', 'Bones and All Review', 'My Policeman Review', 'Pamela, a Love Story Review', 'In From the Side Review', 'After Love Review', 'Alcarràs Review', 'Nelly & Nadine Review', 'Lady Chatterley Review', 'The Sound of Christmas Re

### Access The Movie Database API

In [17]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + TMDB_API_KEY

In [19]:

# Create an empty list to store the results
tmdb_movies_list = []

# Create a request counter to sleep the requests after a multiple of 50 requests
request_counter = 0

# Loop through the titles
for title in titles_list:
    # Check if we need to sleep before making a request
    if request_counter > 0 and request_counter % 50 == 0:
        time.sleep(12)  # Sleep for 12 seconds after every 50 requests

    # Add 1 to the request counter
    request_counter += 1
    
    # Remove " Review" from the title to improve search results
    search_title = title.replace(" Review", "")
    
    # Perform a "GET" request for The Movie Database
    response = requests.get(f"{url}{search_title}{tmdb_key_string}")
    
    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie is not found.
    try:
        results = response.json().get('results', [])
        if results:
            # Get movie id
            movie_id = results[0]['id']

            # Make a request for the full movie details
            details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={TMDB_API_KEY}"
            
            # Execute "GET" request with url
            details_response = requests.get(details_url)
            movie_details = details_response.json()
            
            # Extract the genre names into a list
            genres = [genre['name'] for genre in movie_details['genres']]
            
            # Extract the spoken_languages' English name into a list
            spoken_languages = [language['english_name'] for language in movie_details['spoken_languages']]
            
            # Extract the production_countries' name into a list
            production_countries = [country['name'] for country in movie_details['production_countries']]
            
            # Add the relevant data to a dictionary and append it to the tmdb_movies_list list
            tmdb_movies_list.append({
                'title': title,
                'genres': genres,
                'spoken_languages': spoken_languages,
                'production_countries': production_countries,
                'release_date': movie_details.get('release_date', 'N/A'),
                'runtime': movie_details.get('runtime', 'N/A'),
                'vote_average': movie_details.get('vote_average', 'N/A')
            })
            
            # Print out the title that was found
            print(f"Found details for: {title}")
        else:
            print(f"Movie not found: {title}")

    except (IndexError, KeyError):
        print(f"Movie not found: {title}")

Found details for: The Attachment Diaries Review
Found details for: What Review
Found details for: You Can Live Forever Review
Found details for: A Tourist Review
Found details for: Other People Review
Found details for: One True Loves Review
Found details for: The Lost Weekend: A Love Story Review
Found details for: A Thousand and One Review
Found details for: Your Place or Mine Review
Found details for: Love in the Time of Fentanyl Review
Found details for: The Attachment Diaries Review
Found details for: What Review
Found details for: You Can Live Forever Review
Found details for: A Tourist Review
Found details for: Other People Review
Found details for: One True Loves Review
Found details for: The Lost Weekend: A Love Story Review
Found details for: A Thousand and One Review
Found details for: Your Place or Mine Review
Found details for: Love in the Time of Fentanyl Review
Found details for: Pamela, a Love Story Review
Found details for: In From the Side Review
Found details for: A

AttributeError: 'NoneType' object has no attribute 'replace'

In [20]:
# Preview the first 5 results in JSON format
formatted_results = json.dumps(tmdb_movies_list[:5], indent=4)

# Print the formatted JSON results
print(formatted_results)

[
    {
        "title": "The Attachment Diaries Review",
        "genres": [
            "Drama",
            "Mystery",
            "Thriller",
            "Horror"
        ],
        "spoken_languages": [
            "Spanish"
        ],
        "production_countries": [
            "Argentina"
        ],
        "release_date": "2021-10-07",
        "runtime": 102,
        "vote_average": 3.0
    },
    {
        "title": "What Review",
        "genres": [
            "Comedy"
        ],
        "spoken_languages": [],
        "production_countries": [
            "United States of America"
        ],
        "release_date": "2022-04-08",
        "runtime": 100,
        "vote_average": 5.9
    },
    {
        "title": "You Can Live Forever Review",
        "genres": [
            "Drama",
            "Romance"
        ],
        "spoken_languages": [
            "English",
            "French"
        ],
        "production_countries": [
            "Canada",
            "United S

In [24]:
# Convert the results to a DataFrame
df_tmdb_movies = pd.DataFrame(tmdb_movies_list)

# Display the first few rows of the DataFrame to verify the conversion
df_tmdb_movies

Unnamed: 0,title,genres,spoken_languages,production_countries,release_date,runtime,vote_average
0,The Attachment Diaries Review,"[Drama, Mystery, Thriller, Horror]",[Spanish],[Argentina],2021-10-07,102,3.000
1,What Review,[Comedy],[],[United States of America],2022-04-08,100,5.900
2,You Can Live Forever Review,"[Drama, Romance]","[English, French]","[Canada, United States of America]",2023-03-24,96,6.550
3,A Tourist Review,"[Romance, Comedy]","[English, Vietnamese]",[United States of America],2023-04-21,96,6.303
4,Other People Review,"[Comedy, Drama]",[English],[United States of America],2016-09-09,97,6.312
...,...,...,...,...,...,...,...
355,I Do ... Until I Don Review,[Comedy],[English],[United States of America],2017-09-01,103,4.985
356,Tales of an Immoral Couple Review,"[Comedy, Romance]",[Spanish],[Mexico],2016-10-28,91,6.500
357,After Love Review,[Drama],"[English, Arabic, French, Urdu]",[United Kingdom],2021-06-04,89,7.164
358,"Women Who Kill, Review","[Comedy, Crime, Mystery, Romance, Thriller]",[English],[United States of America],2016-07-27,93,4.900


### Merge and Clean the Data for Export

In [26]:
# Merge the New York Times reviews DataFrame (df_reviews) and TMDB DataFrame (df_tmdb_movies) on the 'title' column
merged_df = pd.merge(df_reviews, df_tmdb_movies, on='title', how='inner')

# Display the first few rows of the merged DataFrame to verify the merge
merged_df.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,keywords,pub_date,...,byline.original,byline.person,byline.organization,title,genres,spoken_languages,production_countries,release_date,runtime,vote_average
0,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review,"[Drama, Mystery, Thriller, Horror]",[Spanish],[Argentina],2021-10-07,102,3.0
1,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review,"[Drama, Mystery, Thriller, Horror]",[Spanish],[Argentina],2021-10-07,102,3.0
2,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review,"[Drama, Mystery, Thriller, Horror]",[Spanish],[Argentina],2021-10-07,102,3.0
3,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review,"[Drama, Mystery, Thriller, Horror]",[Spanish],[Argentina],2021-10-07,102,3.0
4,Two childhood friends navigate cultural differ...,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,A glossy lesson in how to pour nontraditional ...,C,8,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","subject: Movies; persons: Kapur, Shekhar; pers...",2023-05-04T17:16:45+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,What Review,[Comedy],[],[United States of America],2022-04-08,100,5.9


In [27]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing (that contain lists)
columns_to_fix = ['genres', 'spoken_languages', 'production_countries']

# Create a list of characters to remove
chars_to_remove = ["[", "]", "'"]

# Loop through the list of columns to fix
for column in columns_to_fix:
    # Convert the column to type 'str' if it's not already a string
    merged_df[column] = merged_df[column].astype(str)
    
    # Loop through characters to remove
    for char in chars_to_remove:
        merged_df[column] = merged_df[column].str.replace(char, "")
        
# Display the fixed DataFrame
merged_df.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,keywords,pub_date,...,byline.original,byline.person,byline.organization,title,genres,spoken_languages,production_countries,release_date,runtime,vote_average
0,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
1,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
2,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
3,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
4,Two childhood friends navigate cultural differ...,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,A glossy lesson in how to pour nontraditional ...,C,8,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","subject: Movies; persons: Kapur, Shekhar; pers...",2023-05-04T17:16:45+0000,...,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,What Review,Comedy,,United States of America,2022-04-08,100,5.9


In [28]:
# Drop the "byline.person" column
merged_df = merged_df.drop(columns=['byline.person'])

# Display the first few rows of the DataFrame to verify the column is dropped
merged_df.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,keywords,pub_date,...,headline.sub,byline.original,byline.organization,title,genres,spoken_languages,production_countries,release_date,runtime,vote_average
0,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,,By Jeannette Catsoulis,,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
1,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,,By Jeannette Catsoulis,,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
2,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,,By Jeannette Catsoulis,,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
3,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,,By Jeannette Catsoulis,,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
4,Two childhood friends navigate cultural differ...,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,A glossy lesson in how to pour nontraditional ...,C,8,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","subject: Movies; persons: Kapur, Shekhar; pers...",2023-05-04T17:16:45+0000,...,,By Jeannette Catsoulis,,What Review,Comedy,,United States of America,2022-04-08,100,5.9


In [41]:
#Print unique data types in each column
for col in merged_df.columns:
    print(f"Column '{col}' unique types:", merged_df[col].map(type).unique())

Column 'abstract' unique types: [<class 'str'>]
Column 'web_url' unique types: [<class 'str'>]
Column 'snippet' unique types: [<class 'str'>]
Column 'lead_paragraph' unique types: [<class 'str'>]
Column 'print_section' unique types: [<class 'str'> <class 'float'>]
Column 'print_page' unique types: [<class 'str'> <class 'float'>]
Column 'source' unique types: [<class 'str'>]
Column 'multimedia' unique types: [<class 'tuple'>]
Column 'keywords' unique types: [<class 'str'>]
Column 'pub_date' unique types: [<class 'str'>]
Column 'document_type' unique types: [<class 'str'>]
Column 'news_desk' unique types: [<class 'str'>]
Column 'section_name' unique types: [<class 'str'>]
Column 'type_of_material' unique types: [<class 'str'>]
Column '_id' unique types: [<class 'str'>]
Column 'word_count' unique types: [<class 'int'>]
Column 'uri' unique types: [<class 'str'>]
Column 'headline.main' unique types: [<class 'str'>]
Column 'headline.kicker' unique types: [<class 'NoneType'> <class 'str'>]
Co

In [44]:
# Convert mixed-type columns to strings
columns_to_convert = ['print_section', 'print_page', 'headline.kicker', 'headline.print_headline', 
                      'byline.organization', 'headline.content_kicker', 'headline.name', 
                      'headline.seo', 'headline.sub']

for col in columns_to_convert:
    merged_df[col] = merged_df[col].astype(str)

# Convert `NoneType` to empty strings
merged_df = merged_df.fillna('')

# Convert tuples to strings in 'multimedia' column
merged_df['multimedia'] = merged_df['multimedia'].apply(str)

# Delete duplicate rows
merged_df = merged_df.drop_duplicates()

# Reset the index
merged_df = merged_df.reset_index(drop=True)

# Display the first few rows of the DataFrame to verify the changes
merged_df.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,keywords,pub_date,...,headline.sub,byline.original,byline.organization,title,genres,spoken_languages,production_countries,release_date,runtime,vote_average
0,A gynecologist and her patient form a horrifyi...,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,"A trashy treat coated in a high-art gloss, “Th...",C,7,The New York Times,"({'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: The Attachmen...,2023-05-25T11:00:03+0000,...,,By Jeannette Catsoulis,,The Attachment Diaries Review,"Drama, Mystery, Thriller, Horror",Spanish,Argentina,2021-10-07,102,3.0
1,Two childhood friends navigate cultural differ...,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,A glossy lesson in how to pour nontraditional ...,C,8,The New York Times,"({'rank': 0, 'subtype': 'xlarge', 'caption': N...","subject: Movies; persons: Kapur, Shekhar; pers...",2023-05-04T17:16:45+0000,...,,By Jeannette Catsoulis,,What Review,Comedy,,United States of America,2022-04-08,100,5.9
2,Religion comes between two girls falling in lo...,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,"In “You Can Live Forever,” Jaime and Marike do...",C,9,The New York Times,"({'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: You Can Live ...,2023-05-04T11:00:08+0000,...,,By Elisabeth Vincentelli,,You Can Live Forever Review,"Drama, Romance","English, French","Canada, United States of America",2023-03-24,96,6.55
3,Rachael Leigh Cook stars in this bland rom-com...,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The first thing we learn about Amanda (Rachael...,C,7,The New York Times,"({'rank': 0, 'subtype': 'xlarge', 'caption': N...",subject: Movies; creative_works: A Tourist's G...,2023-04-21T07:03:25+0000,...,,By Elisabeth Vincentelli,,A Tourist Review,"Romance, Comedy","English, Vietnamese",United States of America,2023-04-21,96,6.303
4,A radiant Virginie Efira stars as a Parisian t...,https://www.nytimes.com/2023/04/20/movies/othe...,A radiant Virginie Efira stars as a Parisian t...,When a woman falls in love in the sensitive Fr...,C,4,The New York Times,"({'rank': 0, 'subtype': 'xlarge', 'caption': N...","subject: Movies; persons: Zlotowski, Rebecca; ...",2023-04-20T15:35:13+0000,...,,By Manohla Dargis,,Other People Review,"Comedy, Drama",English,United States of America,2016-09-09,97,6.312


In [45]:
# Export data to CSV without the index
merged_df.to_csv('cleaned_data.csv', index=False)

print("Data successfully exported to 'cleaned_data.csv'")

Data successfully exported to 'cleaned_data.csv'
