In [1]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import time

2. Gather Movie Data via TMDB API  
    a. Set up the API    
    * Create a free [TMDB account](https://developer.themoviedb.org/docs/getting-started)  
    * Generate an API key are review their documentation, especially:  
        * /discover/movie  
        * /movie/{movie_id}  
        * /search/movie  
    b. Collect top movies (2015-2024)  
    For each year from 2015 to 2024:  
        * Query TMDB for the top 100 movies (by vote count).  
        * For each movie, gather:  
            * Title  
            * Release Year  
            * Genre(s)  
            * Vote Average  
            * Vote Count  
            * Budget  
            * Revenue  
            * TMDB ID  
        * Store all results in a single DataFrame and export to movies_2015_2024.csv.
        * Hint: TMDB rate limits are generous for free accounts, but you should pause between requests (eg. time.sleep(0.25)). 
        * Some Oscar films may not appear in the top 100 by vote count. For any missing, use the /search/movie endpoint to add it.  

In [2]:
import json
with open('../keys.json') as fi:
    credentials = json.load(fi)
api_key = credentials['api_key']

In [3]:
endpoint = 'https://api.themoviedb.org/3/discover/movie'

In [4]:
movie_data = pd.DataFrame()
pd.set_option('display.max_colwidth', None)

for release_year in range(2015,2025):
    yearly_movies = pd. DataFrame()
    for page in range(1,6):
        params = {
        'page' :page,
        'api_key' : api_key,
        'sort_by' : 'vote_count.desc',
        'release_date.gte' : f"{release_year}-01-01",
        'release_date.lte' : f"{release_year}-12-31"
        }
        api_response = requests.get(endpoint, params=params)

        data = api_response.json()
        data
        
        page_data= pd.json_normalize(data['results'])
        yearly_movies = pd.concat([page_data,yearly_movies], ignore_index=True)
        
        yearly_movies["release_year"] = pd.to_datetime(yearly_movies["release_date"], errors="coerce").dt.year
        yearly_movies["release_year"] = release_year
        movie_data= pd.concat([yearly_movies,movie_data], ignore_index=True)
        
        print(movie_data)

    adult                     backdrop_path                genre_ids      id  \
0   False  /5XNQBqnBwPA9yT0jZ0p3s8bbLh0.jpg            [12, 18, 878]  157336   
1   False  /9BBTo63ANSmhC4e6r62OJFuK2GL.jpg            [878, 28, 12]   24428   
2   False  /5TiwfWEaPSwD20uwXjCTUqpQX70.jpg                 [18, 53]     550   
3   False  /suaEOtk1N1sgg2MTM7oZd2cfVp3.jpg             [53, 80, 35]     680   
4   False  /v8xVDqt8uCul3c3mgx4VpGCwxJC.jpg                 [18, 80]     278   
5   False  /uLtVbjvS1O7gXL8lUOwsFOH4man.jpg            [28, 878, 12]  118340   
6   False  /67HggiWaP9ZLv5sPYmyRV37yAJM.jpg          [35, 18, 10749]      13   
7   False  /5Lbm0gpFDRAPIV1Cth6ln9iL1ou.jpg                 [18, 37]   68718   
8   False  /kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg            [28, 12, 878]   99861   
9   False  /y2DB71C4nyIdMrANijz8mzvQtk6.jpg         [28, 80, 18, 53]   49026   
10  False  /gqrnQA6Xppdl8vIb2eJc58VC1tW.jpg            [28, 12, 878]   76341   
11  False  /hwNtEmmugU5Yd7hpfprNWI0DGIn.

In [5]:
new_movie_data=movie_data[['title','genre_ids','vote_average','vote_count','release_year','id']]
new_movie_data.head()

Unnamed: 0,title,genre_ids,vote_average,vote_count,release_year,id
0,Dawn of the Planet of the Apes,"[878, 28, 18, 53]",7.335,11898,2024,119450
1,In Time,"[28, 53, 878]",6.971,11837,2024,49530
2,American Psycho,"[53, 18, 80]",7.409,11717,2024,1359
3,Despicable Me 2,"[16, 35, 10751]",6.937,11705,2024,93456
4,Aladdin,"[16, 10751, 12, 14, 10749]",7.654,11679,2024,812


In [6]:
movie_id = 1198426

endpoint1 = f'https://api.themoviedb.org/3/movie/{movie_id}'
params = {
        'page' :page,
        'api_key' : api_key,
        
        }
movie_response = requests.get(endpoint1,params)
movie_response.json()

{'adult': False,
 'backdrop_path': '/4damV6u8Za9p03SH9jvKr3TwHQC.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 16, 'name': 'Animation'}, {'id': 10751, 'name': 'Family'}],
 'homepage': '',
 'id': 1198426,
 'imdb_id': 'tt17423376',
 'origin_country': ['DO'],
 'original_language': 'es',
 'original_title': 'Capitán Avispa',
 'overview': "In Avispatropolis, the fearless Captain Avispa emerges as an exemplary protector of the city, whose courage and convictions always prevail over the forces of evil. His unbreakable power is nourished by absolute sincerity, as he would lose his strength if he dared to weave deceit. Only in pursuit of noble causes does he allow himself to deviate from the truth. As is common in stories of this kind, Captain Wasp is surrounded by a constellation of close friends and arch-enemies, whose stories have their origins in the artist's songs.",
 'popularity': 27.5096,
 'poster_path': '/zmthz3CuFljmBQcfuaz4hBNwbQ0.jpg',
 'production_companies':

In [12]:
all_movie_data = pd.DataFrame()
for movie_id in movie_data['id'].head():
    tmdb_endpoint = f'https://api.themoviedb.org/3/movie/{movie_id}' 
    params = {
        'api_key' : api_key,
        'sort_by' : 'vote_count.desc',
      
        }

    tmdb_response = requests.get(tmdb_endpoint,params)
    tmdb_data = tmdb_response.json()
    tmdb_df = pd.json_normalize(tmdb_data)
    all_movie_data = pd.concat([all_movie_data,tmdb_df], ignore_index=True)
    
    time.sleep(0.25)

second_movie_data_api_call = all_movie_data[['title','vote_average','vote_count','budget','revenue','imdb_id']]
second_movie_data_api_call.head()

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id
0,Dawn of the Planet of the Apes,7.335,11898,170000000,710644566,tt2103281
1,In Time,6.971,11837,40000000,173900000,tt1637688
2,American Psycho,7.409,11716,7000000,34300000,tt0144084
3,Despicable Me 2,6.937,11705,76000000,970766005,tt1690953
4,Aladdin,7.654,11679,28000000,504050219,tt0103639


**Optional Extension: Actors and Actresses** 

1. Scrape Wikipedia for Best Actor and Best Actress Data
    * Scrape the following Wikipedia pages:  
        * [Best Actor](https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor)
        * [Best Actress](https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress)
    * Each apge contains tables of winners and nominees by year.
    * Extract the following columns:  
        * Year
        * Actor/Actress Name
        * Film Title
        * Winner (Yes/No)
    * Data cleaning tips:  
        * Remove footnote markers from names and movie titles.
        * Ensure that you save just the release year (eg. 2009 instead of 2009 (82nd))
        * Store the cleaned data as two csv files:  
            * best_actor.csv
            * best_actress.csv  

2. Collect Actor and Actress Filmographies  
    Using the data from your actor and actresses CSVs:  
    * Search TMDB for each recent performer (using /search/person). Note: you can start with 2015-2024 initially, but, if time allows, you can go back even further.
    * For each person, retrieve their movie credits using /person/{person_id}/movie_credits.  
    * Extract relevant fields for each movie, such as:  
        * Actor/Actress Name  
        * Movie Title  
        * Character Name (optional)  
        * Release Year  
        * Movie ID
    * Combine all filmographies into one file, actor_filmography.csv