# Part 1: Data Gathering
<b>Gather Movie Data via TMDB API.</b>
<br>
<br>
a. Set up the API
Create a free TMDB account
Generate an API key and review their documentation, especially:
* /discover/movie
* /movie/{movie_id}
* /search/movie

b. Collect top movies (2015-2024)
<br>For each year from 2015 to 2024: Query TMDB for the top 100 movies (by vote count).
<br>For each movie, gather:
* Title
* Release Year
* Genre(s)
* Vote Average
* Vote Count
* Budget
* Revenue
* TMDB ID
<br>Store all results in a single DataFrame and export to movies_2015_2024.csv.
<br>Hint: TMDB rate limits are generous for free accounts, but you should pause between requests (e.g., time.sleep(0.25)).
<br>Some Oscar films may not appear in the top 100 by vote count. For any missing, use the /search/movie endpoint to add it.

In [1]:
# IMPORT BUILT-IN LIBRARIES
import ast
import json
import re
import requests
import time

# IMPORT 3RD-PARTY LIBRARIES
import pandas as pd

In [33]:
# ROOT URL OF THE TMDB API; EVERY ENDPOINT PATH BELOW IS APPENDED TO IT
base_url = "https://api.themoviedb.org/3"

# ENDPOINT PATHS
search_endpoint = "/search/movie"
discover_endpoint = "/discover/movie"
genre_endpoint = "/genre/movie/list"
auth_endpoint = "/authentication"

# THE LITERAL TEXT "movie_id" IS A PLACEHOLDER TO BE REPLACED WITH A REAL MOVIE ID
details_endpoint = "/movie/movie_id"

In [3]:
# FETCH API KEY
# The file content is parsed with ast.literal_eval, so it must be a Python
# literal; the lookup below requires it to be a dict with an 'access_token' key.
# An explicit encoding keeps the read independent of the platform default.
with open("../config/api_key.txt", encoding="utf-8") as file:
    api_key = ast.literal_eval(file.read())

# SET HEADERS
headers = {
    "Authorization": f"Bearer {api_key['access_token']}",
    "accept": "application/json"
}

# GET AUTHENTICATION VALIDATION
# The timeout stops the cell from blocking forever if the server never answers.
auth_response = requests.get(url=base_url+auth_endpoint, headers=headers, timeout=30)

print(f"Auth response status code: {auth_response.status_code}")
print(f"Auth response text: {auth_response.text}")

Auth response status code: 200
Auth response text: {"success":true}


In [4]:
# CREATE A FUNCTION TO FETCH TOP 100 MOVIES BY YEAR
def get_top_100_movies_by_year(year: int, limit: int = 100) -> list:
    """Fetch the most-voted movies released in ``year`` from the discover endpoint.

    Result pages sorted by descending vote count are requested one after
    another until ``limit`` movies have been collected or the API has no
    further results to offer.

    Args:
        year: Primary release year; sent unchanged as the
            ``primary_release_year`` query parameter.
        limit: Maximum number of movies to return. Defaults to 100.

    Returns:
        A list of at most ``limit`` result dicts, in the order the API
        returned them. The list is shorter (possibly empty) when fewer
        movies are available; it is never ``None``.

    Raises:
        RuntimeError: If a discover request does not answer with HTTP 200.
    """
    print(f"YEAR: {year}")

    # INITIALIZE DISCOVER LIST
    discover_list = []
    page_number = 1

    # KEEP REQUESTING PAGES UNTIL ENOUGH MOVIES HAVE BEEN COLLECTED
    while len(discover_list) < limit:

        print(f"LIST LENGTH: {len(discover_list)}")
        print(f"PAGE: {page_number}")

        # SET DISCOVER PARAMS
        discover_params = {
            "sort_by": "vote_count.desc",
            "primary_release_year": year,
            "page": page_number
        }

        # SEND REQUEST TO DISCOVER ENDPOINT (TIMEOUT AVOIDS HANGING FOREVER)
        discover_response = requests.get(
            url=base_url+discover_endpoint,
            headers=headers,
            params=discover_params,
            timeout=30
        )

        # FAIL LOUDLY, WITH CONTEXT, ON ANY NON-200 ANSWER
        if discover_response.status_code != 200:
            raise RuntimeError(
                f"Discover request failed for year {year}, page {page_number}: "
                f"HTTP {discover_response.status_code} - {discover_response.text}"
            )

        # APPEND RESULTS TO MOVIE LIST
        discover_json = discover_response.json()
        page_results = discover_json.get("results", [])
        discover_list.extend(page_results)

        # WAIT BETWEEN REQUESTS (ALSO BEFORE THE CALLER'S NEXT REQUEST)
        time.sleep(0.25)

        # STOP WHEN THE API HAS NOTHING MORE TO GIVE, SO A SPARSE YEAR
        # RETURNS WHAT EXISTS INSTEAD OF LOOPING ON OR RETURNING NONE.
        # total_pages IS OPTIONAL HERE; AN EMPTY PAGE ALSO ENDS THE LOOP.
        total_pages = discover_json.get("total_pages")
        if not page_results:
            break
        if total_pages is not None and page_number >= total_pages:
            break

        page_number += 1

    print(f"LIST LENGTH: {len(discover_list)}")

    # TRIM IN CASE THE LAST PAGE PUSHED THE LIST PAST THE LIMIT
    return discover_list[:limit]

In [34]:
# GET TOP 100 MOVIES PER YEAR (2015-2024)

# START WITH AN EMPTY MAPPING OF YEAR -> LIST OF MOVIES
discover_dict = dict()

# FETCH ONE YEAR AT A TIME; range() EXCLUDES ITS UPPER BOUND, SO 2024 IS THE LAST YEAR
for year in range(2015, 2025):
    discover_dict.update({year: get_top_100_movies_by_year(year=year)})

YEAR: 2015
LIST LENGTH: 0
PAGE: 1
LIST LENGTH: 20
PAGE: 2
LIST LENGTH: 40
PAGE: 3
LIST LENGTH: 60
PAGE: 4
LIST LENGTH: 80
PAGE: 5
LIST LENGTH: 100
YEAR: 2016
LIST LENGTH: 0
PAGE: 1
LIST LENGTH: 20
PAGE: 2
LIST LENGTH: 40
PAGE: 3
LIST LENGTH: 60
PAGE: 4
LIST LENGTH: 80
PAGE: 5
LIST LENGTH: 100
YEAR: 2017
LIST LENGTH: 0
PAGE: 1
LIST LENGTH: 20
PAGE: 2
LIST LENGTH: 40
PAGE: 3
LIST LENGTH: 60
PAGE: 4
LIST LENGTH: 80
PAGE: 5
LIST LENGTH: 100
YEAR: 2018
LIST LENGTH: 0
PAGE: 1
LIST LENGTH: 20
PAGE: 2
LIST LENGTH: 40
PAGE: 3
LIST LENGTH: 60
PAGE: 4
LIST LENGTH: 80
PAGE: 5
LIST LENGTH: 100
YEAR: 2019
LIST LENGTH: 0
PAGE: 1
LIST LENGTH: 20
PAGE: 2
LIST LENGTH: 40
PAGE: 3
LIST LENGTH: 60
PAGE: 4
LIST LENGTH: 80
PAGE: 5
LIST LENGTH: 100
YEAR: 2020
LIST LENGTH: 0
PAGE: 1
LIST LENGTH: 20
PAGE: 2
LIST LENGTH: 40
PAGE: 3
LIST LENGTH: 60
PAGE: 4
LIST LENGTH: 80
PAGE: 5
LIST LENGTH: 100
YEAR: 2021
LIST LENGTH: 0
PAGE: 1
LIST LENGTH: 20
PAGE: 2
LIST LENGTH: 40
PAGE: 3
LIST LENGTH: 60
PAGE: 4
LIST LENGTH:

In [6]:
# BUILD ONE ROW PER (YEAR, LIST OF MOVIES) PAIR FROM THE DISCOVER DICT
movie_df = pd.DataFrame(data=discover_dict.items(), columns=['Year', 'Data'])

# SPREAD EACH LIST IN THE 'Data' COLUMN OVER ONE ROW PER ELEMENT
movie_df = movie_df.explode("Data")

# REPLACE THE REPEATED INDEX LEFT BY EXPLODE WITH A FRESH 0..N-1 INDEX
movie_df = movie_df.reset_index(drop=True)
movie_df.head(2)

Unnamed: 0,Year,Data
0,2015,"{'adult': False, 'backdrop_path': '/kIBK5SKwgq..."
1,2015,"{'adult': False, 'backdrop_path': '/gqrnQA6Xpp..."


In [7]:
# EXPAND THE 'Data' COLUMN INTO ITS OWN DATAFRAME, ONE ROW PER ENTRY
data_df = pd.DataFrame(data=list(movie_df['Data']))
data_df.head(2)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg,"[28, 12, 878]",99861,en,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,11.8411,/4ssDuvEDkSArWEdyBl2X5EHvYKU.jpg,2015-04-22,Avengers: Age of Ultron,False,7.271,23847
1,False,/gqrnQA6Xppdl8vIb2eJc58VC1tW.jpg,"[28, 12, 878]",76341,en,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,10.6392,/hA2ple9q4qnwxp3hKVNhroipsir.jpg,2015-05-13,Mad Max: Fury Road,False,7.627,23503


In [8]:
# JOIN THE EXPANDED DATA COLUMNS ONTO THE MOVIE ROWS BY MATCHING INDEX VALUES
movie_df = pd.merge(left=movie_df, right=data_df, left_index=True, right_index=True)
movie_df.head(2)

Unnamed: 0,Year,Data,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,2015,"{'adult': False, 'backdrop_path': '/kIBK5SKwgq...",False,/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg,"[28, 12, 878]",99861,en,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,11.8411,/4ssDuvEDkSArWEdyBl2X5EHvYKU.jpg,2015-04-22,Avengers: Age of Ultron,False,7.271,23847
1,2015,"{'adult': False, 'backdrop_path': '/gqrnQA6Xpp...",False,/gqrnQA6Xppdl8vIb2eJc58VC1tW.jpg,"[28, 12, 878]",76341,en,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,10.6392,/hA2ple9q4qnwxp3hKVNhroipsir.jpg,2015-05-13,Mad Max: Fury Road,False,7.627,23503


In [9]:
# GET GENRE NAMES
# The timeout stops the cell from blocking forever if the server never answers.
genre_response = requests.get(url=base_url+genre_endpoint, headers=headers, timeout=30)
print(f"Genre response status code: {genre_response.status_code}")

Genre response status code: 200


In [17]:
# DECODE THE GENRE RESPONSE BODY AS JSON
genre_json = genre_response.json()

# PULL OUT THE VALUE STORED UNDER "genres" (None IF THE KEY IS ABSENT)
genre_list = genre_json.get("genres")

In [18]:
# BUILD A LOOKUP FROM GENRE ID TO GENRE NAME OUT OF THE GENRE LIST
genre_dict = {genre.get("id"): genre.get("name") for genre in genre_list}

In [30]:
# ADD A 'genres' COLUMN: EACH ID IN 'genre_ids' IS LOOKED UP IN genre_dict (None WHEN UNKNOWN)
movie_df['genres'] = movie_df['genre_ids'].apply(func=lambda genre_ids: list(map(genre_dict.get, genre_ids)))
movie_df.head(2)

Unnamed: 0,Year,Data,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genres
0,2015,"{'adult': False, 'backdrop_path': '/kIBK5SKwgq...",False,/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg,"[28, 12, 878]",99861,en,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,11.8411,/4ssDuvEDkSArWEdyBl2X5EHvYKU.jpg,2015-04-22,Avengers: Age of Ultron,False,7.271,23847,"[Action, Adventure, Science Fiction]"
1,2015,"{'adult': False, 'backdrop_path': '/gqrnQA6Xpp...",False,/gqrnQA6Xppdl8vIb2eJc58VC1tW.jpg,"[28, 12, 878]",76341,en,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,10.6392,/hA2ple9q4qnwxp3hKVNhroipsir.jpg,2015-05-13,Mad Max: Fury Road,False,7.627,23503,"[Action, Adventure, Science Fiction]"


In [56]:
# CREATE A FUNCTION TO GET FINANCE INFO
def get_finance_info(movie_id: int) -> dict:
    """Fetch the budget and revenue of one movie from the details endpoint.

    Args:
        movie_id: ID substituted for the "movie_id" placeholder in the
            details endpoint path.

    Returns:
        A dict with the keys "id" (the ``movie_id`` passed in), "budget"
        and "revenue". The status code is not checked: when the decoded
        response has no "budget" or "revenue" key, that value is ``None``.
    """
    # BUILD THE URL FROM THE movie_id ARGUMENT, NOT FROM A NOTEBOOK-LEVEL
    # VARIABLE, SO THE FUNCTION WORKS NO MATTER HOW THE CALLER NAMES ITS ID
    detail_url = base_url + details_endpoint.replace("movie_id", str(movie_id))

    # THE TIMEOUT AVOIDS WAITING FOREVER ON AN UNRESPONSIVE SERVER
    detail_response = requests.get(url=detail_url, headers=headers, timeout=30)

    detail_json = detail_response.json()
    return {
        "id": movie_id,
        "budget": detail_json.get("budget"),
        "revenue": detail_json.get("revenue")
    }

In [57]:
# FETCH FINANCE INFO FOR EVERY MOVIE, RETRYING WHEN THE REMOTE HOST DROPS THE CONNECTION

# INITIALIZE INFO LIST
finance_info_list = []

# MAXIMUM NUMBER OF TRIES PER MOVIE BEFORE THE ERROR IS ALLOWED TO PROPAGATE
max_attempts = 5

# LOOP THROUGH MOVIE IDS
for _id in movie_df['id']:

    for attempt in range(1, max_attempts + 1):
        try:
            # CALL THE GET_FINANCE_INFO FUNCTION
            finance_info = get_finance_info(movie_id=_id)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # OUT OF TRIES: SURFACE THE ORIGINAL ERROR
            if attempt == max_attempts:
                raise
            # BACK OFF LONGER AFTER EACH FAILURE (2, 4, 8, ... SECONDS) BEFORE RETRYING
            time.sleep(2 ** attempt)
        else:
            # APPEND FINANCE INFO TO INFO LIST AND MOVE ON TO THE NEXT MOVIE
            finance_info_list.append(finance_info)
            break

    # WAIT BEFORE SENDING ANOTHER REQUEST
    time.sleep(0.25)

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [55]:
# TURN THE COLLECTED FINANCE DICTS INTO A DATAFRAME, ONE ROW PER DICT
finance_df = pd.DataFrame(finance_info_list)
finance_df.head(2)

NameError: name 'finance_info_list' is not defined

In [None]:
# title
# release_year
# genre_ids
# vote_average
# vote_count
# budget
# revenue
# id