# Part 1: Data Gathering
<b>Gather Movie Data via TMDB API.</b>
<br>
<br>
a. Set up the API
Create a free TMDB account
Generate an API key and review their documentation, especially:
* /discover/movie
* /movie/{movie_id}
* /search/movie

b. Collect top movies (2015-2024)
<br>For each year from 2015 to 2024: Query TMDB for the top 100 movies (by vote count).
<br>For each movie, gather:
* Title
* Release Year
* Genre(s)
* Vote Average
* Vote Count
* Budget
* Revenue
* TMDB ID
<br>Store all results in a single DataFrame and export to movies_2015_2024.csv.
<br>Hint: TMDB rate limits are generous for free accounts, but you should pause between requests (e.g., time.sleep(0.25)).
<br>Some Oscar films may not appear in the top 100 by vote count. For any missing, use the /search/movie endpoint to add it.

In [14]:
# IMPORT BUILT-IN LIBRARIES
import ast
import json
import re
import requests
import time

# IMPORT 3RD-PARTY LIBRARIES
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from IPython.core.display import HTML

In [2]:
# SET BASE URL
base_url = "https://api.themoviedb.org/3"

# SET ENDPOINTS (PATHS ARE APPENDED TO THE BASE URL)
auth_endpoint = "/authentication"
discover_endpoint = "/discover/movie"
movie_endpoint = "/movie" # {movie_id}
search_endpoint = "/search/movie"

# FETCH API KEY
# THE FILE MUST CONTAIN A PYTHON LITERAL THAT EVALUATES TO A MAPPING WITH AN
# 'access_token' KEY; literal_eval ONLY PARSES LITERALS, IT NEVER EXECUTES CODE
with open("../config/api_key.txt", encoding="utf-8") as file:
    api_key = ast.literal_eval(file.read())

# SET HEADERS
headers = {
    "Authorization": f"Bearer {api_key['access_token']}",
    "accept": "application/json"
}

# SEND REQUEST TO VALIDATE AUTHENTICATION
# THE TIMEOUT KEEPS THE CELL FROM HANGING FOREVER IF THE SERVER NEVER ANSWERS
response = requests.get(url=base_url+auth_endpoint, headers=headers, timeout=30)

print(response.text)

{"success":true}


In [19]:
# INITIALIZE MOVIE DICT (YEAR -> LIST OF RAW MOVIE RECORDS FROM THE DISCOVER ENDPOINT)
movie_dict = {}

# NUMBER OF MOVIES TO KEEP PER YEAR AND UPPER BOUND ON PAGES REQUESTED PER YEAR
movies_per_year = 100
max_pages = 9

# LOOP THE DESIRED RANGE OF YEARS
for year in range(2015, 2025):
    print(f"YEAR: {year}")

    # INITIALIZE MOVIE LIST
    movie_list = []

    # ITERATE THE PAGES TO GET 100 MOVIES PER YEAR
    for page_number in range(1, max_pages + 1):

        print(f"LIST LENGTH: {len(movie_list)}")

        # STOP PAGING ONCE ENOUGH MOVIES HAVE BEEN COLLECTED
        if len(movie_list) >= movies_per_year:
            break

        print(f"PAGE: {page_number}")

        # SET DISCOVER PARAMS
        params = {
            "sort_by": "vote_count.desc",
            "primary_release_year": year,
            "page": page_number
        }

        # SEND REQUEST TO DISCOVER ENDPOINT
        # THE TIMEOUT KEEPS A STALLED CONNECTION FROM BLOCKING THE WHOLE RUN
        response = requests.get(
            url=base_url+discover_endpoint,
            headers=headers,
            params=params,
            timeout=30,
        )

        # IF FAILED REQUEST, RAISE AN EXCEPTION THAT SAYS WHAT WENT WRONG
        if response.status_code != 200:
            raise RuntimeError(
                f"Discover request failed for year {year}, page {page_number}: "
                f"HTTP {response.status_code} - {response.text}"
            )

        # APPEND RESULTS TO MOVIE LIST
        page_results = response.json().get("results", [])
        if not page_results:
            # AN EMPTY PAGE MEANS THERE IS NOTHING MORE TO FETCH FOR THIS YEAR
            break
        movie_list.extend(page_results)

        # WAIT BETWEEN REQUESTS
        time.sleep(1)

    # STORE THE YEAR EVEN WHEN FEWER THAN 100 MOVIES WERE FOUND, AND NEVER
    # KEEP MORE THAN THE REQUESTED NUMBER
    movie_dict[year] = movie_list[:movies_per_year]

YEAR: 2015
PAGE: 1
LIST LENGTH: 0
PAGE: 2
LIST LENGTH: 20
PAGE: 3
LIST LENGTH: 40
PAGE: 4
LIST LENGTH: 60
PAGE: 5
LIST LENGTH: 80
PAGE: 6
LIST LENGTH: 100
YEAR: 2016
PAGE: 1
LIST LENGTH: 0
PAGE: 2
LIST LENGTH: 20
PAGE: 3
LIST LENGTH: 40
PAGE: 4
LIST LENGTH: 60
PAGE: 5
LIST LENGTH: 80
PAGE: 6
LIST LENGTH: 100
YEAR: 2017
PAGE: 1
LIST LENGTH: 0
PAGE: 2
LIST LENGTH: 20
PAGE: 3
LIST LENGTH: 40
PAGE: 4
LIST LENGTH: 60
PAGE: 5
LIST LENGTH: 80
PAGE: 6
LIST LENGTH: 100
YEAR: 2018
PAGE: 1
LIST LENGTH: 0
PAGE: 2
LIST LENGTH: 20
PAGE: 3
LIST LENGTH: 40
PAGE: 4
LIST LENGTH: 60
PAGE: 5
LIST LENGTH: 80
PAGE: 6
LIST LENGTH: 100
YEAR: 2019
PAGE: 1
LIST LENGTH: 0
PAGE: 2
LIST LENGTH: 20
PAGE: 3
LIST LENGTH: 40
PAGE: 4
LIST LENGTH: 60
PAGE: 5
LIST LENGTH: 80
PAGE: 6
LIST LENGTH: 100
YEAR: 2020
PAGE: 1
LIST LENGTH: 0
PAGE: 2
LIST LENGTH: 20
PAGE: 3
LIST LENGTH: 40
PAGE: 4
LIST LENGTH: 60
PAGE: 5
LIST LENGTH: 80
PAGE: 6
LIST LENGTH: 100
YEAR: 2021
PAGE: 1
LIST LENGTH: 0
PAGE: 2
LIST LENGTH: 20
PAGE: 3
LIST

In [39]:
# CREATE A DATAFRAME FROM THE DICT
# START WITH ONE ROW PER YEAR: THE YEAR AND ITS LIST OF MOVIE RECORDS
movie_df = pd.DataFrame(data=movie_dict.items(), columns=['Year', 'Data'])

# EXPAND EACH LIST SO EVERY MOVIE GETS ITS OWN ROW, RENUMBERING ROWS FROM 0
movie_df = movie_df.explode("Data", ignore_index=True)

In [41]:
# SHOW THE RAW RECORD STORED IN THE 'Data' COLUMN AT ROW LABEL 0
movie_df.loc[0, 'Data']

{'adult': False,
 'backdrop_path': '/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg',
 'genre_ids': [28, 12, 878],
 'id': 99861,
 'original_language': 'en',
 'original_title': 'Avengers: Age of Ultron',
 'overview': 'When Tony Stark tries to jumpstart a dormant peacekeeping program, things go awry and Earth’s Mightiest Heroes are put to the ultimate test as the fate of the planet hangs in the balance. As the villainous Ultron emerges, it is up to The Avengers to stop him from enacting his terrible plans, and soon uneasy alliances and unexpected action pave the way for an epic and unique global adventure.',
 'popularity': 12.3484,
 'poster_path': '/4ssDuvEDkSArWEdyBl2X5EHvYKU.jpg',
 'release_date': '2015-04-22',
 'title': 'Avengers: Age of Ultron',
 'video': False,
 'vote_average': 7.271,
 'vote_count': 23845}

In [42]:
def populate_column(column: str, df: "pd.DataFrame | None" = None):
    """Fill `column` from the raw records held in the 'Data' column.

    The lower-cased column name is used as the lookup key in each record
    (e.g. "Vote_Average" reads the "vote_average" key). Records that lack
    the key, or rows whose 'Data' value is not a dict, get an empty string.

    Args:
        column: Name of the column to create or overwrite.
        df: DataFrame to update in place. Defaults to the module-level
            `movie_df` when omitted.

    Returns:
        None. The DataFrame is modified in place.
    """
    target_df = movie_df if df is None else df
    lookup_key = column.lower()

    # BUILD THE WHOLE COLUMN IN ONE PASS INSTEAD OF WRITING CELL BY CELL;
    # THIS IS INDEPENDENT OF THE INDEX LABELS AND ALSO WORKS FOR LIST VALUES
    # (E.G. genre_ids), WHICH CANNOT BE ASSIGNED TO A SINGLE CELL VIA .loc
    values = [
        movie_info.get(lookup_key, "") if isinstance(movie_info, dict) else ""
        for movie_info in target_df['Data']
    ]

    # dtype=object KEEPS THE VALUES EXACTLY AS THEY APPEAR IN THE RECORDS
    # (NUMBERS, STRINGS, LISTS) WITHOUT ANY COERCION
    target_df[column] = pd.Series(values, index=target_df.index, dtype=object)

In [46]:
# FILL THE COLUMNS THAT CAN BE READ STRAIGHT FROM THE RAW RECORDS
# STILL PENDING: Genre_IDs (FETCH GENRES), BUDGET AND REVENUE
for column_name in ("Title", "Vote_Average", "Vote_Count", "ID"):
    populate_column(column=column_name)

# DISPLAY THE RESULTING DATAFRAME
movie_df

Unnamed: 0,Year,Data,Title,Genre_IDs,Vote_Average,Vote_Count,ID
0,2015,"{'adult': False, 'backdrop_path': '/kIBK5SKwgq...",Avengers: Age of Ultron,,7.271,23845,99861
1,2015,"{'adult': False, 'backdrop_path': '/gqrnQA6Xpp...",Mad Max: Fury Road,,7.6,23502,76341
2,2015,"{'adult': False, 'backdrop_path': '/jJKZaTBNen...",Inside Out,,7.91,22914,150540
3,2015,"{'adult': False, 'backdrop_path': '/dF6FjTZzRT...",Jurassic World,,6.699,21093,135397
4,2015,"{'adult': False, 'backdrop_path': '/9pubUbDX3e...",The Martian,,7.69,20576,286217
...,...,...,...,...,...,...,...
995,2024,"{'adult': False, 'backdrop_path': '/ifRqavcREH...",Miller's Girl,,6.348,880,1026436
996,2024,"{'adult': False, 'backdrop_path': '/x0pkoGlwWd...",I'm Still Here,,7.951,859,1000837
997,2024,"{'adult': False, 'backdrop_path': '/oMiKHO3H5R...",Love Lies Bleeding,,6.572,856,948549
998,2024,"{'adult': False, 'backdrop_path': '/jw4SNkaSgb...",Arthur the King,,7.577,855,618588


In [32]:
# ADD REQUIRED COLUMNS
movie_df['Title'] = ""
movie_df['Genres'] = ""
movie_df['Vote_Average'] = 0
movie_df['Vote_Count'] = 0
movie_df['Budget'] = 0
movie_df['Revenue'] = 0
movie_df['TMDB_ID'] = ""

for index in movie_df.index:
    movie_info = movie_df['Movie'].iloc[index]
    title_list = movie_info.get("title", "")
    release_year_list = movie_df['Movie'].iloc[index].get("release_date", "")
    break

print(title_list)

Avengers: Age of Ultron
