In [8]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# The TMDb key is read from the environment so the credential is never stored
# in (or committed with) the notebook. Export TMDB_API_KEY before starting the
# kernel.
API_KEY = os.environ.get("TMDB_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the TMDB_API_KEY environment variable to a valid TMDb API key."
    )

link = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

# Fail fast on an HTTP error instead of parsing an error page, and use a
# timeout so a stalled connection cannot hang the kernel indefinitely.
r = requests.get(link, timeout=10)
r.raise_for_status()

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed, so results could differ between environments.
soup = BeautifulSoup(r.content, "html.parser")

In [9]:
# The Wikipedia page contains several "wikitable" tables; the first one is the
# highest-grossing films list.
wikitables = soup.find_all("table", {"class": "wikitable"})
table = wikitables[0]
rows = table.find_all("tr")

# Each film title is the first italic (<i>) element of its row; rows without
# one are skipped.
italic_tags = [row.find("i") for row in rows]
movies = [tag.text for tag in italic_tags if tag]

# Single-column dataframe holding the scraped titles
movies_df = pd.DataFrame(movies, columns=["Movie"])

In [10]:
def find_movie_id(movie_name, year=None):
    """Find the TMDb ID of the first search result for a movie title.

    Parameters
    ----------
    movie_name : str
        Title to search for. It is sent as a query parameter so that
        characters such as "&", "#" or "+" are percent-encoded instead of
        being interpreted as part of the URL.
    year : int or str, optional
        When given, forwarded as TMDb's ``year`` search filter to help
        disambiguate remakes that share a title. Omitted by default, which
        keeps the original title-only search.

    Returns
    -------
    int or None
        The ``id`` of the first search result, or ``None`` when the search
        returns no results.

    Raises
    ------
    requests.HTTPError
        If TMDb answers with an error status (e.g. an invalid API key).
    """
    params = {"api_key": API_KEY, "query": movie_name}
    if year is not None:
        params["year"] = year

    response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params=params,
        timeout=10,
    )
    response.raise_for_status()

    # Assumes the first search result is the intended movie.
    results = response.json().get("results") or []
    return results[0]["id"] if results else None

In [13]:
# Add a column with the TMDb IDs by applying find_movie_id to the Movie column.
# Titles without a search match get a missing value (None/NaN).
movies_df["Movie_ID"] = movies_df["Movie"].apply(find_movie_id)

movie_casts = []
movie_ratings = []
movie_popularities = []
movie_genres = []
movie_revenues = []
movie_release_dates = []

# Retrieve additional information for each movie
headers = {"accept": "application/json"}
auth_params = {"api_key": API_KEY}

for movie_id in movies_df["Movie_ID"]:
    if pd.isna(movie_id):
        # No TMDb match: append placeholders so every list stays aligned with
        # the dataframe rows instead of requesting ".../movie/nan".
        movie_ratings.append(None)
        movie_popularities.append(None)
        movie_genres.append([])
        movie_revenues.append(None)
        movie_release_dates.append(None)
        movie_casts.append([])
        continue

    # A single missing ID can make pandas store the whole column as floats;
    # cast back so the URL is ".../movie/19995" rather than ".../movie/19995.0".
    movie_id = int(movie_id)

    # Core movie details
    response = requests.get(
        f"https://api.themoviedb.org/3/movie/{movie_id}",
        params=auth_params,
        headers=headers,
        timeout=10,
    )
    response.raise_for_status()
    details = response.json()

    movie_ratings.append(details['vote_average'])
    movie_popularities.append(details['popularity'])
    movie_genres.append([genre['name'] for genre in details['genres']])
    movie_revenues.append(details['revenue'])
    movie_release_dates.append(details['release_date'])

    # Cast list comes from the separate credits endpoint
    response = requests.get(
        f"https://api.themoviedb.org/3/movie/{movie_id}/credits",
        params=auth_params,
        headers=headers,
        timeout=10,
    )
    response.raise_for_status()
    credits = response.json()

    movie_casts.append([actor['name'] for actor in credits['cast']])

# Add the additional information to the dataframe
movies_df["Rating"] = movie_ratings
movies_df["Popularity"] = movie_popularities
movies_df["Genres"] = movie_genres
movies_df["Revenue"] = movie_revenues
movies_df["Release_Date"] = movie_release_dates
movies_df["Cast"] = movie_casts

In [14]:
# Display the enriched dataframe (title, TMDb ID, rating, popularity, genres,
# revenue, release date and cast).
# NOTE(review): in the recorded output "The Lion King" (row 8) carries a 1994
# release date; this looks like the first-search-result lookup matching a
# different release than the one on the Wikipedia list. Confirm.
movies_df

Unnamed: 0,Movie,Movie_ID,Rating,Popularity,Genres,Revenue,Release_Date,Cast
0,Avatar,19995,7.58,117.894,"[Action, Adventure, Fantasy, Science Fiction]",2923706026,2009-12-15,"[Sam Worthington, Zoe Saldaña, Sigourney Weave..."
1,Avengers: Endgame,299534,8.256,115.419,"[Adventure, Science Fiction, Action]",2800000000,2019-04-24,"[Robert Downey Jr., Chris Evans, Mark Ruffalo,..."
2,Avatar: The Way of Water,76600,7.627,252.369,"[Science Fiction, Adventure, Action]",2320250281,2022-12-14,"[Sam Worthington, Zoe Saldaña, Sigourney Weave..."
3,Titanic,597,7.904,126.582,"[Drama, Romance]",2264162353,1997-11-18,"[Leonardo DiCaprio, Kate Winslet, Billy Zane, ..."
4,Star Wars: The Force Awakens,140607,7.285,56.305,"[Adventure, Action, Science Fiction]",2068223624,2015-12-15,"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad..."
5,Avengers: Infinity War,299536,8.248,178.177,"[Adventure, Action, Science Fiction]",2052415039,2018-04-25,"[Robert Downey Jr., Chris Hemsworth, Mark Ruff..."
6,Spider-Man: No Way Home,634649,7.972,226.835,"[Action, Adventure, Science Fiction]",1921847111,2021-12-15,"[Tom Holland, Zendaya, Benedict Cumberbatch, J..."
7,Jurassic World,135397,6.685,80.319,"[Action, Adventure, Science Fiction, Thriller]",1671537444,2015-06-06,"[Chris Pratt, Bryce Dallas Howard, Ty Simpkins..."
8,The Lion King,8587,8.257,102.636,"[Family, Animation, Drama]",763455561,1994-06-24,"[Matthew Broderick, Moira Kelly, Nathan Lane, ..."
9,The Avengers,24428,7.712,110.265,"[Science Fiction, Action, Adventure]",1518815515,2012-04-25,"[Robert Downey Jr., Chris Evans, Mark Ruffalo,..."
