In [89]:
import concurrent.futures
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# TMDb API key. Set the TMDB_API_KEY environment variable to supply your own.
# NOTE(review): the literal fallback is a credential committed to the notebook;
# rotate it and remove the fallback once every environment sets the variable.
API_KEY = os.environ.get("TMDB_API_KEY", "f5813332cb558d374cbcb057ea2fc48b")
link = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

# Fail fast on a hung connection or a non-2xx answer instead of silently
# parsing an error page further down.
r = requests.get(link, timeout=10)
r.raise_for_status()

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and to avoid bs4's parser warning).
soup = BeautifulSoup(r.content, "html.parser")

In [None]:
# The Wikipedia page contains several "wikitable" tables; the first one is the
# highest-grossing films list.
table = soup.find_all("table", {"class": "wikitable"})[0]
rows = table.find_all("tr")

# Collect the text of the first <i> tag of every row that has one (the original
# note says the italic text is what is wanted); rows with no <i> are skipped.
first_italics = (row.find("i") for row in rows)
movies = [tag.text for tag in first_italics if tag]

# Single-column frame holding the scraped texts.
movies_df = pd.DataFrame(movies, columns=["Movie"])

In [None]:
def find_movie_id(movie_name):
    """Find the TMDb ID for a given movie name.

    Queries TMDb's ``/search/movie`` endpoint and returns the ``id`` of the
    first hit, assuming the first search result is the movie being looked for.

    Args:
        movie_name: Title to search for.

    Returns:
        The ``id`` of the first search result, or ``None`` when ``movie_name``
        is not a non-blank string (e.g. NaN coming from a DataFrame column) or
        when the search returns no results.

    Raises:
        requests.HTTPError: If TMDb answers with a non-2xx status.
    """
    # Nothing sensible to search for; avoid querying the API for "nan" or "".
    if not isinstance(movie_name, str) or not movie_name.strip():
        return None

    # Pass the title through ``params`` so requests URL-encodes it. Formatting
    # it straight into the URL breaks titles containing '&', '#', '+' or '?'
    # (e.g. "Fast & Furious" would be searched as just "Fast ").
    response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params={"api_key": API_KEY, "query": movie_name},
        timeout=10,
    )
    response.raise_for_status()

    results = response.json().get('results') or []
    return results[0]['id'] if results else None

def movies_from_ID(person_id, min_year=2010):
    """Return the movie credits of a person released in ``min_year`` or later.

    Calls TMDb's ``/person/{person_id}/movie_credits`` endpoint and keeps the
    ``cast`` entries whose ``release_date`` year is at least ``min_year``.
    The whole credit dictionary is returned for each movie, not just the title.

    Note: a later cell in this notebook defines ``movies_from_ID`` again
    (returning titles only); once that cell has run, that definition is the
    one in effect.

    Args:
        person_id: TMDb person id.
        min_year: Earliest release year to keep (default 2010).

    Returns:
        list of dict: Matching entries from the response's ``cast`` list.
        Entries with a missing, ``None`` or empty ``release_date`` are skipped.

    Raises:
        requests.HTTPError: If TMDb answers with a non-2xx status.
    """
    url = f"https://api.themoviedb.org/3/person/{person_id}/movie_credits"
    # Use API_KEY (defined in the first cell, like find_movie_id does) and a
    # local header so this function does not depend on names from later cells.
    response = requests.get(
        url,
        params={"api_key": API_KEY},
        headers={"accept": "application/json"},
        timeout=10,
    )
    response.raise_for_status()
    data = response.json()

    # release_date is "YYYY-MM-DD", so comparing the first four characters as
    # strings orders by year. ``or ''`` covers credits without a date, which
    # previously raised KeyError/TypeError.
    cutoff = str(min_year)
    return [
        movie
        for movie in data.get('cast', [])
        if (movie.get('release_date') or '')[:4] >= cutoff
    ]

In [84]:
# Reuse the key configured in the first cell rather than repeating the literal.
api_key = API_KEY
headers = {"accept": "application/json"}

# Names and TMDb ids of the people on pages 1-4 of the "popular people" list,
# kept as two parallel lists.
all_people_names = []
all_people_ids = []
for page in range(1, 5):
    response = requests.get(
        "https://api.themoviedb.org/3/person/popular",
        params={"api_key": api_key, "page": page},
        headers=headers,
        timeout=10,
    )
    # Surface HTTP errors (bad key, rate limit) as such, instead of as a
    # KeyError on 'results' below.
    response.raise_for_status()
    data = response.json()
    for person in data['results']:
        all_people_names.append(person['name'])
        all_people_ids.append(person['id'])

In [85]:
def movies_from_ID(person_id, min_year=2010):
    """Return the titles of a person's movies released in ``min_year`` or later.

    Calls TMDb's ``/person/{person_id}/movie_credits`` endpoint (using the
    module-level ``api_key`` and ``headers``) and keeps the ``cast`` entries
    whose ``release_date`` year is at least ``min_year``.

    Args:
        person_id: TMDb person id.
        min_year: Earliest release year to keep (default 2010).

    Returns:
        list of str: The ``title`` of every matching cast credit. Credits with
        a missing, ``None`` or empty ``release_date`` are skipped.

    Raises:
        requests.HTTPError: If TMDb answers with a non-2xx status.
    """
    url = f"https://api.themoviedb.org/3/person/{person_id}/movie_credits"
    response = requests.get(url, params={"api_key": api_key}, headers=headers, timeout=10)
    response.raise_for_status()
    data = response.json()

    # release_date is "YYYY-MM-DD", so comparing the first four characters as
    # strings orders by year. ``or ''`` covers credits without a date, which
    # previously raised KeyError/TypeError and aborted the whole batch.
    cutoff = str(min_year)
    return [
        movie['title']
        for movie in data.get('cast', [])
        if (movie.get('release_date') or '')[:4] >= cutoff
    ]


In [86]:
# One row per person: the name in 'actors', the TMDb id in 'actor_ids'.
df_actors = pd.DataFrame(all_people_names, columns=['actors']).assign(actor_ids=all_people_ids)

# Look up every actor's movies concurrently. Results are collected in
# submission order, so they line up with the rows of the frame.
with concurrent.futures.ThreadPoolExecutor() as executor:
    credit_jobs = [executor.submit(movies_from_ID, actor_id) for actor_id in df_actors['actor_ids']]
    df_actors['movies'] = [job.result() for job in credit_jobs]


In [None]:
# Build a table of the movies the actors appeared in and enrich each one with
# TMDb details: rating, popularity, genres, revenue, release date and cast.

# Get unique movies. An actor whose movie list is empty explodes to NaN, so
# drop those rather than sending "nan" to the search API.
unique_movies = df_actors["movies"].explode().dropna().unique()

# Create a new DataFrame with the unique movies
movies_df = pd.DataFrame(unique_movies, columns=['Movie'])
print(len(movies_df))
# Only the first 10 titles are enriched. .copy() makes the subset an
# independent frame, so the column assignments below do not write into a slice
# of the full frame (which can raise SettingWithCopyWarning).
movies_df = movies_df.head(10).copy()

# Resolve each title to a TMDb movie id (missing when the search has no hit).
movies_df["Movie_ID"] = movies_df["Movie"].apply(find_movie_id)

movie_casts = []
movie_ratings = []
movie_popularities = []
movie_genres = []
movie_revenues = []
movie_release_dates = []

# Retrieve additional information for each movie
headers = {"accept": "application/json"}

for movie_id in movies_df["Movie_ID"]:
    if pd.isna(movie_id):
        # No TMDb match for this title: append placeholders so every list
        # stays aligned with the rows of movies_df.
        movie_ratings.append(None)
        movie_popularities.append(None)
        movie_genres.append([])
        movie_revenues.append(None)
        movie_release_dates.append(None)
        movie_casts.append([])
        continue

    # When some ids are missing pandas may hold the column as float (603.0);
    # cast back to int so the URL path is a valid id.
    movie_url = f"https://api.themoviedb.org/3/movie/{int(movie_id)}"

    response = requests.get(movie_url, params={"api_key": API_KEY}, headers=headers, timeout=10)
    response.raise_for_status()
    data = response.json()

    movie_ratings.append(data.get('vote_average'))
    movie_popularities.append(data.get('popularity'))
    movie_genres.append([genre['name'] for genre in data.get('genres', [])])
    movie_revenues.append(data.get('revenue'))
    movie_release_dates.append(data.get('release_date'))

    response = requests.get(f"{movie_url}/credits", params={"api_key": API_KEY}, headers=headers, timeout=10)
    response.raise_for_status()
    data = response.json()

    cast = [actor['name'] for actor in data.get('cast', [])]
    movie_casts.append(cast)

# Add the additional information to the dataframe
movies_df["Rating"] = movie_ratings
movies_df["Popularity"] = movie_popularities
movies_df["Genres"] = movie_genres
movies_df["Revenue"] = movie_revenues
movies_df["Release_Date"] = movie_release_dates
movies_df["Cast"] = movie_casts

In [None]:
# Display the movie table assembled in the previous cell.
movies_df

In [92]:
import itertools
import networkx as nx
from collections import defaultdict

def fetch_page(page):
    """Fetch one page of TMDb's "popular people" list.

    Uses the module-level ``api_key`` and ``headers``.

    Args:
        page: Page number to request.

    Returns:
        list of tuple: ``(name, id)`` for every person in the page's
        ``results``; empty if the response carries no ``results``.

    Raises:
        requests.HTTPError: If TMDb answers with a non-2xx status.
    """
    response = requests.get(
        "https://api.themoviedb.org/3/person/popular",
        params={"api_key": api_key, "page": page},
        headers=headers,
        timeout=10,
    )
    # Report HTTP failures (bad key, rate limit, page out of range) as such,
    # rather than as a KeyError on 'results'.
    response.raise_for_status()
    data = response.json()
    return [(person['name'], person['id']) for person in data.get('results', [])]

In [93]:
# Pages 1 through 99 of the popular-people listing, fetched concurrently.
pages = list(range(1, 100))
with concurrent.futures.ThreadPoolExecutor() as executor:
    people = [*executor.map(fetch_page, pages)]

# Flatten the per-page lists of (name, id) pairs, then split the pairs into
# two parallel tuples.
all_people_names, all_people_ids = zip(*itertools.chain.from_iterable(people))

In [94]:
# Frame with one row per person: name in 'actors', id in 'actor_ids'.
df_actors = pd.DataFrame(all_people_names, columns=['actors'])
df_actors = df_actors.assign(actor_ids=all_people_ids)

# Fetch each actor's movies in a thread pool; map() yields results in input
# order, so the list matches the frame row for row.
with concurrent.futures.ThreadPoolExecutor() as executor:
    movies_per_actor = executor.map(movies_from_ID, df_actors['actor_ids'])
    df_actors = df_actors.assign(movies=list(movies_per_actor))

In [95]:
# Step 1: Prepare the Data
# Map every movie title to the actors credited in it.
# NOTE(review): movies are keyed by title and actors by name, so distinct
# films or people that share a name are merged; consider keying on TMDb ids.
movie_to_actors = defaultdict(list)

# zip over the two columns instead of iterrows(): same pairs, without building
# a Series per row.
for actor, movies in zip(df_actors['actors'], df_actors['movies']):
    for movie in movies:
        movie_to_actors[movie].append(actor)

# Every pair of actors sharing a movie becomes an edge. dict.fromkeys drops
# repeated names (an actor listed more than once in df_actors) while keeping
# order, so no (A, A) self-loops are produced; a self-loop would add 2 to that
# actor's degree.
edges = []
for actors in movie_to_actors.values():
    edges.extend(itertools.combinations(dict.fromkeys(actors), 2))

# Step 2: Create the Graph
# An undirected simple graph: repeated pairs collapse into a single edge.
G = nx.Graph()
G.add_edges_from(edges)

# Step 3: Analyze the Graph
# Degree = number of distinct co-stars; list actors from most to least connected.
degrees = dict(G.degree())
most_connected_actors = sorted(degrees.items(), key=lambda x: x[1], reverse=True)

In [97]:
# Print the graph's one-line summary (node and edge counts).
print(G)

Graph with 1617 nodes and 41022 edges
