In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import requests

import os

# TMDb API key used by every request in this notebook.
# Prefer supplying it through the TMDB_API_KEY environment variable so the
# secret does not have to live in the notebook.
# NOTE(review): the literal fallback below is a credential committed to the
# file; it is kept only so existing runs keep working. Rotate the key and
# delete the fallback once the environment variable is set everywhere.
API_KEY = os.environ.get("TMDB_API_KEY") or "f5813332cb558d374cbcb057ea2fc48b"

In [None]:
def find_movie_id(movie_name):
    """Find the TMDb ID for a given movie name.

    Sends the title to TMDb's movie search endpoint and returns the ID of the
    first hit, on the assumption that the first search result is the movie
    being looked for.

    Parameters
    ----------
    movie_name : str
        Title to search for. Anything that is not a non-blank string (for
        example the NaN produced by ``Series.explode`` on an empty list, or
        ``None``) is treated as "no movie" and no request is made.

    Returns
    -------
    int or None
        ID of the first search result, or ``None`` when there is no result or
        no usable title.

    Raises
    ------
    requests.HTTPError
        If TMDb answers with an error status.
    """
    if not isinstance(movie_name, str) or not movie_name.strip():
        return None

    # Pass the title via ``params`` so requests URL-encodes it; titles with
    # characters such as '&', '#' or '+' would otherwise corrupt the query.
    response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params={"api_key": API_KEY, "query": movie_name},
        timeout=30,  # do not let one stalled request hang the whole notebook
    )
    response.raise_for_status()

    results = response.json().get('results') or []
    return results[0]['id'] if results else None

def movie_titles_and_IDs_from_actor_ID(actor_id, session):
    """Return the titles and TMDb IDs of the movies an actor is cast in.

    Parameters
    ----------
    actor_id
        TMDb person ID, interpolated into the movie-credits URL.
    session
        Object exposing ``get(url)`` whose result has a ``json()`` method
        (the notebook passes a ``requests.Session``).

    Returns
    -------
    tuple of (list, list)
        ``(original_titles, movie_ids)``; both lists follow the order of the
        ``cast`` entries in the response and have the same length.
    """
    credits_url = f"https://api.themoviedb.org/3/person/{actor_id}/movie_credits?api_key={API_KEY}"
    cast_credits = session.get(credits_url).json()['cast']

    titles, movie_ids = [], []
    for credit in cast_credits:
        titles.append(credit['original_title'])
        movie_ids.append(credit['id'])
    return titles, movie_ids

In [None]:
# Reuse the key configured at the top of the notebook instead of repeating
# the secret literal in this cell.
api_key = API_KEY

# Parallel lists: index i of each list describes the same person.
all_people_names = []
all_people_ids = []
all_people_genders = []
all_people_birth = []

# A single Session lets the many sequential requests below share connections.
with requests.Session() as session:
    # Walk pages 1-4 of the "popular people" listing.
    for page in range(1, 5):
        url_popular = f"https://api.themoviedb.org/3/person/popular?api_key={api_key}&page={page}"
        response = session.get(url_popular)
        data = response.json()
        for person in data['results']:
            all_people_names.append(person['name'])
            all_people_ids.append(person['id'])
            all_people_genders.append(person['gender'])

            # The listing entry is read for name/id/gender only; the birthplace
            # comes from the per-person details endpoint.
            url_actor = f"https://api.themoviedb.org/3/person/{person['id']}?api_key={api_key}"
            data_actor = session.get(url_actor).json()
            place_of_birth = data_actor.get('place_of_birth')
            if place_of_birth is not None:
                # Keep only the last comma-separated component and trim it, so
                # "City, Region, USA" becomes "USA" rather than " USA".
                all_people_birth.append(place_of_birth.split(",")[-1].strip())
            else:
                all_people_birth.append(None)


In [None]:
# Create a DataFrame with 'actors' column
df_actors = pd.DataFrame(all_people_names, columns=['actors'])

# Add the remaining per-person columns; the source lists are index-aligned
# with all_people_names.
df_actors['actor_ids'] = all_people_ids
df_actors['gender'] = all_people_genders
df_actors['place_of_birth'] = all_people_birth


# Fetch movies for each actor and add them to 'movies' column.
# movie_titles_and_IDs_from_actor_ID returns (titles, ids) and needs a session;
# only the titles are stored in this column.
with requests.Session() as session:

    def _titles_for_actor(actor_id):
        """Return just the list of movie titles for one actor ID."""
        titles, _movie_ids = movie_titles_and_IDs_from_actor_ID(actor_id, session)
        return titles

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so results line up with the rows.
        df_actors['movies'] = list(executor.map(_titles_for_actor, df_actors['actor_ids']))
    

In [None]:
# Show the actors table as this cell's output.
df_actors

In [None]:
# Build the per-movie table: one row per distinct title found in any actor's
# filmography, with the TMDb ID looked up through find_movie_id.
# explode() yields NaN for actors whose movie list is empty, so those entries
# are dropped before any title is sent to the search endpoint.
unique_movies = df_actors["movies"].explode().dropna().unique()

# Create a new DataFrame with the unique movies
movies_df = pd.DataFrame(unique_movies, columns=['movies'])

# find_movie_id returns None when the search has no hit.
movies_df["movie_ID"] = movies_df["movies"].apply(find_movie_id)

# Retrieve additional information for each movie
headers = {"accept": "application/json"}

# One dict of movie details per row of movies_df (same order). An empty dict
# stands for "no details available" so the columns below stay row-aligned.
movie_details = []
with requests.Session() as session:
    session.headers.update(headers)
    for movie_id in movies_df["movie_ID"]:
        if pd.isna(movie_id):
            # No search hit for this title: nothing to request.
            movie_details.append({})
            continue
        # int(): the ID column may hold floats when some IDs are missing.
        url = f"https://api.themoviedb.org/3/movie/{int(movie_id)}?api_key={API_KEY}"
        response = session.get(url)
        # A non-2xx answer carries no movie fields; treat it as missing.
        movie_details.append(response.json() if response.ok else {})

movie_ratings = [details.get('vote_average') for details in movie_details]
movie_popularities = [details.get('popularity') for details in movie_details]
movie_genres = [[genre['name'] for genre in details.get('genres', [])] for details in movie_details]
movie_revenues = [details.get('revenue') for details in movie_details]
movie_release_dates = [details.get('release_date') for details in movie_details]
movie_abstracts = [details.get('overview') for details in movie_details]

# Add the additional information to the dataframe
movies_df["rating"] = movie_ratings
movies_df["popularity"] = movie_popularities
movies_df["genres"] = movie_genres
movies_df["revenue"] = movie_revenues
movies_df["release_date"] = movie_release_dates
movies_df["abstract"] = movie_abstracts

In [None]:
# Show the movies table as this cell's output.
movies_df

In [None]:
import itertools
import networkx as nx
from collections import defaultdict
from functools import partial

# Headers sent with every TMDb request: ask for a JSON response body.
headers = {"accept": "application/json"}
# Reuse the key configured at the top of the notebook instead of repeating
# the secret literal here; fetch_page reads this lowercase name.
api_key = API_KEY

In [None]:
def fetch_page(page, session):
    """Fetch one page of TMDb's popular-people listing with per-person details.

    For every person on the requested listing page a second request is made
    to that person's details endpoint.

    Parameters
    ----------
    page
        Page number, interpolated into the listing URL.
    session
        Object exposing ``get(url)`` whose result has a ``json()`` method
        (the notebook passes a ``requests.Session``).

    Returns
    -------
    list of tuple
        One ``(name, id, gender, place_of_birth)`` tuple per listed person, in
        listing order. Name and id come from the listing entry; gender and
        place of birth come from the details response.
    """
    listing_url = f"https://api.themoviedb.org/3/person/popular?api_key={api_key}&page={page}"
    listing = session.get(listing_url).json()

    def _describe(person):
        # Second round-trip: the details endpoint supplies gender/birthplace.
        detail_url = f"https://api.themoviedb.org/3/person/{person['id']}?api_key={api_key}"
        detail = session.get(detail_url).json()
        return (person['name'], person['id'], detail['gender'], detail['place_of_birth'])

    return [_describe(person) for person in listing['results']]

# Listing pages 1-4 are fetched in parallel worker threads; every worker goes
# through the same Session, which carries the shared request headers.
pages = list(range(1, 5))

with requests.Session() as session:
    session.headers.update(headers)
    fetch_page_with_session = partial(fetch_page, session=session)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # One list of (name, id, gender, place_of_birth) tuples per page,
        # returned in page order.
        people = list(executor.map(fetch_page_with_session, pages))

# Flatten the per-page lists, then transpose the tuples into four parallel
# tuples (one per field).
all_people_names, all_people_ids, all_people_genders, all_people_birthplaces = zip(
    *itertools.chain.from_iterable(people)
)

In [None]:
# Actor table: one row per person, built from the four parallel tuples.
df_actors = pd.DataFrame(all_people_names, columns=['actors'])
df_actors['actor_ids'] = all_people_ids
df_actors['genders'] = all_people_genders
df_actors['birthplaces'] = all_people_birthplaces

# Look up every actor's movie credits in up to 10 worker threads that share
# one Session. Each result is a (titles, movie_ids) pair, in row order.
with requests.Session() as session:
    session.headers.update(headers)
    movie_titles_and_ids_from_actor_ID_with_session = partial(
        movie_titles_and_IDs_from_actor_ID, session=session
    )
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        movies_and_ids = list(
            executor.map(movie_titles_and_ids_from_actor_ID_with_session, df_actors['actor_ids'])
        )

# Per-actor list of movie titles.
df_actors['movies'] = [titles for titles, _ in movies_and_ids]

# Flatten all actors' credits into two parallel lists (titles and IDs).
movies = list(itertools.chain.from_iterable(titles for titles, _ in movies_and_ids))
ids = list(itertools.chain.from_iterable(movie_ids for _, movie_ids in movies_and_ids))

# One record per credit, then a movie table with repeated
# (title, ID) rows removed.
movies_and_ids_dict = [
    {'movie': title, 'movie_ID': movie_id} for title, movie_id in zip(movies, ids)
]
movies_df = pd.DataFrame(movies_and_ids_dict).drop_duplicates()

In [None]:
import aiohttp
import asyncio

# Headers attached to every asynchronous request: ask for a JSON body.
headers = {"accept": "application/json"}


async def fetch(session, url):
    """GET ``url`` through ``session`` and return the decoded JSON body.

    ``session.get(url, headers=headers)`` must return an async context
    manager whose value has an awaitable ``json()`` method (the notebook
    passes an ``aiohttp.ClientSession``).
    """
    async with session.get(url, headers=headers) as resp:
        payload = await resp.json()
    return payload

# Define an asynchronous function to handle the API requests
async def fetch_all(urls, max_concurrency=20):
    """Fetch every URL through one shared client session.

    Parameters
    ----------
    urls : iterable of str
        URLs to request.
    max_concurrency : int, optional
        Upper bound on the number of requests in flight at the same time.
        Without a bound, a long URL list would be sent all at once.

    Returns
    -------
    list
        Whatever ``fetch`` returns for each URL, in the same order as
        ``urls``. An empty input gives an empty list.

    Raises
    ------
    ValueError
        If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # Created inside the coroutine so it belongs to the running event loop.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def _bounded_fetch(session, url):
        # Wait for a free slot before issuing the request.
        async with semaphore:
            return await fetch(session, url)

    async with aiohttp.ClientSession() as session:
        tasks = [_bounded_fetch(session, url) for url in urls]
        # gather() returns results in the order the awaitables were passed.
        responses = await asyncio.gather(*tasks)
        return responses

In [None]:
from tqdm import tqdm
# Prepare the URLs
urls = [f"https://api.themoviedb.org/3/movie/{id}?api_key={API_KEY}" for id in movies_df["movie_ID"]]

# Fetch all responses
responses = await fetch_all(urls)

# Initialize empty lists to store the data
ratings = []
popularities = []
genres = []
revenues = []
release_dates = []
abstracts = []

# Process the responses one by one
for data in tqdm(responses):
    ratings.append(data['vote_average'])
    popularities.append(data['popularity'])
    genres.append([genre['name'] for genre in data['genres']])
    revenues.append(data['revenue'])
    release_dates.append(data['release_date'])
    abstracts.append(data['overview'])

# Assign the lists to the DataFrame columns
movies_df['rating'] = ratings
movies_df['popularity'] = popularities
movies_df['genres'] = genres
movies_df['revenue'] = revenues
movies_df['release_date'] = release_dates
movies_df['abstract'] = abstracts

In [None]:
# Step 1: Prepare the Data
# Map each movie title to the actors (from df_actors) who list it.
# NOTE(review): movies are keyed by title, so distinct movies that share a
# title are merged here; keying by movie ID would avoid that - confirm
# whether this matters for the analysis.
movie_to_actors = defaultdict(list)

for actor, movies in zip(df_actors['actors'], df_actors['movies']):
    for movie in movies:
        movie_to_actors[movie].append(actor)

# Every pair of actors sharing a movie becomes an edge.
# dict.fromkeys() removes repeated names while keeping first-seen order: an
# actor listed twice under one title would otherwise be paired with
# themselves and create a self-loop that inflates their degree.
edges = []
for actors in movie_to_actors.values():
    edges.extend(itertools.combinations(dict.fromkeys(actors), 2))

# Step 2: Create the Graph
G = nx.Graph()

# Add nodes with attributes (every actor gets a node, even without edges).
for actor, gender, birthplace in zip(
    df_actors['actors'], df_actors['genders'], df_actors['birthplaces']
):
    G.add_node(actor, gender=gender, birthplace=birthplace)

# Add edges (nx.Graph keeps a single edge per actor pair).
G.add_edges_from(edges)

# Step 3: Analyze the Graph
# (actor, degree) pairs, highest degree first.
degrees = dict(G.degree())
most_connected_actors = sorted(degrees.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Print the string form of the graph G.
print(G)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Degrees of all nodes in ascending order, plus the histogram used for the
# bars: degree_count is indexed by degree and used as the bar heights.
degree_sequence = sorted(degree for _, degree in G.degree())
degree_count = nx.degree_histogram(G)

# Bar chart: x = degree, height = corresponding histogram entry.
fig, ax = plt.subplots()
ax.bar(range(len(degree_count)), degree_count, align='center')
ax.set_xlabel('Degree')
ax.set_ylabel('Count')
ax.set_title('Degree Distribution')
plt.show()


In [None]:
# Same degree histogram, drawn with a logarithmic y-axis.
plt.bar(range(len(degree_count)), degree_count, align='center')
plt.yscale('log')
plt.xlabel('Degree')
plt.ylabel('Count')
plt.title('Degree Distribution (log scale)')
# Render explicitly, as the linear-scale plot cell does; otherwise the cell's
# output is the Text object returned by plt.title().
plt.show()