In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import requests
import itertools
import networkx as nx
from collections import defaultdict
from functools import partial
import aiohttp
import asyncio
from tqdm import tqdm
from requests_futures.sessions import FuturesSession

import os

# Default headers sent with every request: ask the API for JSON responses.
headers = {"accept": "application/json"}

# SECURITY: an API key should not live in a notebook that is shared or
# committed. Set the TMDB_API_KEY environment variable to supply your own key;
# the literal below is kept only as a backward-compatible fallback and should
# be revoked and removed.
api_key = os.environ.get("TMDB_API_KEY") or "f5813332cb558d374cbcb057ea2fc48b"

In [None]:
def movie_titles_and_IDs_from_actor_ID(actor_id, session):
    """Fetch the movie credits of one person from the TMDB API.

    Args:
        actor_id: Person ID interpolated into the request URL.
        session: Object exposing ``get(url)`` whose result has ``.json()``
            (a ``requests.Session`` in this notebook).

    Returns:
        A pair ``(titles, movie_ids)`` of equally long lists taken from the
        ``cast`` entries of the response (``original_title`` and ``id`` of
        each entry). Both lists are empty when the payload carries no ``cast``
        entries, e.g. an error body, so that a single failed lookup does not
        abort a whole batch of lookups.
    """
    url = f"https://api.themoviedb.org/3/person/{actor_id}/movie_credits?api_key={api_key}"
    data = session.get(url).json()
    # Error payloads have no 'cast' key; treat them like "no credits".
    cast = (data.get('cast') if isinstance(data, dict) else None) or []
    titles = [movie['original_title'] for movie in cast]
    movie_ids = [movie['id'] for movie in cast]
    return titles, movie_ids

In [None]:
def fetch_page(page, session):
    """Fetch one page of popular people plus per-person details from TMDB.

    For every entry in the page's ``results`` a second request is made to the
    person endpoint to read ``gender`` and ``place_of_birth``.

    Args:
        page: Page number interpolated into the "popular people" URL.
        session: Object exposing ``get(url)`` whose result has ``.json()``
            (a ``requests.Session`` in this notebook).

    Returns:
        A list of ``(name, id, gender, place_of_birth)`` tuples, one per
        person on the page. ``gender`` / ``place_of_birth`` are ``None`` when
        the detail payload lacks them, and the list is empty when the page
        payload has no ``results``; this keeps one bad response from aborting
        the whole batch of pages.
    """
    url = f"https://api.themoviedb.org/3/person/popular?api_key={api_key}&page={page}"
    data = session.get(url).json()
    people = []
    for person in data.get('results') or []:
        person_url = f"https://api.themoviedb.org/3/person/{person['id']}?api_key={api_key}"
        person_data = session.get(person_url).json()
        people.append((
            person['name'],
            person['id'],
            person_data.get('gender'),
            person_data.get('place_of_birth'),
        ))
    return people

# Download pages 1-50 of the popular-people listing through a thread pool;
# all workers share one HTTP session that carries the default headers.
pages = list(range(1, 51))
with requests.Session() as session, concurrent.futures.ThreadPoolExecutor() as executor:
    session.headers.update(headers)
    fetch_page_with_session = partial(fetch_page, session=session)
    people = list(executor.map(fetch_page_with_session, pages))

# Flatten the per-page lists and transpose the (name, id, gender, birthplace)
# tuples into four parallel tuples.
all_people_names, all_people_ids, all_people_genders, all_people_birthplaces = zip(
    *itertools.chain.from_iterable(people)
)

In [None]:
# Actor table: one row per fetched person, one column per attribute
df_actors = pd.DataFrame(all_people_names, columns=['actors']).assign(
    actor_ids=all_people_ids,
    genders=all_people_genders,
    birthplaces=all_people_birthplaces,
)

# Look up the movie credits of every actor through a pool of 10 workers that
# share one HTTP session. Each result is a (titles, movie_ids) pair, in the
# same order as the rows of df_actors.
with requests.Session() as session:
    session.headers.update(headers)
    movie_titles_and_ids_from_actor_ID_with_session = partial(
        movie_titles_and_IDs_from_actor_ID, session=session
    )
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        movies_and_ids = list(
            executor.map(movie_titles_and_ids_from_actor_ID_with_session, df_actors['actor_ids'])
        )

# Only the titles are stored on df_actors; the IDs stay in movies_and_ids
df_actors['movies'] = [titles for titles, _ in movies_and_ids]

# Flatten titles and IDs into two parallel lists covering every credit
movies = list(itertools.chain.from_iterable(titles for titles, _ in movies_and_ids))
ids = list(itertools.chain.from_iterable(movie_ids for _, movie_ids in movies_and_ids))

# One record per credit ...
movies_and_ids_dict = [
    {'movie': title, 'movie_ID': movie_id} for title, movie_id in zip(movies, ids)
]

# ... turned into a table with repeated (title, ID) rows removed
movies_df = pd.DataFrame(movies_and_ids_dict).drop_duplicates()

In [None]:
# Display movies_df (the deduplicated movie title / movie_ID table).
movies_df

In [None]:
def fetch(session, url):
    """Issue a GET for ``url`` on ``session`` with the module-level ``headers``.

    Returns whatever ``session.get`` returns, without waiting on or
    inspecting it.
    """
    return session.get(url, headers=headers)

def fetch_all(urls):
    """Request every URL through one ``FuturesSession`` and decode the JSON bodies.

    All requests are started via ``fetch`` before any result is awaited.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list with the ``.json()`` payload of each response, in the order of
        ``urls``. Any exception raised by ``result()`` or ``json()`` propagates
        to the caller.
    """
    with FuturesSession() as session:
        # Phase 1: submit everything.
        pending = []
        for url in urls:
            pending.append(fetch(session, url))
        # Phase 2: collect in submission order.
        payloads = []
        for future in pending:
            payloads.append(future.result().json())
    return payloads

In [None]:
# Prepare the URLs: one movie-details request per row of movies_df
urls = [f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}" for movie_id in movies_df["movie_ID"]]

# Fetch all responses
responses = fetch_all(urls)

# Initialize empty lists to store the data
ratings = []
popularities = []
genres = []
revenues = []
release_dates = []
abstracts = []

# Process the responses one by one. Every response must contribute exactly one
# entry to each list, because the lists are assigned positionally to the rows
# of movies_df below. An entry that is not a JSON object is therefore replaced
# by an empty record (all fields missing) instead of being skipped.
failed_responses = 0
for data in tqdm(responses):
    if not isinstance(data, dict):
        failed_responses += 1
        data = {}
    ratings.append(data.get('vote_average'))
    popularities.append(data.get('popularity'))
    # `or []` also covers a payload where 'genres' is present but null
    genres.append([genre['name'] for genre in data.get('genres') or []])
    revenues.append(data.get('revenue'))
    release_dates.append(data.get('release_date'))
    abstracts.append(data.get('overview'))

if failed_responses:
    print(f"Error: {failed_responses} responses were not JSON objects and were left empty")

# Assign the lists to the DataFrame columns
movies_df['rating'] = ratings
movies_df['popularity'] = popularities
movies_df['genres'] = genres
movies_df['revenue'] = revenues
movies_df['release_date'] = release_dates
movies_df['abstract'] = abstracts

In [None]:
# Display movies_df again, now including the rating, popularity, genres,
# revenue, release_date and abstract columns.
movies_df

In [None]:
# Combine the two dataframes so that each actor is associated with the movies
# they have acted in.
#
# The join key is the movie ID, not the title: titles are not unique, so a
# title-based merge links an actor to every film that shares a title with one
# of their credits. The per-actor ID lists are taken from `movies_and_ids`,
# which was produced in the same order as the rows of `df_actors`.
assert len(movies_and_ids) == len(df_actors), "movies_and_ids is out of sync with df_actors"

df_movies_filtered = (
    movies_df
    .rename(columns={'movie': 'movies'})
    .drop_duplicates(subset=['movie_ID'])
)

# One row per (actor, movie ID) credit
df_actors_filtered = (
    df_actors
    .drop(columns=['movies'])  # titles are re-attached from the movie table below
    .assign(movie_ID=[movie_ids for _, movie_ids in movies_and_ids])
    .explode('movie_ID')
    .dropna(subset=['movie_ID'])  # actors without any credit
    .astype({'movie_ID': 'int64'})
    .drop_duplicates(subset=['actor_ids', 'movie_ID'])  # same credit listed more than once
    .reset_index(drop=True)
)

df_actors_movies = df_actors_filtered.merge(
    df_movies_filtered, on='movie_ID', how='left', validate='many_to_one'
)

# Keep only movies released between 2010-01-01 and 2024-04-16 (inclusive);
# rows without a release date are dropped as well
released_in_window = df_actors_movies['release_date'].between('2010-01-01', '2024-04-16')
df_actors_movies = df_actors_movies[released_in_window]

# Collapse the actor column so there is only one row per actor, with the
# movies and movie_IDs stored in (aligned) lists
df_actors_filtered = (
    df_actors_movies
    .groupby('actors')
    .agg({
        'actor_ids': 'first',
        'genders': 'first',
        'birthplaces': 'first',
        'movies': list,
        'movie_ID': list,
    })
    .reset_index()
)

# One row per distinct movie ID, ordered by title; films that merely share a
# title stay separate rows
movie_columns = ['movies', 'movie_ID', 'rating', 'popularity', 'genres', 'revenue', 'release_date', 'abstract']
df_movies_filtered = (
    df_actors_movies
    .drop_duplicates(subset=['movie_ID'])[movie_columns]
    .sort_values('movies')
    .reset_index(drop=True)
)

In [None]:
# Display the per-actor table (one row per actor name, movies as lists).
df_actors_filtered

In [None]:
# Display the filtered movie table.
df_movies_filtered

In [None]:
# Summary counts for the filtered movie table (nothing is removed here).
# Boolean masks are summed rather than looked up via value_counts().loc[...],
# which raises KeyError whenever the looked-up value never occurs.

# Count how many movies have a revenue of at least 1
print((df_movies_filtered['revenue'] >= 1).sum())

# Count how many times the revenue is 0
print((df_movies_filtered['revenue'] == 0).sum())

# Count how many times the abstract is an empty string
print((df_movies_filtered['abstract'] == "").sum())

In [None]:
# Step 1: Prepare the Data
# movie_to_actors keeps the title -> actors view. Edges are derived from the
# ID-keyed mapping instead, so that different films sharing a title are not
# treated as one movie. Each actor is recorded once per movie, which prevents
# (actor, actor) pairs and the self-loops they would create in the graph.
movie_to_actors = defaultdict(list)
movie_id_to_actors = defaultdict(set)

for actor, titles, movie_ids in zip(
    df_actors_filtered['actors'],
    df_actors_filtered['movies'],
    df_actors_filtered['movie_ID'],
):
    # 'movies' and 'movie_ID' are parallel lists for the same actor
    for title, movie_id in zip(titles, movie_ids):
        if actor not in movie_to_actors[title]:
            movie_to_actors[title].append(actor)
        movie_id_to_actors[movie_id].add(actor)

edges = []
for cast in movie_id_to_actors.values():
    # sorted() gives a deterministic pair order for the unordered cast set
    edges.extend(itertools.combinations(sorted(cast), 2))

# Step 2: Create the Graph
G = nx.Graph()

# Add nodes with attributes
for actor, gender, birthplace in zip(
    df_actors_filtered['actors'],
    df_actors_filtered['genders'],
    df_actors_filtered['birthplaces'],
):
    G.add_node(actor, gender=gender, birthplace=birthplace)

# Add edges (an undirected Graph stores each actor pair once)
G.add_edges_from(edges)

# Step 3: Analyze the Graph
degrees = dict(G.degree())
most_connected_actors = sorted(degrees.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Print the string representation of the graph G.
print(G)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Degree statistics of G
degree_sequence = sorted(degree for _, degree in G.degree())  # ascending order
degree_count = nx.degree_histogram(G)  # entry k = number of nodes with degree k

# Bar chart of the degree histogram
fig, ax = plt.subplots()
ax.bar(range(len(degree_count)), degree_count, align='center')
ax.set_xlabel('Degree')
ax.set_ylabel('Count')
ax.set_title('Degree Distribution')
plt.show()


In [None]:
# Same degree histogram, drawn with a logarithmic count axis
fig, ax = plt.subplots()
ax.bar(range(len(degree_count)), degree_count, align='center')
ax.set_yscale('log')
ax.set_xlabel('Degree')
ax.set_ylabel('Count')
ax.set_title('Degree Distribution (log scale)')