# Network Analysis

This notebook supports the analysis part of our project. We assume that you already have a DuckDB instance filled with necessary tables. If not, please visit `setup.ipynb`.

Let's start with basic imports and connecting to our database.

In [1]:
#!pip install -r requirements.txt
#!pip install duckdb

In [2]:
import duckdb
import pandas as pd

# Connect to a persistent DuckDB database file
conn = duckdb.connect("imdb.duckdb")

As mentioned in the setup, we are dealing with 7 tables:
1. `name_basics`
2. `title_akas`
3. `title_basics`
4. `title_crew`
5. `title_episode`
6. `title_principals`
7. `title_ratings`

The following query shows a detailed overview about our schema.

In [3]:
df = conn.execute("""
SELECT table_name, column_name, data_type
FROM information_schema.columns
WHERE table_schema = 'main'
ORDER BY table_name, ordinal_position;
""").df()

display(df)

Unnamed: 0,table_name,column_name,data_type
0,name_basics,nconst,VARCHAR
1,name_basics,primary_name,VARCHAR
2,name_basics,birth_year,INTEGER
3,name_basics,death_year,INTEGER
4,name_basics,primary_profession,VARCHAR[]
5,name_basics,known_for_titles,VARCHAR[]
6,title_akas,title_id,VARCHAR
7,title_akas,CAST(ordering AS INTEGER),INTEGER
8,title_akas,title,VARCHAR
9,title_akas,region,VARCHAR


The following cell lists all titles and actors along with other interesting information (average rating, runtime, country, etc.). This should be useful for analysis.

In [4]:
df = conn.execute("""
    SELECT 
        tb.tconst,
        tb.primary_title AS movie_title,
        tb.start_year,
        tb.runtime_minutes,
        tb.genres,
        tr.average_rating,
        tr.num_votes,
        nb.primary_name AS actor_name,
        nb.birth_year,
        nb.primary_profession,
        tp.category,
        tp.characters,
        ta.region
    FROM title_basics tb
    JOIN title_ratings tr 
        ON tb.tconst = tr.tconst
    JOIN title_principals tp
        ON tb.tconst = tp.tconst
    JOIN name_basics nb
        ON tp.nconst = nb.nconst
    JOIN title_akas ta
        ON tb.tconst = ta.title_id
    WHERE tb.title_type = 'movie'
        AND tb.start_year >= 2010
        AND tb.start_year <= 2024
        AND ta.region = 'US'
        AND tp.category IN ('actor', 'actress')
        AND tb.genres IS NOT NULL AND array_length(genres) > 0
        AND tb.runtime_minutes IS NOT NULL AND tb.runtime_minutes > 15
    ORDER BY tr.average_rating DESC, tr.num_votes DESC
""").df()

display(df.head(10))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,tconst,movie_title,start_year,runtime_minutes,genres,average_rating,num_votes,actor_name,birth_year,primary_profession,category,characters,region
0,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Cookie Pearl Reid,,"[actress, miscellaneous]",actress,"[""Susan""]",US
1,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Brian 'Da Wildcat' Smith,,"[actor, writer, producer]",actor,"[""Grimmy""]",US
2,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,King B.,,"[actor, director, producer]",actor,"[""Royal T.""]",US
3,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Brandall Cole,,"[actor, writer, director]",actor,"[""Robber""]",US
4,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Damn Fool,,"[actor, writer]",actor,"[""Ed""]",US
5,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Brandon Glover,,[actor],actor,"[""Chump""]",US
6,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Oz Man,,[actor],actor,"[""Ant""]",US
7,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Maria Geiger,,[actress],actress,"[""Donna""]",US
8,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Tiphanie Nichole Rae,,[actress],actress,"[""Rachael""]",US
9,tt15461714,King B.'s Hate... Love,2021,85,[Drama],10.0,14,Olivia Gant,,[actress],actress,"[""Eve""]",US


In [5]:
len(df)

733067

In [6]:
df.drop_duplicates(subset=['tconst', 'actor_name', 'category'], inplace=True)
len(df)

613700

## Collaboration graph

In [7]:
!pip install networkx




[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import networkx as nx
from itertools import combinations

G_collabs = nx.Graph()

for tconst, group in df.groupby('tconst'):
    actors = group['actor_name'].tolist()
    for actor1, actor2 in combinations(actors, 2):
        if actor1 == actor2:
            continue
        # Add nodes excplicitly to add name attributes
        if not G_collabs.has_node(actor1):
            G_collabs.add_node(actor1, name=actor1)
        if not G_collabs.has_node(actor2):
            G_collabs.add_node(actor2, name=actor2)
        # Add edge with weight
        if G_collabs.has_edge(actor1, actor2):
            G_collabs[actor1][actor2]['weight'] += 1
        else:
            G_collabs.add_edge(actor1, actor2, weight=1)


In [9]:
# Group by actor_name and count the number of movies
actor_movie_counts = df.groupby('actor_name').size().reset_index(name='movie_count')

# Sort by movie count in descending order
sorted_actors = actor_movie_counts.sort_values('movie_count', ascending=False)

# Get the actor with the most movies
most_movies_actor = sorted_actors.iloc[0]
actor_name_most = most_movies_actor['actor_name']
num_movies_most = most_movies_actor['movie_count']

# Get the actor with the second most movies
second_most_movies_actor = sorted_actors.iloc[1]
actor_name_second = second_most_movies_actor['actor_name']
num_movies_second = second_most_movies_actor['movie_count']

print(f"Actor with the most movies: {actor_name_most} (Number of movies: {num_movies_most})")
print(f"Actor with the second most movies: {actor_name_second} (Number of movies: {num_movies_second})")

Actor with the most movies: Eric Roberts (Number of movies: 272)
Actor with the second most movies: Danny Trejo (Number of movies: 106)


In [10]:
import networkx as nx

# Calculate total collaborations per actor (sum of edge weights)
collaboration_counts = {
    node: sum(data['weight'] for _, _, data in G_collabs.edges(node, data=True))
    for node in G_collabs.nodes
}

sorted_actors = sorted(collaboration_counts.items(), key=lambda x: x[1], reverse=True)
most_connected_actor, total_collaborations = sorted_actors[0]
second_connected_actor, second_total_collaborations = sorted_actors[1]

# Find how many unique actors they collaborated with (degree)
first_unique_collaborators = len(list(G_collabs.neighbors(most_connected_actor)))
second_unique_collaborators = len(list(G_collabs.neighbors(second_connected_actor)))

print(f"Most connected actor: {most_connected_actor}")
print(f"Total collaborations: {total_collaborations}")
print(f"Unique actors collaborated with: {first_unique_collaborators}")

print(f"Second most connected actor: {second_connected_actor}")
print(f"Total collaborations: {second_total_collaborations}")
print(f"Unique actors collaborated with: {second_unique_collaborators}")


Most connected actor: Eric Roberts
Total collaborations: 2394
Unique actors collaborated with: 2131
Second most connected actor: Danny Trejo
Total collaborations: 943
Unique actors collaborated with: 835


In [11]:
# Sort actors by the number of collabs (degree) in descending order
sorted_actors = sorted(G_collabs.degree, key=lambda x: x[1], reverse=True)

# Actor with the most collabs
most_collabs_actor = sorted_actors[0]
# Get the actor with the second most collabs
second_most_collabs_actor = sorted_actors[1]

# Extract the actor's name and number of collabs
actor_name_most_collabs = G_collabs.nodes[most_collabs_actor[0]]['name']
num_collabs_most = most_collabs_actor[1]

# Extract the actor's name and number of collabs
actor_name_2most_collabs = G_collabs.nodes[second_most_collabs_actor[0]]['name']
num_collabs_second = second_most_collabs_actor[1]

print(f"Actor with the most collabs: {actor_name_most_collabs} (Number of collabs: {num_collabs_most})")
print(f"Actor with the second most collabs: {actor_name_2most_collabs} (Number of collabs: {num_collabs_second})")

Actor with the most collabs: Eric Roberts (Number of collabs: 2131)
Actor with the second most collabs: Danny Trejo (Number of collabs: 835)


### PageRank as a measure of influence
why: An actor who collaborates with many well-connected actors gets a higher score, 
meaning that the number of connections is not as important as being connected to well-connected actors.
It handles weighted graphs well.

pagerank - who has reach
eigenvector- who's elite

In [13]:
!pip install scipy

^C


In [12]:
pagerank = nx.pagerank(G_collabs, weight='weight')
nx.set_node_attributes(G_collabs, pagerank, name='pagerank')

eigen_centrality = nx.eigenvector_centrality(G_collabs, max_iter=1000, weight='weight')
nx.set_node_attributes(G_collabs, eigen_centrality, name='eigen_centrality')

ModuleNotFoundError: No module named 'scipy'

In [None]:
!pip install sklearn

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

centrality_df = pd.DataFrame.from_dict(pagerank, orient='index', columns=['pagerank'])
centrality_df['eigenvector'] = pd.Series(eigen_centrality)
# Scale pagerank to range [0, 0.5]
pagerank_scaler = MinMaxScaler(feature_range=(0, 0.5))
centrality_df['pagerank_scaled'] = pagerank_scaler.fit_transform(centrality_df[['pagerank']])
# Scale eigenvector centrality to range [0, 0.5]
eigenvector_scaler = MinMaxScaler(feature_range=(0, 0.5))
centrality_df['eigenvector_scaled'] = eigenvector_scaler.fit_transform(centrality_df[['eigenvector']])
# Combine scaled pagerank and eigenvector centrality
centrality_df['page_plus_eigen_scaled'] = centrality_df['pagerank_scaled'] + centrality_df['eigenvector_scaled']

degree_dict = dict(G_collabs.degree())  # Unweighted degree — just the number of neighbors
centrality_df['degree'] = pd.Series(degree_dict)

scaler = StandardScaler()
# KMEANS clustering on pagerank
X = scaler.fit_transform(centrality_df[['pagerank']])
kmeans = KMeans(n_clusters=4, random_state=42)
centrality_df['tier_page'] = kmeans.fit_predict(X)

# KMEANS clustering on eigenvector centrality
scaler = StandardScaler()
X = scaler.fit_transform(centrality_df[['eigenvector']])
kmeans = KMeans(n_clusters=4, random_state=42)
centrality_df['tier_eigen'] = kmeans.fit_predict(X)

#KMEANS clustering on pagerank + eigenvector centrality
scaler = StandardScaler()
X = scaler.fit_transform(centrality_df[['page_plus_eigen_scaled']])
kmeans = KMeans(n_clusters=4, random_state=42)
centrality_df['tier_combined'] = kmeans.fit_predict(X)

# Map sorted cluster index to labels to have meaningful names
labels = ['A-list', 'B-list', 'C-list', 'D-list']
tier_means = centrality_df.groupby('tier_page')['pagerank'].mean().sort_values(ascending=False)
tier_to_label = {tier: label for tier, label in zip(tier_means.index, labels)}

centrality_df['tier_label_pr'] = centrality_df['tier_page'].map(tier_to_label)

labels = ['A-list', 'B-list', 'C-list', 'D-list']
tier_means = centrality_df.groupby('tier_eigen')['eigenvector'].mean().sort_values(ascending=False)
tier_to_label = {tier: label for tier, label in zip(tier_means.index, labels)}

centrality_df['tier_label_ev'] = centrality_df['tier_eigen'].map(tier_to_label)

labels = ['A-list', 'B-list', 'C-list', 'D-list']
tier_means = centrality_df.groupby('tier_combined')['page_plus_eigen_scaled'].mean().sort_values(ascending=False)
tier_to_label = {tier: label for tier, label in zip(tier_means.index, labels)}

centrality_df['tier_label_comb'] = centrality_df['tier_combined'].map(tier_to_label)
centrality_df['name'] = centrality_df.index.map(lambda node: G_collabs.nodes[node]['name'])

# Add tier labels to nodes in the graph
for node, data in centrality_df.iterrows():
    if node in G_collabs.nodes:
        G_collabs.nodes[node]['tier_label_pr'] = data['tier_label_pr']  # Pagerank-based tier
        G_collabs.nodes[node]['tier_label_ev'] = data['tier_label_ev']  # Eigenvector-based tier
        G_collabs.nodes[node]['tier_label_comb'] = data['tier_label_comb']  # Combined tier

In [None]:
G_collabs.nodes.get('Eric Roberts')

In [None]:
!pip install seaborn

In [None]:
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt

# Initialize a dictionary to store connections between tiers
tier_connections = defaultdict(lambda: defaultdict(int))

# Iterate through all edges in the graph
for node1, node2 in G_collabs.edges():
    # Get the tiers of the two connected nodes
    tier1 = centrality_df.loc[node1, 'tier_label_comb'] if node1 in centrality_df.index else 'Unknown'
    tier2 = centrality_df.loc[node2, 'tier_label_comb'] if node2 in centrality_df.index else 'Unknown'
    
    # Increment the connection count for the tier pair
    tier_connections[tier1][tier2] += 1
    if tier1 != tier2:
        tier_connections[tier2][tier1] += 1  # Count the reverse connection as well

# Calculate tier sizes
tier_sizes = centrality_df['tier_label_comb'].value_counts().to_dict()
tier_percentages = defaultdict(lambda: defaultdict(float))

# Calculate percentages and display results
for tier1 in sorted(tier_connections.keys()):
    total_connections = sum(tier_connections[tier1].values())
    print(f"Tier {tier1} connections:")
    for tier2, count in tier_connections[tier1].items():
        percentage_of_tier1 = (count / total_connections) * 100 if total_connections > 0 else 0
        tier_percentages[tier1][tier2] = percentage_of_tier1
        print(f"  To Tier {tier2}: {count} connections ({percentage_of_tier1:.2f}%)")
    print()

# Convert to a DataFrame for visualization
tier_percentages_df = pd.DataFrame(tier_percentages).fillna(0)
tier_order = ['A-list', 'B-list', 'C-list', 'D-list']
tier_percentages_df = tier_percentages_df.reindex(index=tier_order, columns=tier_order, fill_value=0)#.T
print(tier_percentages_df)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(tier_percentages_df, annot=True, fmt=".2f", cmap="Blues", cbar_kws={'label': 'Percentage of Connections (%)'})
plt.title("Percentage of Connections Between Tiers")
plt.xlabel("Source Tier")
plt.ylabel("Target Tier")
plt.tight_layout()
plt.show()

In [None]:

for tier in sorted(centrality_df['tier_label_pr'].unique()):
    sub = centrality_df[centrality_df['tier_label_pr'] == tier]
    print(f"Tier based on PageRank {tier}:")
    print("  Avg PageRank:", round(sub['pagerank'].mean(), 7))
    print("  Avg Eigenvector:", round(sub['eigenvector'].mean(), 7))
    print("  Avg Page + Eigen:", round(sub['page_plus_eigen_scaled'].mean(), 4))
    print("  Avg Nr of Degrees:", round(sub['degree'].mean(), 2))
    print("  Median Nr of Degrees:", round(sub['degree'].median(), 2))
    print("  Nr of actors/actresses:", len(sub))
    print("  Sample actors:", [a for a in sub['name'].head(5)])

print('-------------------------------------')
for tier in sorted(centrality_df['tier_label_ev'].unique()):
    sub = centrality_df[centrality_df['tier_label_ev'] == tier]
    print(f"Tier based on Eigenvector {tier}:")
    print("  Avg PageRank:", round(sub['pagerank'].mean(), 7))
    print("  Avg Eigenvector:", round(sub['eigenvector'].mean(), 7))
    print("  Avg Page + Eigen:", round(sub['page_plus_eigen_scaled'].mean(), 4))
    print("  Avg Nr of Degrees:", round(sub['degree'].mean(), 2))
    print("  Median Nr of Degrees:", round(sub['degree'].median(), 2))
    print("  Nr of actors/actresses:", len(sub))
    print("  Sample actors:", [a for a in sub['name'].head(5)])

print('-------------------------------------')
for tier in sorted(centrality_df['tier_label_comb'].unique()):
    sub = centrality_df[centrality_df['tier_label_comb'] == tier]
    print(f"Tier based PageRank & Eigenvector {tier}:")
    print("  Avg PageRank:", round(sub['pagerank'].mean(), 7))
    print("  Avg Eigenvector:", round(sub['eigenvector'].mean(), 7))
    print("  Avg Page + Eigen:", round(sub['page_plus_eigen_scaled'].mean(), 4))
    print("  Median Nr of Degrees:", round(sub['degree'].median(), 2))
    print("  Avg Nr of Degrees:", round(sub['degree'].mean(), 2))
    print("  Nr of actors/actresses:", len(sub))
    print("  Sample actors:", [a for a in sub['name'].head(5)])

In [None]:
print(centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'A-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(3).index)

print(centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'B-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(3).index)

print(centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'C-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(3).index)

print(centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'D-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(3).index)

In [None]:
import matplotlib.pyplot as plt

eric_roberts_node = centrality_df[centrality_df['name'] == 'Eric Roberts'].index[0]
# Get the ego graph for Eric Roberts with immediate neighbors
ego_graph = nx.ego_graph(G_collabs, eric_roberts_node, radius=1, center=True, undirected=True)

# Filter edges to include only those connected to Eric Roberts
filtered_edges = [(eric_roberts_node, neighbor) for neighbor in ego_graph.neighbors(eric_roberts_node)]
filtered_weights = [ego_graph[eric_roberts_node][neighbor]['weight'] for neighbor in ego_graph.neighbors(eric_roberts_node)]


plt.figure(figsize=(60, 50))
pos = nx.spring_layout(ego_graph, k=0.5, seed=42, weight='weight')
# Define a color map for tiers
tier_colors = {
    'A-list': 'gold',
    'B-list': 'silver',
    'C-list': 'lightblue',
    'D-list': 'gray'
}
# Get the tier for each node
node_colors = [
    tier_colors[centrality_df.loc[node, 'tier_label_comb']]
    if node in centrality_df.index else 'gray'
    for node in ego_graph.nodes()
]
options = {
    "node_color": node_colors,
    "edge_color": filtered_weights,
    "edgelist": filtered_edges,
    "edge_cmap": plt.cm.Reds,
    "with_labels": True,
    "labels": {n: G_collabs.nodes[n]['name'] for n in ego_graph.nodes()},
    "node_size": 1000,
    "font_size": 10
}
print(f"Number of nodes in the graph: {ego_graph.number_of_nodes()}")
nx.draw(ego_graph, pos, **options)
plt.title("Eric Roberts' Closest Neighbors (Tier Colors)")
plt.show()


In [None]:
actor_name = 'Bill Barretta'
actor_info = centrality_df[centrality_df['name'] == actor_name]
actor_info[['pagerank', 'eigenvector', 'degree', 'tier_label_pr', 'tier_label_ev', 'tier_label_comb']]

In [None]:
def get_movies_by_actor(actor_name, df):
    # Filter rows where the actor appears
    actor_movies = df[df['actor_name'] == actor_name]
    # Select relevant columns (e.g., movie title, average rating, year)
    return actor_movies[['movie_title', 'average_rating', 'start_year']].drop_duplicates()

# Example usage
actor_name = "Eric Roberts"
movies = get_movies_by_actor(actor_name, df)
print(f"Movies of {actor_name}:")
movies

In [None]:
def get_movies_by_actor_pair(actor1, actor2, df):
    # Filter rows where both actors appear in the same movie
    movies_together = df[df['actor_name'].isin([actor1, actor2])]
    movies_together = movies_together.groupby('tconst').filter(lambda x: len(x['actor_name'].unique()) > 1)
    return movies_together[['tconst', 'movie_title', 'average_rating']].drop_duplicates()

# Example usage
actor1 = "Eric Roberts"
actor2 = "John Schneider"
# Get movies where both actors appeared together
movies = get_movies_by_actor_pair(actor1, actor2, df)
print(movies)

In [None]:
edge_rows = []

for i in range(len(filtered_edges)):
    u, v = filtered_edges[i]
    value = filtered_weights[i]
    edge_rows.append(f"{u};{v};{value}")

with open("edges.csv", "w", encoding="utf-8") as f:
    f.write("source;target;value\n")
    f.write("\n".join(edge_rows))

node_rows = []

for node, data in ego_graph.nodes(data=True):
    category = data.get('tier_label_comb')
    name = data.get('name')
    node_rows.append(f"{node};{name};{category}")

with open("nodes.csv", "w", encoding="utf-8") as f:
    f.write("id;name;category\n")
    f.write("\n".join(node_rows))

In [None]:
from IPython.display import Image
Image(filename='eric roberts ego graph.png')

## Star power graph

In [None]:
import networkx as nx
from itertools import combinations

G_starPower = nx.Graph()

for tconst, group in df.groupby('tconst'):
    actors = group['actor_name'].tolist()
    avg_rating = group['average_rating'].iloc[0]
    num_votes = group['num_votes'].iloc[0] 
    for actor1, actor2 in combinations(actors, 2):
        if actor1 == actor2:
            continue
        # Add nodes excplicitly to add name attributes
        if not G_starPower.has_node(actor1):
            G_starPower.add_node(actor1, name=actor1)
        if not G_starPower.has_node(actor2):
            G_starPower.add_node(actor2, name=actor2)
        # Add edge with weight
        if G_starPower.has_edge(actor1, actor2):
            # Update total weighted rating and movie count
            G_starPower[actor1][actor2]['movie_count'] += 1
            # Update the weighted average rating
            G_starPower[actor1][actor2]['weight'] += avg_rating * num_votes
        else:
             # Initialize edge attributes
            G_starPower.add_edge(
                actor1, actor2,
                movie_count=1,
                weight=avg_rating * num_votes 
            )

In [None]:
pagerank = nx.pagerank(G_starPower, weight='weight')
nx.set_node_attributes(G_starPower, pagerank, name='pagerank')

eigen_centrality = nx.eigenvector_centrality(G_starPower, max_iter=1000, weight='weight')
nx.set_node_attributes(G_starPower, eigen_centrality, name='eigen_centrality')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

centrality_df = pd.DataFrame.from_dict(pagerank, orient='index', columns=['pagerank'])
centrality_df['eigenvector'] = pd.Series(eigen_centrality)
# Scale pagerank to range [0, 0.5]
pagerank_scaler = MinMaxScaler(feature_range=(0, 0.5))
centrality_df['pagerank_scaled'] = pagerank_scaler.fit_transform(centrality_df[['pagerank']])
# Scale eigenvector centrality to range [0, 0.5]
eigenvector_scaler = MinMaxScaler(feature_range=(0, 0.5))
centrality_df['eigenvector_scaled'] = eigenvector_scaler.fit_transform(centrality_df[['eigenvector']])
# Combine scaled pagerank and eigenvector centrality
centrality_df['page_plus_eigen_scaled'] = centrality_df['pagerank_scaled'] + centrality_df['eigenvector_scaled']

degree_dict = dict(G_starPower.degree())  # Unweighted degree — just the number of neighbors
centrality_df['degree'] = pd.Series(degree_dict)

scaler = StandardScaler()
# KMEANS clustering on pagerank
X = scaler.fit_transform(centrality_df[['pagerank']])
kmeans = KMeans(n_clusters=4, random_state=42)
centrality_df['tier_page'] = kmeans.fit_predict(X)

# KMEANS clustering on eigenvector centrality
scaler = StandardScaler()
X = scaler.fit_transform(centrality_df[['eigenvector']])
kmeans = KMeans(n_clusters=4, random_state=42)
centrality_df['tier_eigen'] = kmeans.fit_predict(X)

#KMEANS clustering on pagerank + eigenvector centrality
scaler = StandardScaler()
X = scaler.fit_transform(centrality_df[['page_plus_eigen_scaled']])
kmeans = KMeans(n_clusters=4, random_state=42)
centrality_df['tier_combined'] = kmeans.fit_predict(X)

# Map sorted cluster index to labels to have meaningful names
labels = ['A-list', 'B-list', 'C-list', 'D-list']
tier_means = centrality_df.groupby('tier_page')['pagerank'].mean().sort_values(ascending=False)
tier_to_label = {tier: label for tier, label in zip(tier_means.index, labels)}

centrality_df['tier_label_pr'] = centrality_df['tier_page'].map(tier_to_label)
nx.set_node_attributes(G_starPower, centrality_df['tier_label_pr'].to_dict(), name='tier_label_pr')

labels = ['A-list', 'B-list', 'C-list', 'D-list']
tier_means = centrality_df.groupby('tier_eigen')['eigenvector'].mean().sort_values(ascending=False)
tier_to_label = {tier: label for tier, label in zip(tier_means.index, labels)}

centrality_df['tier_label_ev'] = centrality_df['tier_eigen'].map(tier_to_label)
nx.set_node_attributes(G_starPower, centrality_df['tier_label_ev'].to_dict(), name='tier_label_ev')

labels = ['A-list', 'B-list', 'C-list', 'D-list']
tier_means = centrality_df.groupby('tier_combined')['page_plus_eigen_scaled'].mean().sort_values(ascending=False)
tier_to_label = {tier: label for tier, label in zip(tier_means.index, labels)}

centrality_df['tier_label_comb'] = centrality_df['tier_combined'].map(tier_to_label)
nx.set_node_attributes(G_starPower, centrality_df['tier_label_comb'].to_dict(), name='tier_label_comb')

In [None]:
# Merge the centrality_df with the original df to include average_rating
df_with_tiers = df.copy().merge(centrality_df[['tier_label_comb']], left_on='actor_name', right_index=True, how='inner')

# Group by tier and calculate the average rating
tier_avg_rating = df_with_tiers.groupby('tier_label_comb')['average_rating'].mean()

# Print the average rating for each tier
print("Average rating by tier:")
print(tier_avg_rating)

In [None]:
centrality_df['name'] = centrality_df.index.map(lambda node: G_starPower.nodes[node]['name'])

for tier in sorted(centrality_df['tier_label_pr'].unique()):
    sub = centrality_df[centrality_df['tier_label_pr'] == tier]
    print(f"Tier based on PageRank {tier}:")
    print("  Avg PageRank:", round(sub['pagerank'].mean(), 7))
    print("  Avg Eigenvector:", round(sub['eigenvector'].mean(), 7))
    print("  Avg Page + Eigen:", round(sub['page_plus_eigen_scaled'].mean(), 4))
    print("  Avg Nr of Degrees:", round(sub['degree'].mean(), 2))
    print("  Median Nr of Degrees:", round(sub['degree'].median(), 2))
    print("  Nr of actors/actresses:", len(sub))
    print("  Sample actors:", [a for a in sub['name'].head(5)])

print('-------------------------------------')
for tier in sorted(centrality_df['tier_label_ev'].unique()):
    sub = centrality_df[centrality_df['tier_label_ev'] == tier]
    print(f"Tier based on Eigenvector {tier}:")
    print("  Avg PageRank:", round(sub['pagerank'].mean(), 7))
    print("  Avg Eigenvector:", round(sub['eigenvector'].mean(), 7))
    print("  Avg Page + Eigen:", round(sub['page_plus_eigen_scaled'].mean(), 4))
    print("  Avg Nr of Degrees:", round(sub['degree'].mean(), 2))
    print("  Median Nr of Degrees:", round(sub['degree'].median(), 2))
    print("  Nr of actors/actresses:", len(sub))
    print("  Sample actors:", [a for a in sub['name'].head(5)])

print('-------------------------------------')
for tier in sorted(centrality_df['tier_label_comb'].unique()):
    sub = centrality_df[centrality_df['tier_label_comb'] == tier]
    print(f"Tier based PageRank & Eigenvector {tier}:")
    print("  Avg PageRank:", round(sub['pagerank'].mean(), 7))
    print("  Avg Eigenvector:", round(sub['eigenvector'].mean(), 7))
    print("  Avg Page + Eigen:", round(sub['page_plus_eigen_scaled'].mean(), 4))
    print("  Median Nr of Degrees:", round(sub['degree'].median(), 2))
    print("  Avg Nr of Degrees:", round(sub['degree'].mean(), 2))
    print("  Nr of actors/actresses:", len(sub))
    print("  Sample actors:", [a for a in sub['name'].head(5)])

In [None]:
print(centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'A-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(3).index)

print(centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'B-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(3).index)

print(centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'C-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(3).index)

print(centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'D-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(3).index)

In [None]:
import matplotlib.pyplot as plt

top_a_list_actors_comb = centrality_df.loc[
    (centrality_df['tier_label_comb'] == 'A-list')
].sort_values('page_plus_eigen_scaled', ascending=False).head(5).index
print(f"Top A-list actor: {centrality_df.loc[top_a_list_actors_comb[0]]}")
# Expand the subgraph to include all neighbors of the top A-list actors
expanded_nodes = set()
filtered_edges = []
filtered_weights = []

for actor in top_a_list_actors_comb:
    ego_graph = nx.ego_graph(G_starPower, actor, radius=1, center=True, undirected=True)
    expanded_nodes.update(ego_graph.nodes)
    # Filter edges to include only those connected to the top A-list actors
    for neighbor in ego_graph.neighbors(actor):
        filtered_edges.append((actor, neighbor))
        filtered_weights.append(ego_graph[actor][neighbor]['weight'])

# Create a subgraph with the expanded nodes
H = G_starPower.copy().subgraph(expanded_nodes)


plt.figure(figsize=(25, 20))
pos = nx.random_layout(H, seed=42)
# Define a color map for tiers
tier_colors = {
    'A-list': 'gold',
    'B-list': 'silver',
    'C-list': 'lightblue',
    'D-list': 'gray'
}
# Get the tier for each node
node_colors = [
    tier_colors[centrality_df.loc[node, 'tier_label_comb']]
    if node in centrality_df.index else 'gray'
    for node in H.nodes()
]
options = {
    "node_color": node_colors,
    "edge_color": filtered_weights,
    "edgelist": filtered_edges,
    "edge_cmap": plt.cm.Reds,
    "with_labels": True,
    "labels": {n: G_starPower.nodes[n]['name'] for n in H.nodes()},
    "font_size": 10
}
nx.draw(H, pos, **options)
plt.title("A-list Actors' Closest Neighbors (Tier Colors)")
plt.show()


In [None]:
# Saving edges and nodes to CSV files for visualization
edge_rows = []

for i in range(len(filtered_edges)):
    u, v = filtered_edges[i]
    value = filtered_weights[i] 
    edge_rows.append(f"{u};{v};{value}")

with open("edges.csv", "w", encoding="utf-8") as f:
    f.write("source;target;value\n")
    f.write("\n".join(edge_rows))

node_rows = []

for node, data in H.nodes(data=True):
    category = data.get('tier_label_comb')
    name = data.get('name')
    node_rows.append(f"{node};{name};{category}")

with open("nodes.csv", "w", encoding="utf-8") as f:
    f.write("id;name;category\n")
    f.write("\n".join(node_rows))

In [None]:
from IPython.display import Image
Image(filename='star power top5 graph.PNG')

In [None]:
movies = get_movies_by_actor_pair("Dee Wallace", "Robert Miano", df)
print(movies)

In [None]:
# Group by actor_name and calculate the average rating
actor_avg_ratings = df.groupby('actor_name')['average_rating'].mean()

# Find the actor with the maximum average rating
max_rating_actor = actor_avg_ratings.idxmax()  # Actor with the highest average rating
max_rating = actor_avg_ratings.max()  # Maximum average rating

print(f"Actor with the highest average rating: {max_rating_actor} ({max_rating:.2f}), nr of movies: {len(df[df['actor_name'] == max_rating_actor])}")
print(f"Actor's tier based on PageRank and Eigenvector: {centrality_df.loc[max_rating_actor, 'tier_label_comb']}")

In [None]:
def get_actor_average_rating(actor_name, df):
    # Filter the DataFrame for movies where the actor appears
    actor_movies = df[df['actor_name'] == actor_name]
    
    # Calculate the average rating
    avg_rating = actor_movies['average_rating'].mean()
    
    # Count the number of movies
    num_movies = actor_movies['tconst'].nunique() 
    
    return avg_rating, num_movies

actor_name = "Scarlett Johansson"
avg_rating, num_movies = get_actor_average_rating(actor_name, df)

print(f"Average rating of movies for {actor_name}: {avg_rating:.2f}")
print(f"Number of movies for {actor_name}: {num_movies}")

## Bipartite graph between actors and genres

In [None]:
import networkx as nx

# Create a bipartite graph
B = nx.Graph()

# Iterate through the DataFrame to add nodes and edges
for _, row in df.iterrows():
    actor = row['actor_name']
    genres = row['genres']
    movie_title = row['movie_title']
    avg_rating = row['average_rating']
    num_votes = row['num_votes']
    
    # Add actor node (part of set 0)
    if not B.has_node(actor):
        B.add_node(actor, bipartite=0, type='actor')  # Add type attribute for clarity
    
    # Add genre nodes (part of set 1) and edges
    for genre in genres:
        if not B.has_node(genre):
            B.add_node(genre, bipartite=1, type='genre')  # Add type attribute for clarity
        
        # Add edge between actor and genre with movie attributes
        if not B.has_edge(actor, genre):
            B.add_edge(actor, genre, movies=[movie_title], total_rating=avg_rating, total_votes=num_votes, weight=avg_rating * num_votes)
        else:
            # Update edge attributes if the edge already exists
            B[actor][genre]['movies'].append(movie_title)
            B[actor][genre]['total_rating'] += avg_rating
            B[actor][genre]['total_votes'] += num_votes
            B[actor][genre]['weight'] = B[actor][genre]['total_rating'] * B[actor][genre]['total_votes'] * len(B[actor][genre]['movies'])

print(f"Number of nodes: {B.number_of_nodes()}")
print(f"Number of edges: {B.number_of_edges()}")

In [None]:
actor_movie_counts = df.groupby('actor_name')['tconst'].nunique()  # Count unique movies per actor
actors_with_multiple_movies = actor_movie_counts[actor_movie_counts > 5].index  # Get actors with more than 5 movie

# Filter the bipartite graph to include only these actors
filtered_B = B.copy().subgraph([node for node in B.nodes if B.nodes[node]['type'] == 'genre' or node in actors_with_multiple_movies])

# Filter actor nodes (bipartite=0) and calculate their degree (connections to genres)
actor_genre_degrees = {
    node: len([neighbor for neighbor in filtered_B.neighbors(node) if filtered_B.nodes[neighbor]['type'] == 'genre'])
    for node, data in filtered_B.nodes(data=True) if data['type'] == 'actor'
}

# Sort actors by the number of genre connections
sorted_actor_genre_degrees = sorted(actor_genre_degrees.items(), key=lambda x: x[1], reverse=True)

# Get the top 5 actors with the most genre connections
top_5_most_genres = sorted_actor_genre_degrees[:5]

# Get the top 5 actors with the least genre connections
top_5_least_genres = sorted_actor_genre_degrees[-5:]

print("Top 5 actors with the most genre connections:")
for actor, count in top_5_most_genres:
    print(f"{actor}: {count} genres")

print("\nTop 5 actors with the least genre connections:")
for actor, count in top_5_least_genres:
    print(f"{actor}: {count} genres")

In [None]:
#df.drop(columns=['weighted_avg_rating'], inplace=True)
df['weighted_avg_rating'] = df['average_rating'] * df['num_votes']  # Weighted rating
# Calculate the average weighted rating for each actor
actor_avg_ratings = df.groupby('actor_name')[['weighted_avg_rating', 'average_rating']].mean()
# Sort actors by their average rating in descending order
sorted_actor_avg_ratings = actor_avg_ratings.sort_values(by='weighted_avg_rating', ascending=False)['average_rating']
# Get the top 5 highest-rated actors
top_5_highest_rated_actors = sorted_actor_avg_ratings.head(5)

print("Top 5 highest-rated actors across all genres:")
for actor, avg_rating in top_5_highest_rated_actors.items():
    print(f"{actor}: {avg_rating:.2f}")

print("\nTop 5 lowest-rated actors across all genres:")
for actor, avg_rating in sorted_actor_avg_ratings[-5:].items():
    print(f"{actor}: {avg_rating:.2f}")

In [None]:
def get_actor_best_and_worst_genres(actor_name, df):
    # Filter the DataFrame for movies where the actor appears
    actor_movies = df[df['actor_name'] == actor_name]
    
    # Dictionary to store average ratings for each genre
    genre_ratings = defaultdict(list)
    
    # Iterate through the actor's movies and collect ratings for each genre
    for _, row in actor_movies.iterrows():
        genres = row['genres']
        for genre in genres:
            genre_ratings[genre].append(row['average_rating'])
    
    # Calculate the average rating for each genre
    avg_genre_ratings = {genre: sum(ratings) / len(ratings) for genre, ratings in genre_ratings.items()}
    
    # Find the best and worst genres
    best_genre = max(avg_genre_ratings, key=avg_genre_ratings.get)
    worst_genre = min(avg_genre_ratings, key=avg_genre_ratings.get)
    
    return best_genre, avg_genre_ratings[best_genre], worst_genre, avg_genre_ratings[worst_genre]

actor_name = "Scarlett Johansson"
best_genre, best_rating, worst_genre, worst_rating = get_actor_best_and_worst_genres(actor_name, df)

print(f"Best genre for {actor_name}: {best_genre} ({best_rating:.2f})")
print(f"Worst genre for {actor_name}: {worst_genre} ({worst_rating:.2f})")

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

actor_movie_counts = df.groupby('actor_name')['tconst'].nunique()  # Count unique movies per actor

filtered_B = B.copy().subgraph([node for node in B.nodes if B.nodes[node]['type'] == 'genre' or node in actor_movie_counts])
actor_genre_degrees = {
    node: len([neighbor for neighbor in filtered_B.neighbors(node) if filtered_B.nodes[neighbor]['type'] == 'genre'])
    for node, data in filtered_B.nodes(data=True) if data['type'] == 'actor'
}
# Count the frequency of genre connections
genre_connection_counts = Counter(actor_genre_degrees.values())

# Sort the counts by the number of genre connections
sorted_genre_counts = sorted(genre_connection_counts.items(), key=lambda x: x[1], reverse=True)[:10]

# Separate the data for plotting
genre_counts, actor_frequencies = zip(*sorted_genre_counts)
# Calculate percentages
total_actors = sum(actor_frequencies)
actor_percentages = [(freq / total_actors) * 100 for freq in actor_frequencies]

# Plot the bar chart
plt.figure(figsize=(9, 6))
bars = plt.bar(genre_counts, actor_percentages, color='skyblue', edgecolor='black', alpha=0.7)
# Add percentage labels on top of each bar
for bar, percentage in zip(bars, actor_percentages):
    plt.text(
        bar.get_x() + bar.get_width() / 2,  # X-coordinate (center of the bar)
        bar.get_height() + 0.5,            # Y-coordinate (slightly above the bar)
        f"{percentage:.2f}%",             # Text to display
        ha='center', va='bottom', fontsize=14  # Center alignment
    )

plt.title("Distribution of Top 10 Genre Connections Among Actors", fontsize=18)
plt.xlabel("Number of Genre Connections", fontsize=14)
plt.ylabel("Percentage of Actors (%)", fontsize=14)
plt.xticks(range(min(genre_counts), 11), fontsize=14)
plt.yticks(fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
def get_actor_genre_connections(actor_name, graph):
    # Check if the actor exists in the graph
    if actor_name not in graph:
        print(f"Actor '{actor_name}' not found in the graph.")
        return []
    
    # Get all neighbors of the actor and filter for genres
    genre_connections = [neighbor for neighbor in graph.neighbors(actor_name) if graph.nodes[neighbor]['type'] == 'genre']
    
    return genre_connections

actor_name = "Eric Roberts"
genre_connections = get_actor_genre_connections(actor_name, B)

print(f"Genres connected to {actor_name}: {genre_connections}")

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Count the occurrences of each genre
genre_counts = Counter()
for genres in df['genres']:
    genre_counts.update(genres)

# Sort genres by their counts
sorted_genre_counts = sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)[:10]  # Get top 10 genres

# Separate data for plotting
genres, counts = zip(*sorted_genre_counts)
# Calculate percentages
total_genres = sum(counts)
genre_percentages = [(freq / total_genres) * 100 for freq in counts]

# Plot the bar chart
plt.figure(figsize=(8, 6))
bars = plt.bar(genres, genre_percentages, color='skyblue', edgecolor='black', alpha=0.7)
for bar, percentage in zip(bars, genre_percentages):
    plt.text(
        bar.get_x() + bar.get_width() / 2,  # X-coordinate (center of the bar)
        bar.get_height() + 0.5,            # Y-coordinate (slightly above the bar)
        f"{percentage:.2f}%",             # Text to display
        ha='center', va='bottom', fontsize=12  # Center alignment
    )
plt.title("Popularity of Genres Top 10", fontsize=18)
plt.xlabel("Genres", fontsize=14)
plt.ylabel("Percentage of Appearances (%)", fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)  # Rotate x-axis labels for better readability
plt.yticks(fontsize=12) 
plt.tight_layout()
plt.show()

In [None]:
genre_nodes = [n for n, d in B.nodes(data=True) if d.get('type') == 'genre']

# For each genre, collect connected actors and their edge weights
top_actors_per_genre = {}

for genre in genre_nodes:
    neighbors = B.neighbors(genre)
    actor_movie_counts = []

    for actor in neighbors:
        edge_data = B.get_edge_data(genre, actor)
        movies = edge_data.get('movies', [])  # Ensure it's a list
        movie_count = len(movies)
        actor_movie_counts.append((actor, movie_count, movies))

    # Sort actors by movie count and get top 5
    top_5 = sorted(actor_movie_counts, key=lambda x: x[1], reverse=True)[:5]
    top_actors_per_genre[genre] = top_5

for genre, actor_info in top_actors_per_genre.items():
    print(f"\n Top 5 actors for genre '{genre}':")
    for actor, count, movies in actor_info:
        print(f"  {actor} — {count} movies")

In [None]:
def find_niche_actors_with_graph(graph):
    # List to store niche actors
    niche_actors = []

    # Filter actor nodes
    actor_nodes = [node for node, data in graph.nodes(data=True) if data['type'] == 'actor']

    for actor in actor_nodes:
        # Get all genres connected to the actor
        genre_edges = [(genre, graph[actor][genre]) for genre in graph.neighbors(actor) if graph.nodes[genre]['type'] == 'genre']

        # Calculate the total number of movies for the actor
        total_movies = sum(len(edge_data['movies']) for _, edge_data in genre_edges)
        if total_movies <= 10: # to filter out actors with too few movies
            continue
        # Count the number of movies for each genre
        genre_counts = {genre: len(edge_data['movies']) for genre, edge_data in genre_edges}

        # Find the most frequent genre and its count
        most_frequent_genre = max(genre_counts, key=genre_counts.get)
        most_frequent_genre_count = genre_counts[most_frequent_genre]

        # Check if the most frequent genre accounts for at least 75% of the movies
        if most_frequent_genre_count > total_movies * 0.75:
            niche_actors.append({
                'actor': actor,
                'total_movies': total_movies,
                'most_frequent_genre': most_frequent_genre,
                'genre_count': most_frequent_genre_count,
                'ratio': most_frequent_genre_count / total_movies
            })

    return sorted(niche_actors, key=lambda x: x['ratio'], reverse=True)

niche_actors = find_niche_actors_with_graph(B.copy())

print("Niche actors:")
for actor_info in niche_actors:
    print(f"Actor: {actor_info['actor']}, Total Movies: {actor_info['total_movies']}, "
          f"Most Frequent Genre: {actor_info['most_frequent_genre']} ({actor_info['genre_count']} movies) {actor_info['ratio']:.2%} of all movies)")

### Clustering on bipartite graph

In [None]:
from sklearn.cluster import SpectralCoclustering
import networkx as nx
import numpy as np

# Filter actor and genre nodes
actor_nodes = [node for node, data in B.nodes(data=True) if data['type'] == 'actor']
genre_nodes = [node for node, data in B.nodes(data=True) if data['type'] == 'genre']

# Create the bipartite adjacency matrix
adj_matrix = nx.bipartite.biadjacency_matrix(B, row_order=actor_nodes, column_order=genre_nodes)

# Apply Spectral Co-clustering
model = SpectralCoclustering(n_clusters=9, random_state=0)  # Adjust n_clusters as needed
model.fit(adj_matrix)

# Extract clusters
actor_clusters = model.row_labels_  # Cluster labels for actors
genre_clusters = model.column_labels_  # Cluster labels for genres

# Map clusters back to actor and genre names
actor_cluster_mapping = {actor: cluster for actor, cluster in zip(actor_nodes, actor_clusters)}
genre_cluster_mapping = {genre: cluster for genre, cluster in zip(genre_nodes, genre_clusters)}

'''
print("Actor Clusters:")
for actor, cluster in actor_cluster_mapping.items():
    print(f"  Actor: {actor}, Cluster: {cluster}")

print("\nGenre Clusters:")
for genre, cluster in genre_cluster_mapping.items():
    print(f"  Genre: {genre}, Cluster: {cluster}")
'''

In [None]:
from collections import defaultdict

# Group actors and genres by cluster
actor_clusters_by_id = defaultdict(list)
genre_clusters_by_id = defaultdict(list)

for actor, cluster in actor_cluster_mapping.items():
    actor_clusters_by_id[cluster].append(actor)

for genre, cluster in genre_cluster_mapping.items():
    genre_clusters_by_id[cluster].append(genre)

for i in range(9):
    print(f"\nCluster {i}:")
    print(f"  Actors: {actor_clusters_by_id[i][:5]}")  # Show only top 5 actors in each cluster
    print(f"  Genres: {genre_clusters_by_id[i]}")

In [None]:
!pip install scipy

In [None]:
import networkx as nx
from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix
from collections import defaultdict

# Step 1: Filter actor and genre nodes
actor_nodes = [node for node, data in B.nodes(data=True) if data['type'] == 'actor']
genre_nodes = [node for node, data in B.nodes(data=True) if data['type'] == 'genre']
#actors_with_enough_movies = [actor for actor in actor_nodes if B.degree(actor) >= 5]
# Step 2: Create the bipartite adjacency matrix
adj_matrix = nx.bipartite.biadjacency_matrix(B, row_order=actor_nodes, column_order=genre_nodes)
# Choose number of clusters
n_clusters = 9

# Run NMF
nmf = NMF(n_components=n_clusters, init='nndsvda', random_state=42)
W = nmf.fit_transform(adj_matrix)        # W: actors × components
H = nmf.components_                      # H: components × genres

# Assign actors and genres to clusters
actor_clusters = W.argmax(axis=1)
genre_clusters = H.argmax(axis=0)

# Map back to names
actor_cluster_mapping_nmf = {actor: cluster for actor, cluster in zip(actor_nodes, actor_clusters)}
genre_cluster_mapping_nmf = {genre: cluster for genre, cluster in zip(genre_nodes, genre_clusters)}

'''
actor_total_ratings = {}
for actor in actor_nodes:
    total_rating = sum(B[actor][neighbor].get('weighted_avg_rating', 0) for neighbor in B.neighbors(actor))
    actor_total_ratings[actor] = total_rating
'''
# Dictionary to store weighted average ratings for each actor by genre
actor_genre_ratings = defaultdict(dict)

# Iterate over all actor nodes
for actor in actor_nodes:
    for genre in B.neighbors(actor):
        if B.nodes[genre]['type'] == 'genre':  # Ensure the neighbor is a genre
            edge_data = B.get_edge_data(actor, genre)
            actor_genre_ratings[actor][genre] = edge_data.get('weighted_avg_rating', 0)  # Weighted average rating from edge attributes

# Group actors and genres by cluster
actor_clusters_by_id = defaultdict(list)
genre_clusters_by_id = defaultdict(list)

for genre, cluster in genre_cluster_mapping_nmf.items():
    genre_clusters_by_id[cluster].append(genre)

# Group actors by cluster and calculate the max weighted average rating for genres in the cluster
for actor, cluster in actor_cluster_mapping_nmf.items():
    # Get the genres in the same cluster
    genres_in_cluster = genre_clusters_by_id[cluster]
    
    # Calculate the max weighted average rating for the actor across genres in the cluster
    max_weighted_avg_rating = 0
    for genre in genres_in_cluster:
        if genre in actor_genre_ratings[actor]:  # Check if the actor has a rating for this genre
            max_weighted_avg_rating = max(max_weighted_avg_rating, actor_genre_ratings[actor][genre])
    
    # Append the actor and their max weighted average rating to the cluster
    actor_clusters_by_id[cluster].append((actor, max_weighted_avg_rating))

# Sort actors within each cluster by max weighted average rating (descending)
for cluster_id in actor_clusters_by_id:
    actor_clusters_by_id[cluster_id].sort(key=lambda x: x[1], reverse=True)

# Print
for i in range(n_clusters):
    print(f"\nCluster {i}:")
    print(f"  Top 10 Actors: {[actor for actor, _ in actor_clusters_by_id[i][:10]]}")  # Show only top 5 actors in each cluster
    print(f"  Genres: {genre_clusters_by_id[i]}") 

In [None]:
from sklearn.cluster import SpectralBiclustering

# Choose number of clusters
n_clusters = 9
model = SpectralBiclustering(n_clusters=n_clusters,  random_state=0)
model.fit(adj_matrix)  # Convert sparse matrix to dense

# actor → row labels
# genre → column labels
actor_clusters_biclust = model.row_labels_
genre_clusters_biclust = model.column_labels_

# Map back to names
actor_cluster_mapping_biclust = {actor: cluster for actor, cluster in zip(actor_nodes, actor_clusters_biclust)}
genre_cluster_mapping_biclust = {genre: cluster for genre, cluster in zip(genre_nodes, genre_clusters_biclust)}

# Group actors and genres by cluster
actor_clusters_by_id = defaultdict(list)
genre_clusters_by_id = defaultdict(list)

for genre, cluster in genre_cluster_mapping_biclust.items():
    genre_clusters_by_id[cluster].append(genre)

# Group actors by cluster and calculate the max weighted average rating for genres in the cluster
for actor, cluster in actor_cluster_mapping_biclust.items():
    # Get the genres in the same cluster
    genres_in_cluster = genre_clusters_by_id[cluster]
    
    # Calculate the max weighted average rating for the actor across genres in the cluster
    max_weighted_avg_rating = 0
    for genre in genres_in_cluster:
        if genre in actor_genre_ratings[actor]:  # Check if the actor has a rating for this genre
            max_weighted_avg_rating = max(max_weighted_avg_rating, actor_genre_ratings[actor][genre])
    
    # Append the actor and their max weighted average rating to the cluster
    actor_clusters_by_id[cluster].append((actor, max_weighted_avg_rating))

# Sort actors within each cluster by max weighted average rating (descending)
for cluster_id in actor_clusters_by_id:
    actor_clusters_by_id[cluster_id].sort(key=lambda x: x[1], reverse=True)

# Print
for i in range(n_clusters):
    print(f"\nCluster {i}:")
    print(f"  Top 10 Actors: {[actor for actor, _ in actor_clusters_by_id[i][:10]]}")  # Show only top 5 actors in each cluster
    print(f"  Genres: {genre_clusters_by_id[i]}") 

In [None]:
print(B.get_edge_data('Jon Bernthal', 'Drama'))
print(B.get_edge_data('Lily James', 'Drama'))