In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import gzip
import shutil
import os # For checking file existence

In [2]:
# --- 0. Data Loading and Preprocessing from IMDb ---
# Replace this list with the exact titles of the movies you are interested in.
# Case sensitivity matters, so try to match IMDb's primary titles exactly.
TARGET_MOVIES = [
    "Robin Hood"
]

IMDB_DATA_DIR = '../data/'
TITLE_BASICS_TSV = os.path.join(IMDB_DATA_DIR, 'title.basics.tsv')
TITLE_PRINCIPALS_TSV = os.path.join(IMDB_DATA_DIR, 'title.principals.tsv')
NAME_BASICS_TSV = os.path.join(IMDB_DATA_DIR, 'name.basics.tsv')

# Function to safely load TSV files into Pandas DataFrames
def load_imdb_tsv(filepath):
    """Loads a TSV file into a pandas DataFrame."""
    try:
        df = pd.read_csv(filepath, sep='\t', low_memory=False)
        # Replace '\N' (IMDb's null representation) with actual NaN
        df = df.replace('\\N', pd.NA)
        print(f"Loaded {os.path.basename(filepath)}: {len(df)} rows")
        return df
    except FileNotFoundError:
        print(f"Error: {os.path.basename(filepath)} not found. Please download it from https://datasets.imdbws.com/ and place it in {IMDB_DATA_DIR}")
        print("Note: If you downloaded .gz files, they might have been automatically uncompressed to .tsv files.")
        return None
    except Exception as e:
        print(f"Error loading {os.path.basename(filepath)}: {e}")
        return None

In [None]:
print("--- Loading IMDb Datasets ---")
titles_df = load_imdb_tsv(TITLE_BASICS_TSV)
principals_df = load_imdb_tsv(TITLE_PRINCIPALS_TSV)
names_df = load_imdb_tsv(NAME_BASICS_TSV)

--- Loading IMDb Datasets ---
Loaded title.basics.tsv: 11819909 rows


In [5]:
# Check if all dataframes loaded successfully
if titles_df is None or principals_df is None or names_df is None:
    print("Exiting due to missing IMDb data files. Please ensure they are downloaded and placed correctly.")
    exit() # Exit the script if data is missing

print("\n--- Preprocessing Data for Network Analysis ---")

# 1. Filter titles_df for movies and your TARGET_MOVIES
# Ensure 'primaryTitle' is string type for comparison
titles_df['primaryTitle'] = titles_df['primaryTitle'].astype(str)
target_movies_df = titles_df[
    (titles_df['titleType'] == 'movie') &
    (titles_df['primaryTitle'].isin(TARGET_MOVIES))
].copy() # Use .copy() to avoid SettingWithCopyWarning

if target_movies_df.empty:
    print(f"No movies found matching your TARGET_MOVIES list: {TARGET_MOVIES}. Please check titles.")
    exit()

print(f"Found {len(target_movies_df)} target movies.")

# 2. Filter principals_df for actors/actresses and merge with target_movies_df
# Filter for actors and actresses only
actors_principals_df = principals_df[
    principals_df['category'].isin(['actor', 'actress'])
].copy()

# Merge to link target movies with their principal actors
# We use 'inner' merge to only keep records where both movie and principal exist
movie_actor_links_df = pd.merge(
    target_movies_df[['tconst', 'primaryTitle']],
    actors_principals_df[['tconst', 'nconst']],
    on='tconst',
    how='inner'
)

if movie_actor_links_df.empty:
    print("No actor links found for the target movies. This might indicate an issue with the data or filtering.")
    exit()

print(f"Found {len(movie_actor_links_df)} movie-actor links for target movies.")

# 3. Merge with names_df to get actor names
# Ensure 'nconst' in names_df is string type for merging
names_df['nconst'] = names_df['nconst'].astype(str)
final_movie_actor_data = pd.merge(
    movie_actor_links_df,
    names_df[['nconst', 'primaryName']],
    on='nconst',
    how='inner'
)

if final_movie_actor_data.empty:
    print("No final movie-actor data after merging with actor names. Check your nconsts.")
    exit()

print(f"Successfully compiled actor data for {len(final_movie_actor_data['primaryTitle'].unique())} movies.")

# --- 1. Define your data (Dynamically generated from IMDb data) ---
# Create the movie_data dictionary in the format networkx expects
movie_data = {}
for _, row in final_movie_actor_data.iterrows():
    movie_title = row['primaryTitle']
    actor_name = row['primaryName']
    if movie_title not in movie_data:
        movie_data[movie_title] = []
    if actor_name not in movie_data[movie_title]: # Avoid duplicate actors for a movie
        movie_data[movie_title].append(actor_name)

print("\n--- Generated movie_data dictionary (first 3 entries) ---")
for i, (movie, actors) in enumerate(movie_data.items()):
    if i >= 3:
        break
    print(f"{movie}: {actors[:5]}...") # Print only first 5 actors for brevity
print("-" * 40)


# --- 2. Create a Bipartite Graph (Actors and Movies) ---
# A bipartite graph has two distinct sets of nodes (e.g., actors and movies)
# where edges only connect nodes from different sets.
B = nx.Graph()

# Add movie nodes with an attribute to identify them as 'movie'
for movie in movie_data.keys():
    B.add_node(movie, bipartite=0, type='movie') # bipartite=0 for movies

# Add actor nodes with an attribute to identify them as 'actor'
# And add edges between movies and actors
actors = set()
for movie, actor_list in movie_data.items():
    for actor in actor_list:
        actors.add(actor)
        B.add_node(actor, bipartite=1, type='actor') # bipartite=1 for actors
        B.add_edge(movie, actor)

print("--- Bipartite Graph (Actors & Movies) ---")
print(f"Number of nodes: {B.number_of_nodes()}")
print(f"Number of edges: {B.number_of_edges()}")
print("-" * 40)

# --- 3. Visualize the Bipartite Graph ---
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(B, iterations=50, seed=42) # Layout algorithm for positioning nodes

# Separate nodes for coloring
movie_nodes = [n for n, d in B.nodes(data=True) if d['bipartite'] == 0]
actor_nodes = [n for n, d in B.nodes(data=True) if d['bipartite'] == 1]

# Draw movie nodes (e.g., green circles)
nx.draw_nodes(B, pos, nodelist=movie_nodes, node_color='lightgreen', node_size=2000,
              alpha=0.9, label='Movies', edgecolors='black', linewidths=1)
nx.draw_labels(B, pos, font_size=9, font_weight='bold', font_color='black')

# Draw actor nodes (e.g., light blue squares)
nx.draw_nodes(B, pos, nodelist=actor_nodes, node_color='skyblue', node_shape='s', node_size=1500,
              alpha=0.9, label='Actors', edgecolors='black', linewidths=1)
nx.draw_labels(B, pos, font_size=8, font_color='darkblue')

# Draw edges
nx.draw_edges(B, pos, width=1.0, alpha=0.5, edge_color='gray')

plt.title("Bipartite Graph: Actors and Movies", size=16)
plt.legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Movie', markersize=10, markerfacecolor='lightgreen', markeredgecolor='black'),
    plt.Line2D([0], [0], marker='s', color='w', label='Actor', markersize=10, markerfacecolor='skyblue', markeredgecolor='black')
])
plt.axis('off') # Hide axes
plt.show()

# --- 4. Create a Projected Graph (Actor-Actor Connections) ---
# In this graph, two actors are connected if they appeared in at least one common movie.
# We can project the bipartite graph onto the 'actors' set.
# This means we'll create a new graph where only actor nodes exist,
# and an edge connects two actors if they share a movie in the original bipartite graph.
G_actors = nx.bipartite.project(B, actor_nodes)

print("\n--- Projected Graph (Actor-Actor Connections) ---")
print(f"Number of nodes (actors): {G_actors.number_of_nodes()}")
print(f"Number of edges (shared movies): {G_actors.number_of_edges()}")
print("-" * 40)

# Add a 'weight' attribute to edges, representing the number of shared movies
# (This is automatically handled by project if you pass the correct nodes)
# Let's manually add edge weights for clarity if not automatically done by project.
# For project, the default is to add edges if they share at least one common neighbor.
# If you want the count, you might need a more manual approach or use specific projection methods.
# For simplicity here, we'll just show the connection.

# --- 5. Visualize the Projected Graph ---
plt.figure(figsize=(10, 8))
# Use a different layout for the projected graph
pos_actors = nx.circular_layout(G_actors) # Or spring_layout, spectral_layout etc.

# Draw nodes (actors)
nx.draw_nodes(G_actors, pos_actors, node_color='skyblue', node_size=2500,
              alpha=0.9, edgecolors='black', linewidths=1)

# Draw edges. We can make thicker edges for more shared movies (if we calculate weights).
# For this example, we'll just draw all edges.
nx.draw_edges(G_actors, pos_actors, width=1.5, alpha=0.6, edge_color='darkgray')

# Draw labels for actors
nx.draw_labels(G_actors, pos_actors, font_size=10, font_weight='bold', font_color='darkblue')

plt.title("Actor Co-occurrence Network (Connected if in same movie)", size=16)
plt.axis('off')
plt.show()

# --- Optional: Analyze Centrality (Who is most connected?) ---
print("\n--- Centrality Analysis (Actor-Actor Network) ---")
# Degree Centrality: Number of direct connections an actor has.
degree_centrality = nx.degree_centrality(G_actors)
print("\nDegree Centrality (Number of direct co-appearances):")
for actor, centrality in sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True):
    print(f"{actor}: {centrality:.3f}")

# Betweenness Centrality: Measures the extent to which a node lies on paths between other nodes.
# Actors with high betweenness centrality are important bridges in the network.
betweenness_centrality = nx.betweenness_centrality(G_actors)
print("\nBetweenness Centrality (Bridging connections):")
for actor, centrality in sorted(betweenness_centrality.items(), key=lambda item: item[1], reverse=True):
    print(f"{actor}: {centrality:.3f}")

# Closeness Centrality: Measures how 'close' an actor is to all other actors in the network.
# Actors with high closeness centrality can quickly reach other actors.
closeness_centrality = nx.closeness_centrality(G_actors)
print("\nCloseness Centrality (How quickly they can reach others):")
for actor, centrality in sorted(closeness_centrality.items(), key=lambda item: item[1], reverse=True):
    print(f"{actor}: {centrality:.3f}")


--- Loading IMDb Datasets ---
Error: title.basics.tsv.gz not found. Please download it from https://datasets.imdbws.com/ and place it in ../data
Error: title.principals.tsv.gz not found. Please download it from https://datasets.imdbws.com/ and place it in ../data
Error: name.basics.tsv.gz not found. Please download it from https://datasets.imdbws.com/ and place it in ../data
Exiting due to missing IMDb data files. Please ensure they are downloaded and placed correctly.

--- Preprocessing Data for Network Analysis ---


TypeError: 'NoneType' object is not subscriptable