# Exercise 1. Finding Similar Actors based on Genre
Download imdb_movies_2000to2022.prolific.json from GitHub (also available on Canvas in Exercises/data)
Create a data frame, where each row corresponds to an actor, each column represents a genre, and each cell captures how many times that row’s actor has appeared in that column’s genre
Using this data frame as your “feature matrix”, select an actor (called your “query”) for whom you want to find the top 10 most similar actors based on the genres in which they’ve starred
As an example, select the row from your data frame associated with Chris Hemsworth, actor ID “nm1165110”, as your “query” actor
Use sklearn.metrics.DistanceMetric to calculate the Euclidean distances between your query actor and all other actors based on their genre appearances
Print a list of the top ten actors most similar to your query actor using Euclidean distance

In [2]:
import json
import pandas as pd
from sklearn.metrics import DistanceMetric
import numpy as np

# Load the data. The file is read as JSON Lines: each non-empty line is parsed
# as one movie object. The encoding is given explicitly so decoding does not
# depend on the platform default, and blank lines are skipped because
# json.loads would raise on them.
with open('imdb_movies_2000to2022.prolific.json', 'r', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]

# Step 1: Build the actor-by-genre count table.
# Each actor ID maps to a single record that holds the actor's name under
# 'name' plus one running tally per genre the actor has appeared in.
actor_genre_counts = {}

for movie in movies:
    movie_genres = movie.get('genres', [])

    for cast_entry in movie.get('actors', []):
        # Entries without both an ID and a name are skipped.
        if len(cast_entry) < 2:
            continue

        actor_id, actor_name = cast_entry[0], cast_entry[1]
        record = actor_genre_counts.setdefault(actor_id, {'name': actor_name})

        # Every genre of this movie counts once for this actor.
        for genre in movie_genres:
            record[genre] = record.get(genre, 0) + 1

# One row per actor; genres an actor never appeared in become 0.
df_actors = pd.DataFrame.from_dict(actor_genre_counts, orient='index').fillna(0)

# The feature matrix is every column except the actor's name.
genre_columns = [column for column in df_actors.columns if column != 'name']
actor_features = df_actors[genre_columns]

# Step 2: Find similar actors to Chris Hemsworth
query_actor_id = "nm1165110"  # Chris Hemsworth

if query_actor_id in actor_features.index:
    dist = DistanceMetric.get_metric('euclidean')

    # Only the distances from the query actor are needed, so compare the
    # single query row against all rows (a 1 x N result) instead of building
    # the full N x N pairwise matrix and discarding all but one row.
    query_vector = actor_features.loc[[query_actor_id]].to_numpy(dtype=float)
    all_vectors = actor_features.to_numpy(dtype=float)
    distances = dist.pairwise(query_vector, all_vectors)

    # Distances from the query actor, labelled by actor ID.
    distance_series = pd.Series(distances[0], index=actor_features.index)

    # Remove the query actor first, then take the 10 closest of the rest.
    most_similar = distance_series.drop(query_actor_id).nsmallest(10)

    print("Top 10 actors most similar to Chris Hemsworth based on genre appearances:")
    print("=" * 70)
    for actor_id, distance in most_similar.items():
        actor_name = df_actors.loc[actor_id, 'name']
        print(f"{actor_id}\t{actor_name}\t(Distance: {distance:.4f})")
else:
    print(f"Actor {query_actor_id} not found in dataset")

Top 10 actors most similar to Chris Hemsworth based on genre appearances:
nm0000129	Tom Cruise	(Distance: 5.4772)
nm0147147	Henry Cavill	(Distance: 8.2462)
nm0879085	Tyrese Gibson	(Distance: 8.8318)
nm0089217	Orlando Bloom	(Distance: 9.0554)
nm0004874	Vin Diesel	(Distance: 9.2195)
nm0001401	Angelina Jolie	(Distance: 9.2195)
nm0001600	Adrian Paul	(Distance: 9.5917)
nm0001472	Jet Li	(Distance: 9.5917)
nm0001092	Mark Dacascos	(Distance: 9.7980)
nm0001444	Lorenzo Lamas	(Distance: 10.0499)


# Extra Practice. Finding Similar Actors based on Co-stars 
Download imdb_movies_2000to2022.prolific.json from GitHub (also available on Canvas in Exercises/data)
Create a data frame, where each row and column correspond to actors, and each cell captures how many times that row’s actor has appeared in a movie with the column’s actor
You can also use the networkx.adjacency_matrix() function to create this matrix from the graph you’ve created previously
As in the genres exercise, select a query actor and find the top 10 most similar actors based on costar similarity
Select Scarlett Johansson, “nm0424060”, as your target 
Use sklearn.metrics.DistanceMetric to calculate the Euclidean distances between your query actor and all other actors based on their co-star appearances
Print a list of the top ten actors most similar to your query actor
Example for Scarlett Johansson is: 
nm0424060 Scarlett Johansson
nm0749263 Mark Ruffalo
nm0812307 Peter Sohn
nm1165110 Chris Hemsworth
nm0538683 Mako
nm9125822 Max Ivutin
nm10309420 Dayna Hilton
nm10558382 Ricochet
nm1812991 Tori Kelly
nm0262635 Chris Evans

In [None]:
import networkx as nx
from itertools import combinations

# Step 1: Create co-star matrix using NetworkX
# Nodes are actor IDs. An edge's 'weight' counts how many movies the two
# actors appear in together. Actors who never share a movie with anyone do
# not become nodes.
G = nx.Graph()

for movie in movies:
    # Keying by actor ID collapses repeated cast entries, so an actor listed
    # twice in one movie neither forms a self-loop nor counts that movie
    # twice for the same pair.
    actor_names = {actor[0]: actor[1] for actor in movie.get('actors', []) if len(actor) >= 2}
    actors = list(actor_names)

    # Add edges between all pairs of actors in the same movie
    for actor1, actor2 in combinations(actors, 2):
        if G.has_edge(actor1, actor2):
            G[actor1][actor2]['weight'] += 1
        else:
            G.add_edge(actor1, actor2, weight=1)

    # With two or more cast members every one of them is now a node; record
    # each name once per movie (first name seen wins) rather than re-checking
    # it on every pair.
    if len(actors) > 1:
        for actor in actors:
            G.nodes[actor].setdefault('name', actor_names[actor])

# Create adjacency matrix. The node order is fixed once and reused for the
# matrix and for both DataFrame axes so rows and columns line up.
node_ids = list(G.nodes())
adj_matrix = nx.adjacency_matrix(G, nodelist=node_ids, weight='weight')
# toarray() yields a plain ndarray (todense() returns np.matrix).
adj_df = pd.DataFrame(adj_matrix.toarray(),
                      index=node_ids,
                      columns=node_ids)

# Step 2: Find similar actors to Scarlett Johansson
query_actor_id = "nm0424060"  # Scarlett Johansson

if query_actor_id in adj_df.index:
    dist = DistanceMetric.get_metric('euclidean')

    # Only the distances from the query actor are needed. Comparing the one
    # query row against all rows gives a 1 x N result; the full pairwise call
    # on this N x N matrix would compute and store N x N distances just to
    # read a single row.
    query_vector = adj_df.loc[[query_actor_id]].to_numpy(dtype=float)
    all_vectors = adj_df.to_numpy(dtype=float)
    distances = dist.pairwise(query_vector, all_vectors)

    # Distances from the query actor, labelled by actor ID.
    distance_series = pd.Series(distances[0], index=adj_df.index)

    # Remove the query actor first, then take the 10 closest of the rest.
    most_similar = distance_series.drop(query_actor_id).nsmallest(10)

    print("\n" + "="*70)
    print("Top 10 actors most similar to Scarlett Johansson based on co-star relationships:")
    print("=" * 70)
    for actor_id in most_similar.index:
        actor_name = G.nodes[actor_id]['name']
        print(f"{actor_id}\t{actor_name}")
else:
    print(f"Actor {query_actor_id} not found in dataset")