# Project 1, Part 8: Interactive Film Similarity Graph

This notebook takes the output of our Machine Learning pipeline—the cosine similarity matrix—and builds an interactive network graph to visualize film relationships.

**Objective:**
Create a function that, given a movie title, finds its most similar counterparts and displays the relationships as a dynamic, explorable graph.

**Methodology:**
1.  **Load Processed Data:** Load the similarity matrix (`.npy`) and the movie title-to-index mapping (`.pkl`) that we saved from the feature engineering script.
2.  **Similarity Function:** Create a helper function to retrieve the top N most similar movies for a given title by looking up scores in the similarity matrix.
3.  **Graph Visualization:** Use the **`pyvis`** library to build the network. The source movie will be a central node, and its similar movies will be connected by edges whose thickness or label indicates the similarity score.

In [2]:
import numpy as np
import pandas as pd
import pickle
from pyvis.network import Network
import os

# --- 1. Load Pre-computed Data ---
PROCESSED_DATA_DIR = "../data/processed"
SIMILARITY_MATRIX_PATH = os.path.join(PROCESSED_DATA_DIR, 'cosine_similarity_matrix.npy')
INDICES_PATH = os.path.join(PROCESSED_DATA_DIR, 'movie_indices.pkl')

# Load the data from the files our script created
cosine_sim = np.load(SIMILARITY_MATRIX_PATH)

with open(INDICES_PATH, 'rb') as f:
    indices = pickle.load(f)

print("Successfully loaded similarity matrix and movie indices.")
print(f"Shape of similarity matrix: {cosine_sim.shape}")
print(f"Number of movies in index: {len(indices)}")

# --- 2. Create Helper Function for Recommendations ---
def get_similar_movies(title, similarity_matrix, index_map, count=10):
    """
    Finds the top N similar movies to a given title.
    """
    if title not in index_map:
        print(f"Error: Movie '{title}' not found in the dataset.")
        return None
        
    # Get the index of the movie that matches the title
    idx = index_map[title]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(similarity_matrix[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies (slicing from 1 to skip itself)
    sim_scores = sim_scores[1:count+1]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Get the movie titles and their corresponding scores
    similar_movies = index_map.iloc[movie_indices]
    similarity_values = [score[1] for score in sim_scores]
    
    return list(zip(similar_movies.index, similar_movies.values, similarity_values))

# --- 3. Build the Interactive Graph ---
def build_similarity_graph(start_movie, sim_matrix, index_map, num_similar=7):
    
    similar_movies = get_similar_movies(start_movie, sim_matrix, index_map, count=num_similar)
    
    if similar_movies is None: # Handle movie not found case
        return None

    # Initialize the graph
    net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='in_line')
    net.force_atlas_2based(gravity=-80, spring_length=200)

    # Add the central node (the movie you searched for)
    net.add_node(start_movie, label=start_movie, color="#FF4500", size=30, shape='star')

    # Add the similar movie nodes and the edges connecting them
    for title, tconst, score in similar_movies:
        net.add_node(title, label=title, size=15)
        net.add_edge(start_movie, title, value=score*2, title=f"Similarity: {score:.2f}")

    net.show_buttons(filter_=['physics'])
    return net

# --- 4. Generate and Display a Graph ---
# Let's test with a classic Pre-Code film that has horror elements!
movie_title = 'The Old Dark House' 
graph = build_similarity_graph(movie_title, cosine_sim, indices)

if graph:
    # Save the graph to an HTML file
    graph.show("pre_code_horror_similarity.html")
    # Display the graph in the notebook output
    display(graph)

Successfully loaded similarity matrix and movie indices.
Shape of similarity matrix: (4581, 4581)
Number of movies in index: 4581
pre_code_horror_similarity.html


<class 'pyvis.network.Network'> |N|=8 |E|=7