# Clustering (Movies)
In this notebook, we cluster movies based on their features using K-Means and DBSCAN algorithms. Each movie is represented as a one-hot encoded vector indicating the presence or absence of the 450 most popular movie features.

Interestingly, we find that feature-rich movies with an exceptionally popular "superstar" cast are detected as outliers by DBSCAN when applied directly to the movie features:<br>
- Batman
- Batman Returns
- Antz
- Hook
- The Royal Tenenbaums
- Con Air
- National Treasure
- Sleepy Hollow

Intuitively, this makes sense as the average movie has 3-5 popular features. Thus, a movie with 10-15 popular features will simply not have enough neighbours within a distance epsilon to be assigned to a cluster. We find that this result mostly disappears when performing DBSCAN on PCA-reduced features, as pure counts are replaced with principal components.

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import gzip

import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# Enable notebook renderer for Altair
alt.renderers.enable('default')

# Root of the data directory, relative to the notebook's working directory.
DATA_PATH = "../data"
# Sub-folder for the Netflix Prize data (not referenced by the cells visible in this notebook).
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")
# Sub-folder holding the IMDb dumps (title.basics.tsv.gz is read from here).
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")
# Interpolated into the names of the processed pickle files loaded below.
# NOTE(review): presumably the minimum-occurrence threshold used during preprocessing — confirm.
MIN_OCCURRENCES = 20

## Load Data

In [None]:
# Locations of the two preprocessed artefacts (both suffixed with MIN_OCCURRENCES)
MOVIE_FEATURES_PATH = os.path.join(DATA_PATH, f"processed/movie_features_{MIN_OCCURRENCES}.pickle")
FEATURE_MAPPING_PATH = os.path.join(DATA_PATH, f"processed/feature_mapping_{MIN_OCCURRENCES}.pickle")

# NOTE: pickle.load executes arbitrary code from the file; only use with trusted, locally produced files.
# Movie features: used below as a mapping movie ID -> collection of feature IDs
with open(MOVIE_FEATURES_PATH, "rb") as features_file:
    movie_features = pickle.load(features_file)

# Feature mapping: holds the two lookup tables unpacked right after
with open(FEATURE_MAPPING_PATH, "rb") as mapping_file:
    feature_mapping = pickle.load(mapping_file)

feature_to_id, id_to_feature = (
    feature_mapping['feature_to_id'],
    feature_mapping['id_to_feature'],
)

# Size of the feature vocabulary
num_features = len(feature_to_id)
print(f"Number of features: {num_features}")

### Create Movie Feature Matrix

In [None]:
# Row labels (movies) and column labels (features) of the indicator matrix
movie_ids, feature_ids = list(movie_features.keys()), list(id_to_feature.keys())

# Start from an all-zero table: one row per movie, one column per feature
movie_feature_matrix = pd.DataFrame(
    data=0,
    index=movie_ids,
    columns=feature_ids,
)

# Switch on the cells of the features each movie actually has
for movie_id, present_features in movie_features.items():
    movie_feature_matrix.loc[movie_id, present_features] = 1

print(f"Movie feature matrix shape: {movie_feature_matrix.shape}")

In [4]:
import json

# Load Netflix to IMDb mapping
NETFLIX_TO_IMDB_PATH = os.path.join(DATA_PATH, "netflix_to_imdb.json")
with open(NETFLIX_TO_IMDB_PATH, "r") as f:
    netflix_to_imdb = json.load(f)

# Restrict the mapping to the movies we have features for.
# JSON object keys are always strings, whereas the type of the pickled movie IDs
# cannot be told from here. Comparing on str(movie_id) keeps the lookup working
# for non-string IDs (a plain `nid in netflix_ids` would silently match nothing),
# and is a no-op when the IDs are already strings. The result is keyed by the
# original movie ID so that later `.get(movie_id)` lookups work unchanged.
netflix_ids = set(movie_ids)
netflix_to_imdb_filtered = {
    movie_id: netflix_to_imdb[str(movie_id)]
    for movie_id in netflix_ids
    if str(movie_id) in netflix_to_imdb
}

In [None]:
# Load IMDb title.basics.tsv.gz
TITLE_BASICS_PATH = os.path.join(IMDB_FOLDER_PATH, "title.basics.tsv.gz")

# Rows are expected to have exactly this many tab-separated columns; of these
# only column 0 (tconst) and column 2 (primaryTitle) are used.
TITLE_BASICS_NUM_COLUMNS = 9

# tconst -> primaryTitle
imdb_titles = {}

with gzip.open(TITLE_BASICS_PATH, 'rt', encoding='utf-8') as f:
    # Skip header (the default keeps an empty file from raising StopIteration)
    next(f, None)
    for line in f:
        # Remove only the line terminator: a bare strip() would also eat leading or
        # trailing tabs, so a row with an empty first/last column would be miscounted
        # and dropped by the length check below.
        parts = line.rstrip('\r\n').split('\t')
        if len(parts) != TITLE_BASICS_NUM_COLUMNS:
            # Malformed row (e.g. a stray tab inside a title) - skip it
            continue
        imdb_titles[parts[0]] = parts[2]

print(f"Loaded {len(imdb_titles)} IMDb titles.")

In [6]:
# Resolve a display title for every movie: Netflix ID -> IMDb ID -> primary title,
# with a placeholder whenever either hop is missing.
movie_titles = {}

for netflix_id in movie_ids:
    imdb_id = netflix_to_imdb_filtered.get(netflix_id)
    title_known = bool(imdb_id) and imdb_id in imdb_titles
    movie_titles[netflix_id] = (
        imdb_titles[imdb_id] if title_known else f"Unknown Title ({netflix_id})"
    )

# Attach the titles as a metadata column (the matrix index holds the movie IDs)
movie_feature_matrix['title'] = movie_feature_matrix.index.map(movie_titles)

In [7]:
def get_feature_names(feature_ids):
    """Return the names of the given feature IDs, in order, looked up in `id_to_feature`."""
    names = []
    for feat_id in feature_ids:
        names.append(id_to_feature[feat_id])
    return names


def _feature_summary(movie_id):
    """Comma-separated feature names of one movie ('' when the movie has no entry)."""
    return ', '.join(get_feature_names(movie_features.get(movie_id, [])))


# Human-readable feature list per movie, used in the chart tooltips
movie_feature_matrix['features'] = movie_feature_matrix.index.map(_feature_summary)

# K-Means Clustering

### Without PCA

In [8]:
def compute_clustering_metrics(data, k_range):
    """Score K-Means clusterings of `data` for every cluster count in `k_range`.

    For each k a KMeans model (random_state=42) is fitted on `data` and the
    resulting labels are evaluated with the silhouette score and the
    Calinski-Harabasz index.

    Returns:
        (silhouette_scores, ch_scores): two lists with one entry per k,
        in the iteration order of `k_range`.
    """
    def _score(k):
        # Fit once per k and evaluate both metrics on the same labelling
        assignments = KMeans(n_clusters=k, random_state=42).fit_predict(data)
        return (
            silhouette_score(data, assignments),
            calinski_harabasz_score(data, assignments),
        )

    per_k = [_score(k) for k in k_range]
    silhouette_scores = [sil for sil, _ in per_k]
    ch_scores = [ch for _, ch in per_k]
    return silhouette_scores, ch_scores

In [9]:
# Keep only the feature-indicator columns for clustering.
# Selecting `feature_ids` explicitly (instead of dropping 'title' and 'features')
# keeps this cell idempotent: later cells add string-valued label columns to
# `movie_feature_matrix`, and on a re-run those would otherwise leak into the
# clustering input. On a fresh top-to-bottom run the result is identical.
movie_feature_matrix_no_meta = movie_feature_matrix.loc[:, feature_ids]

# Candidate cluster counts: k = 2 .. 20
k_range = range(2, 21)
silhouette_scores, ch_scores = compute_clustering_metrics(movie_feature_matrix_no_meta, k_range)

Plot Clustering Metrics

In [None]:
# Two side-by-side panels: silhouette score (left) and Calinski-Harabasz index (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

panels = (
    (ax1, silhouette_scores, 'o', 'Silhouette Score'),
    (ax2, ch_scores, 's', 'Calinski-Harabasz Index'),
)
for ax, scores, marker, metric_name in panels:
    ax.plot(k_range, scores, marker=marker)
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name} vs. Number of Clusters')

plt.show()

Choose the best k

In [None]:
# The k with the highest silhouette score wins (first one on ties, as np.argmax does)
best_position = int(np.argmax(silhouette_scores))
best_k = k_range[best_position]
print(f"Best k according to Silhouette Score: {best_k}")

# Refit K-Means with the chosen k on the raw feature matrix
kmeans = KMeans(n_clusters=best_k, random_state=42)
labels = kmeans.fit_predict(movie_feature_matrix_no_meta)

# Store the labels as strings alongside the movie metadata
movie_feature_matrix['cluster'] = labels.astype(str)

Visualize Clusters in 2D

In [None]:
# Perform PCA to reduce to 2 dimensions.
# random_state is fixed (same seed as the K-Means runs) so the projection is
# reproducible even when scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
movie_features_2d = pca.fit_transform(movie_feature_matrix_no_meta)

# Create a DataFrame for plotting (positional, so .values is used to ignore the index)
plot_df = pd.DataFrame(movie_features_2d, columns=['PC1', 'PC2'])
plot_df['title'] = movie_feature_matrix['title'].values
plot_df['features'] = movie_feature_matrix['features'].values
plot_df['cluster'] = movie_feature_matrix['cluster'].values

# Create an interactive scatter plot using Altair
# (lifts Altair's row limit for every chart created from here on)
alt.data_transformers.disable_max_rows()

scatter = alt.Chart(plot_df).mark_circle(size=60).encode(
    x=alt.X('PC1', scale=alt.Scale(zero=False)),
    y=alt.Y('PC2', scale=alt.Scale(zero=False)),
    color='cluster:N',
    tooltip=['title', 'features']
).properties(
    width=800,
    height=600,
    title='K-Means Clustering of Movies Visualized in 2D'
).interactive()

scatter

### With PCA

In [None]:
# Reduce to 50 principal components.
# random_state is fixed (same seed as the K-Means runs) so that the reduced
# features - and therefore the K-Means / DBSCAN results computed on them - are
# reproducible even when scikit-learn picks a randomized SVD solver.
N_PCA_COMPONENTS = 50
pca_full = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
movie_features_pca = pca_full.fit_transform(movie_feature_matrix_no_meta)

# Share of the total variance retained by the kept components
print(f"Explained variance ratio of {N_PCA_COMPONENTS} components: {np.sum(pca_full.explained_variance_ratio_):.4f}")

In [14]:
# Same candidate cluster counts as for the raw features (k = 2 .. 20),
# this time scored on the PCA-reduced representation.
k_range = range(2, 21)
pca_metrics = compute_clustering_metrics(movie_features_pca, k_range)
silhouette_scores_pca, ch_scores_pca = pca_metrics

In [None]:
# Two side-by-side panels for the PCA-reduced data:
# silhouette score (left) and Calinski-Harabasz index (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

panels_pca = (
    (ax1, silhouette_scores_pca, 'o', 'Silhouette Score'),
    (ax2, ch_scores_pca, 's', 'Calinski-Harabasz Index'),
)
for ax, scores, marker, metric_name in panels_pca:
    ax.plot(k_range, scores, marker=marker)
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name} vs. Number of Clusters (PCA Reduced Data)')

plt.show()

In [None]:
# The k with the highest silhouette score on the PCA-reduced data
best_k_pca = k_range[np.argmax(silhouette_scores_pca)]
print(f"Best k according to Silhouette Score (PCA Data): {best_k_pca}")

# Perform K-Means clustering with the best k
kmeans_pca = KMeans(n_clusters=best_k_pca, random_state=42)
labels_pca = kmeans_pca.fit_predict(movie_features_pca)

# Add cluster labels to the DataFrame
movie_feature_matrix['cluster_pca'] = labels_pca.astype(str)

# Further reduce to 2 principal components for visualization.
# random_state is fixed (same seed as the K-Means runs) so the projection is
# reproducible even when scikit-learn picks a randomized SVD solver.
pca_2d = PCA(n_components=2, random_state=42)
movie_features_2d_pca = pca_2d.fit_transform(movie_features_pca)

# Create a DataFrame for plotting (positional, so .values is used to ignore the index)
plot_df_pca = pd.DataFrame(movie_features_2d_pca, columns=['PC1', 'PC2'])
plot_df_pca['title'] = movie_feature_matrix['title'].values
plot_df_pca['features'] = movie_feature_matrix['features'].values
plot_df_pca['cluster'] = movie_feature_matrix['cluster_pca'].values


In [None]:
# Interactive Altair scatter of the PCA-based K-Means clusters.
# Built step by step: marks -> encodings -> layout -> pan/zoom.
pca_points = alt.Chart(plot_df_pca).mark_circle(size=60)

pca_points_encoded = pca_points.encode(
    x=alt.X('PC1', scale=alt.Scale(zero=False)),
    y=alt.Y('PC2', scale=alt.Scale(zero=False)),
    color='cluster:N',
    tooltip=['title', 'features'],
)

scatter_pca = pca_points_encoded.properties(
    width=800,
    height=600,
    title='K-Means Clustering of Movies (PCA Reduced Data) Visualized in 2D',
).interactive()

scatter_pca

# DBSCAN

### Without PCA

In [17]:
def plot_k_distance(data, k):
    """Plot the sorted k-distance curve of `data`.

    A NearestNeighbors model with `n_neighbors=k` is fitted on `data` and then
    queried with the same `data`; the distance in the last of the k neighbour
    columns is taken for every point, sorted ascending and drawn as a line.

    Note: because the query set is the training set itself, each point is
    expected to count itself (distance 0) among its k neighbours.
    """
    neighbour_model = NearestNeighbors(n_neighbors=k)
    neighbour_model.fit(data)
    neighbour_distances, _ = neighbour_model.kneighbors(data)

    # Column k-1 holds the distance to the k-th returned neighbour
    kth_distances_sorted = np.sort(neighbour_distances[:, k - 1], axis=0)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(kth_distances_sorted)
    ax.set_ylabel(f'{k}th Nearest Neighbor Distance')
    ax.set_xlabel('Data Points sorted by distance')
    ax.set_title(f'K-distance Graph for k={k}')
    plt.show()

In [None]:
# k-distance curve on the raw feature matrix; k=5 is the same value that is passed
# as min_samples to DBSCAN in the following cell (presumably used to choose `epsilon`).
plot_k_distance(movie_feature_matrix_no_meta, k=5)

In [None]:
# Neighbourhood radius for DBSCAN on the raw 0/1 feature vectors.
# With DBSCAN's default Euclidean metric the distance between two 0/1 vectors is
# sqrt(number of differing features), so 2.5 admits neighbours that differ in
# at most 6 features (2.5**2 = 6.25).
epsilon = 2.5
dbscan = DBSCAN(eps=epsilon, min_samples=5)
dbscan_labels = dbscan.fit_predict(movie_feature_matrix_no_meta)

# Store the labels as strings, both with the movie metadata and in the 2-D plotting frame
dbscan_label_strings = dbscan_labels.astype(str)
movie_feature_matrix['dbscan_cluster'] = dbscan_label_strings
plot_df['dbscan_cluster'] = dbscan_label_strings

# Interactive Altair scatter, reusing the PC1/PC2 coordinates already in plot_df.
# Built step by step: marks -> encodings -> layout -> pan/zoom.
dbscan_points = alt.Chart(plot_df).mark_circle(size=60)

dbscan_points_encoded = dbscan_points.encode(
    x=alt.X('PC1', scale=alt.Scale(zero=False)),
    y=alt.Y('PC2', scale=alt.Scale(zero=False)),
    color='dbscan_cluster:N',
    tooltip=['title', 'features'],
)

scatter_dbscan = dbscan_points_encoded.properties(
    width=800,
    height=600,
    title='DBSCAN Clustering of Movies Visualized in 2D',
).interactive()

scatter_dbscan

### With PCA

In [None]:
# k-distance curve on the PCA-reduced features; k=5 is the same value that is passed
# as min_samples to DBSCAN in the following cell, where `epsilon_pca` is set from this graph.
plot_k_distance(movie_features_pca, k=5)

In [None]:
# Neighbourhood radius for DBSCAN on the PCA-reduced features;
# adjust based on the k-distance graph for PCA data.
epsilon_pca = 1.5
dbscan_pca = DBSCAN(eps=epsilon_pca, min_samples=5)
dbscan_labels_pca = dbscan_pca.fit_predict(movie_features_pca)

# Store the labels as strings, both with the movie metadata and in the PCA plotting frame
dbscan_pca_label_strings = dbscan_labels_pca.astype(str)
movie_feature_matrix['dbscan_cluster_pca'] = dbscan_pca_label_strings
plot_df_pca['dbscan_cluster'] = dbscan_pca_label_strings

# Interactive Altair scatter, reusing the PC1/PC2 coordinates already in plot_df_pca.
# Built step by step: marks -> encodings -> layout -> pan/zoom.
dbscan_pca_points = alt.Chart(plot_df_pca).mark_circle(size=60)

dbscan_pca_points_encoded = dbscan_pca_points.encode(
    x=alt.X('PC1', scale=alt.Scale(zero=False)),
    y=alt.Y('PC2', scale=alt.Scale(zero=False)),
    color='dbscan_cluster:N',
    tooltip=['title', 'features'],
)

scatter_dbscan_pca = dbscan_pca_points_encoded.properties(
    width=800,
    height=600,
    title='DBSCAN Clustering of Movies (PCA Reduced Data) Visualized in 2D',
).interactive()

scatter_dbscan_pca