# Outlier Detection (Local Outlier Factor - Movies)
In this notebook, we compute a Local Outlier Factor (LOF) score for each movie, where a movie is represented by its popular features, in order to detect outliers. LOF is a "normalized distance-based approach where the normalization factor corresponds to the average local data density." [Data Mining Textbook]

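As one would intuitively expect, we find that movies with unusual feature combinations (namely unusual cast ensembles or surprising individual casting choices for the genre) receive the highest LOF scores and are therefore rated the strongest outliers. Note that the `LOF_Score` column computed below stores scikit-learn's `negative_outlier_factor_`, i.e. the negated LOF, so the most negative values correspond to the highest LOF.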

In [18]:
import gzip
import json
import os
import pickle

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import NearestNeighbors

# Enable notebook renderer for Altair (selects the renderer registered as 'default')
alt.renderers.enable('default')

# Root data directory; the relative path is resolved against the kernel's working directory
DATA_PATH = "../data"
# Sub-folders of DATA_PATH for the Netflix Prize and IMDb source files
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")
# Suffix that selects which processed artifacts are loaded
# (processed/movie_features_<N>.pickle and processed/feature_mapping_<N>.pickle).
# Presumably the minimum number of occurrences a feature needed to be kept
# during preprocessing -- TODO confirm against the preprocessing notebook.
MIN_OCCURRENCES = 20

## Load Data

In [19]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Both processed artifacts are stored per MIN_OCCURRENCES setting
MOVIE_FEATURES_PATH = os.path.join(DATA_PATH, f"processed/movie_features_{MIN_OCCURRENCES}.pickle")
FEATURE_MAPPING_PATH = os.path.join(DATA_PATH, f"processed/feature_mapping_{MIN_OCCURRENCES}.pickle")

# Per-movie feature IDs, keyed by movie ID
movie_features = _load_pickle(MOVIE_FEATURES_PATH)

# Bidirectional lookup between feature names and feature IDs
feature_mapping = _load_pickle(FEATURE_MAPPING_PATH)
feature_to_id = feature_mapping['feature_to_id']
id_to_feature = feature_mapping['id_to_feature']

# Report how many distinct features are available
num_features = len(feature_to_id)
print(f"Number of features: {num_features}")

Number of features: 450


### Create Movie Feature Matrix

In [20]:
# Row labels are movie IDs; column labels are feature IDs
movie_ids = [*movie_features]
feature_ids = [*id_to_feature]

# Start from an all-zero indicator matrix (movies x features)
movie_feature_matrix = pd.DataFrame(0, index=movie_ids, columns=feature_ids)

# Set a 1 for every feature that a movie has
for movie_id in movie_ids:
    movie_feature_matrix.loc[movie_id, movie_features[movie_id]] = 1

print(f"Movie feature matrix shape: {movie_feature_matrix.shape}")

Movie feature matrix shape: (9443, 450)


In [21]:
# Read the Netflix ID -> IMDb ID lookup from disk
NETFLIX_TO_IMDB_PATH = os.path.join(DATA_PATH, "netflix_to_imdb.json")
with open(NETFLIX_TO_IMDB_PATH, "r") as mapping_file:
    netflix_to_imdb = json.load(mapping_file)

# Restrict the lookup to the movies present in the feature matrix
netflix_ids = set(movie_ids)
netflix_to_imdb_filtered = dict(
    (nid, imdb_id)
    for nid, imdb_id in netflix_to_imdb.items()
    if nid in netflix_ids
)

In [22]:
# Build an IMDb ID (tconst) -> primary title lookup from title.basics.tsv.gz
TITLE_BASICS_PATH = os.path.join(IMDB_FOLDER_PATH, "title.basics.tsv.gz")

imdb_titles = {}

with gzip.open(TITLE_BASICS_PATH, 'rt', encoding='utf-8') as tsv_file:
    next(tsv_file)  # the first line is the column header
    for raw_line in tsv_file:
        columns = raw_line.strip().split('\t')
        # Only rows with exactly nine tab-separated columns are used;
        # column 0 is tconst and column 2 is primaryTitle
        if len(columns) == 9:
            imdb_titles[columns[0]] = columns[2]

print(f"Loaded {len(imdb_titles)} IMDb titles.")

Loaded 11230548 IMDb titles.


In [23]:
def _lookup_title(netflix_id):
    """Return the IMDb title for a Netflix ID, or an "Unknown Title" placeholder."""
    imdb_id = netflix_to_imdb_filtered.get(netflix_id)
    if not imdb_id or imdb_id not in imdb_titles:
        return f"Unknown Title ({netflix_id})"
    return imdb_titles[imdb_id]


# Resolve a display title for every movie in the feature matrix
movie_titles = {netflix_id: _lookup_title(netflix_id) for netflix_id in movie_ids}

# Attach the titles as a column, aligned on the movie-ID index
movie_feature_matrix['title'] = movie_feature_matrix.index.map(movie_titles)

In [24]:
# Translate feature IDs into their human-readable names
def get_feature_names(feature_ids):
    """Return the feature names for ``feature_ids``, in the same order.

    Each ID is looked up in the notebook-level ``id_to_feature`` mapping;
    an ID missing from that mapping raises ``KeyError``.
    """
    names = []
    for feat_id in feature_ids:
        names.append(id_to_feature[feat_id])
    return names

def _feature_summary(movie_id):
    """Return a movie's feature names joined by ', ' (empty string if it has none)."""
    feature_names = get_feature_names(movie_features.get(movie_id, []))
    return ', '.join(feature_names)


# Human-readable feature list per movie, aligned on the movie-ID index
movie_feature_matrix['features'] = movie_feature_matrix.index.map(_feature_summary)

# Local Outlier Factor

In [25]:
# Model input: only the binary feature-indicator columns. Selecting them by name
# (instead of dropping 'title'/'features') keeps this cell idempotent -- on a
# re-run, the 'LOF_Score' column added below must not leak into the features.
feature_data = movie_feature_matrix[feature_ids]
# No scaling is applied (every feature is a 0/1 indicator); the alias is kept
# for backward compatibility with the previous variable name.
feature_data_scaled = feature_data

# LOF hyperparameters
LOF_N_NEIGHBORS = 20       # neighbourhood size used for the local density estimate
LOF_CONTAMINATION = 0.05   # expected outlier share; only sets the threshold for the -1/+1 labels

lof = LocalOutlierFactor(n_neighbors=LOF_N_NEIGHBORS, contamination=LOF_CONTAMINATION)

# fit_predict fits the model and returns hard labels (-1 = outlier, +1 = inlier),
# not continuous scores.
outlier_labels = lof.fit_predict(feature_data)
lof_scores = -outlier_labels  # legacy name kept: +1 marks an outlier, -1 an inlier

# The continuous score is negative_outlier_factor_ (the negated LOF): values near
# -1 are inliers, and the more negative the value, the more outlying the movie.
movie_feature_matrix['LOF_Score'] = lof.negative_outlier_factor_

# NOTE(review): the saved output of this cell shows scores on the order of -1e10,
# far outside the usual LOF range around -1. That magnitude typically appears when
# more than n_neighbors rows share an identical feature vector (zero reachability
# distance -> near-infinite local density), which would make the top of this
# ranking a duplicate-row artifact. Consider de-duplicating rows or raising
# n_neighbors -- TODO confirm.

# Sort movies by their LOF score (most negative, i.e. strongest outliers, first)
outliers = movie_feature_matrix.sort_values(by='LOF_Score', ascending=True)

# Display the top potential outliers
print(outliers.head(10)[['title', 'features', 'LOF_Score']])

                       title  \
8782    The Royal Tenenbaums   
10078      Indecent Proposal   
2499   America's Sweethearts   
4171           Mars Attacks!   
3859        The Marrying Man   
14355              Bowfinger   
10131              Diggstown   
13813          Nobody's Fool   
11724              The Paper   
16802         The Grass Harp   

                                                features     LOF_Score  
8782   Cast:Anjelica Huston, Cast:Ben Stiller, Cast:B... -2.542708e+10  
10078  Cast:Billy Bob Thornton, Cast:Demi Moore, Cast... -2.449490e+10  
2499   Cast:Alan Arkin, Cast:Christopher Walken, Cast... -2.449490e+10  
4171   Cast:Danny DeVito, Cast:Glenn Close, Cast:Jack... -2.396399e+10  
3859   Cast:Alec Baldwin, Cast:Elisabeth Shue, Cast:K... -2.381176e+10  
14355  Cast:Eddie Murphy, Cast:Heather Graham, Cast:R... -2.372344e+10  
10131  Cast:Bruce Dern, Cast:Heather Graham, Cast:Jam... -2.327015e+10  
13813  Cast:Bruce Willis, Cast:Melanie Griffith, Cast... -2.327

In [26]:
# Rich display of the ten strongest outliers (most negative LOF_Score first)
outliers[['title', 'features', 'LOF_Score']].head(10)

Unnamed: 0,title,features,LOF_Score
8782,The Royal Tenenbaums,"Cast:Anjelica Huston, Cast:Ben Stiller, Cast:B...",-25427080000.0
10078,Indecent Proposal,"Cast:Billy Bob Thornton, Cast:Demi Moore, Cast...",-24494900000.0
2499,America's Sweethearts,"Cast:Alan Arkin, Cast:Christopher Walken, Cast...",-24494900000.0
4171,Mars Attacks!,"Cast:Danny DeVito, Cast:Glenn Close, Cast:Jack...",-23963990000.0
3859,The Marrying Man,"Cast:Alec Baldwin, Cast:Elisabeth Shue, Cast:K...",-23811760000.0
14355,Bowfinger,"Cast:Eddie Murphy, Cast:Heather Graham, Cast:R...",-23723440000.0
10131,Diggstown,"Cast:Bruce Dern, Cast:Heather Graham, Cast:Jam...",-23270150000.0
13813,Nobody's Fool,"Cast:Bruce Willis, Cast:Melanie Griffith, Cast...",-23270150000.0
11724,The Paper,"Cast:Glenn Close, Cast:Jason Robards, Cast:Mic...",-23270150000.0
16802,The Grass Harp,"Cast:Charles Durning, Cast:Jack Lemmon, Cast:M...",-23270150000.0
