# IMDB Feature Extraction
In this notebook we extract movie features from [IMDB datasets](https://datasets.imdbws.com/) to map directly to the Netflix Prize dataset by adapting scripts from [this article by Ilya Grigorik](https://www.igvita.com/2007/01/27/correlating-netflix-and-imdb-datasets/). Each movie is mapped to an array of feature ids, which map to a wide variety of features (director, actor, composer, genre, decade, etc)

This notebook produces three pickle files (where n is the minimum feature count threshold):
1. `feature_mapping_{n}.pickle` (maps new serialized feature id's to feature)
2. `feature_frequencies_{n}.pickle` (maps the feature id's to their counts)
3. `movie_features_{n}.pickle` (maps each Netflix movie id to an array of feature id's)

In [1]:
import csv
import gzip
import itertools
import json
import os
import pickle
from collections import Counter
from typing import Any, Dict, Set

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz, process

In [2]:
# Root data directory (relative path).
DATA_PATH = "../data"
# Folder that load_netflix_titles() reads movie_titles.txt from.
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")
# Folder that load_imdb_data() reads the IMDB *.tsv.gz dumps from.
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")

## Utils
`load_netflix_titles()`
- You guessed it, loads netflix movie_titles.txt

`load_imdb_data()`
- Loads the IMDB datasets used here (name.basics, title.basics, title.crew, title.principals)
- Keeps only titles of type `movie`; TV show seasons and episodes, concerts, etc. are out of scope

`FeatureExtractor`
- Assigns id's to each feature in the IMDB dataset
- Counts the occurrences of each feature for each movie, mapping movies with features

In [3]:
def load_netflix_titles() -> Dict[int, Dict[str, str]]:
    """Load the Netflix Prize movie titles.

    Reads ``movie_titles.txt`` from ``NETFLIX_FOLDER_PATH``. Each non-blank
    line is split as ``movie_id,year,title``; only the first two commas are
    treated as separators, so titles may themselves contain commas.

    Returns:
        Mapping of integer movie id to ``{'title': ..., 'year': ...}``. The
        year is kept as the raw string found in the file.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or a
            non-integer movie id.
    """
    netflix_titles = {}
    netflix_titles_path = os.path.join(NETFLIX_FOLDER_PATH, "movie_titles.txt")

    with open(netflix_titles_path, 'r', encoding='latin-1') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no data and would otherwise fail to unpack below.
                continue
            movie_id, year, title = record.split(',', 2)
            netflix_titles[int(movie_id)] = {
                'title': title,
                'year': year
            }
    return netflix_titles

def load_imdb_data() -> Dict[str, pd.DataFrame]:
    """Load all required IMDB datasets.

    Reads the gzipped TSV dumps from ``IMDB_FOLDER_PATH``; ``\\N`` cells are
    parsed as missing values.

    Returns:
        Dict with one DataFrame per source file under the keys ``'names'``,
        ``'titles'``, ``'crew'`` and ``'principals'``, plus ``'movies'``: the
        rows of ``'titles'`` whose ``titleType`` is ``'movie'``.
    """
    datasets = {
        'names': 'name.basics.tsv.gz',
        'titles': 'title.basics.tsv.gz',
        'crew': 'title.crew.tsv.gz',
        'principals': 'title.principals.tsv.gz'
    }

    data = {}
    for key, filename in datasets.items():
        file_path = os.path.join(IMDB_FOLDER_PATH, filename)
        data[key] = pd.read_csv(
            file_path,
            sep='\t',
            na_values='\\N',
            compression='gzip',
            encoding='utf-8',
            # The IMDB dumps are not CSV-quoted, and titles can contain literal
            # double quotes. With the default quoting a stray '"' swallows
            # everything up to the next '"', silently merging rows.
            quoting=csv.QUOTE_NONE,
        )

    # Filter titles to movies only. The copy makes 'movies' an independent
    # frame, so later column assignments neither touch 'titles' nor trigger
    # SettingWithCopyWarning.
    titles = data['titles']
    data['movies'] = titles[titles['titleType'] == 'movie'].copy()

    return data


In [4]:
class FeatureExtractor:
    """Turn IMDB metadata of the matched movies into named features with integer ids.

    Features are strings of the form ``"<Kind>:<value>"`` (for example
    ``"Director:<name>"``, ``"Genre:Drama"``, ``"Decade:1990s"``,
    ``"Runtime:90-120min"``). Each distinct feature gets a sequential id on
    first use, and ``feature_counts`` records how many ``extract_features``
    calls produced it.

    Attributes:
        feature_to_id: feature name -> id assigned by this extractor.
        feature_counts: feature name -> number of extractions that produced it.
        movies / crew / names: the IMDB tables restricted to the matched
            movies (and the people credited on them), indexed by their id column.
        principals_dict: tconst -> list of ``(feature prefix, nconst)`` pairs.
    """

    # Columns of the crew table that hold comma-separated person ids.
    CREW_ROLES = ('directors', 'writers')

    # Principal categories that become features, mapped to their feature prefix.
    PRINCIPAL_CATEGORIES = {
        'actor': 'Cast',
        'actress': 'Cast',
        'producer': 'Producer',
        'composer': 'Composer'
    }

    def __init__(self, imdb_data: Dict[str, pd.DataFrame], netflix_to_imdb: Dict[int, str],
                 max_year: int = 2005):
        """Filter the IMDB tables down to the matched movies and build lookups.

        Args:
            imdb_data: tables under the keys 'movies', 'crew', 'principals'
                and 'names'. The frames are only read, never modified.
            netflix_to_imdb: Netflix movie id -> IMDB tconst; only the values
                are used here.
            max_year: movies whose startYear is later than this (or missing)
                are dropped.
        """
        print("Initializing FeatureExtractor...")
        self.feature_id_counter = itertools.count(1)
        self.feature_to_id: Dict[str, int] = {}
        self.feature_counts: Dict[str, int] = {}

        # Get the set of IMDB IDs we actually need
        relevant_imdb_ids = set(netflix_to_imdb.values())
        print(f"Filtering data for {len(relevant_imdb_ids)} matched movies...")

        # Convert startYear to numeric on a separate Series so the caller's
        # DataFrame is left untouched.
        all_movies = imdb_data['movies']
        start_year = pd.to_numeric(all_movies['startYear'], errors='coerce')

        # Filter out movies after max_year and create index
        movies_mask = (start_year <= max_year) & all_movies['tconst'].isin(relevant_imdb_ids)
        self.movies = (
            all_movies[movies_mask]
            .assign(startYear=start_year[movies_mask])
            .set_index('tconst')
        )
        print(f"Filtered movies to {len(self.movies)} entries")

        # Filter and index crew data
        all_crew = imdb_data['crew']
        self.crew = all_crew[all_crew['tconst'].isin(relevant_imdb_ids)].set_index('tconst')
        print(f"Filtered crew data to {len(self.crew)} entries")

        # Get relevant person IDs from crew data
        person_ids = set()
        for role in self.CREW_ROLES:
            if role in self.crew.columns:
                for people in self.crew[role].dropna():
                    person_ids.update(people.split(','))

        # Filter principals and get additional person IDs
        all_principals = imdb_data['principals']
        relevant_principals = all_principals[all_principals['tconst'].isin(relevant_imdb_ids)]
        person_ids.update(relevant_principals['nconst'].dropna().unique())

        # Filter and index names data to only relevant people
        all_names = imdb_data['names']
        self.names = all_names[all_names['nconst'].isin(person_ids)].set_index('nconst')
        print(f"Filtered names data to {len(self.names)} entries")

        # Plain dict for the per-person name lookups done in extract_features;
        # people without a primaryName are left out.
        self._name_by_person = self.names['primaryName'].dropna().to_dict()

        # Create principals lookup dictionary
        print("Creating principals mappings...")
        self.principals_dict = {}
        credited = relevant_principals[
            relevant_principals['category'].isin(list(self.PRINCIPAL_CATEGORIES))
            & relevant_principals['nconst'].notna()
        ]
        for movie_id, category, person_id in zip(
            credited['tconst'], credited['category'], credited['nconst']
        ):
            self.principals_dict.setdefault(movie_id, []).append(
                (self.PRINCIPAL_CATEGORIES[category], person_id)
            )

        print("FeatureExtractor initialization complete")

    def get_feature_id(self, feature: str) -> int:
        """Return the id of ``feature``, assigning the next free id on first use.

        Every call also increments the feature's entry in ``feature_counts``.
        """
        if feature not in self.feature_to_id:
            self.feature_to_id[feature] = next(self.feature_id_counter)
            self.feature_counts[feature] = 1
        else:
            self.feature_counts[feature] += 1
        return self.feature_to_id[feature]

    @staticmethod
    def _runtime_bin(runtime) -> str:
        """Map a runtime in minutes to its bin label."""
        if runtime < 60:
            return "<60min"
        if runtime < 90:
            return "60-90min"
        if runtime < 120:
            return "90-120min"
        return ">120min"

    def _add_person_feature(self, feature_names: Set[str], prefix: str, person_id: str) -> None:
        """Add ``"<prefix>:<name>"`` to ``feature_names`` if the person has a known name."""
        name = self._name_by_person.get(person_id)
        if name is not None:
            feature_names.add(f"{prefix}:{name}")

    def extract_features(self, imdb_id: str) -> Set[int]:
        """Extract all features for a movie.

        Args:
            imdb_id: IMDB tconst of the movie.

        Returns:
            Set of feature ids; empty if the movie is not among the filtered
            movies. Each feature of the movie is counted once per call.
        """
        # Get movie details
        if imdb_id not in self.movies.index:
            return set()

        movie = self.movies.loc[imdb_id]
        feature_names: Set[str] = set()

        # Process crew ('directors' -> "Director", 'writers' -> "Writer")
        if imdb_id in self.crew.index:
            crew_row = self.crew.loc[imdb_id]
            for role in self.CREW_ROLES:
                if pd.notna(crew_row[role]):
                    prefix = role.title()[:-1]
                    for person_id in crew_row[role].split(','):
                        self._add_person_feature(feature_names, prefix, person_id)

        # Process principals
        for category, person_id in self.principals_dict.get(imdb_id, ()):
            self._add_person_feature(feature_names, category, person_id)

        # Process genres
        if pd.notna(movie['genres']):
            for genre in movie['genres'].split(','):
                feature_names.add(f"Genre:{genre}")

        # Process decade
        if pd.notna(movie['startYear']):
            decade = (int(movie['startYear']) // 10) * 10
            feature_names.add(f"Decade:{decade}s")

        # Process runtimeMinutes (quantized into bins)
        if pd.notna(movie['runtimeMinutes']):
            runtime = pd.to_numeric(movie['runtimeMinutes'], errors='coerce')
            if pd.notna(runtime):
                feature_names.add(f"Runtime:{self._runtime_bin(runtime)}")

        # Register in sorted order so id assignment is reproducible between
        # runs (iteration order of a set of strings is not).
        return {self.get_feature_id(feature) for feature in sorted(feature_names)}


In [6]:
def clean_title(title):
    """Clean and validate a title string.

    Returns the title lower-cased with surrounding whitespace removed, or
    None when the input is not a string (None, NaN, numbers, lists, ...) or
    contains no alphabetic character.
    """
    # The type check alone rejects every missing-value marker. Calling
    # pd.isna() first would return an array for list-like input, and the
    # following `or` would then raise instead of rejecting the value.
    if not isinstance(title, str):
        return None
    cleaned = title.lower().strip()
    if not any(ch.isalpha() for ch in cleaned):
        return None
    return cleaned

def match_titles(netflix_titles: Dict[int, Dict[str, str]], 
                 imdb_movies: pd.DataFrame,
                 similarity_threshold: int = 90) -> Dict[int, str]:
    """Match Netflix titles to IMDB titles using fuzzy matching.

    For every Netflix title, the IMDB movies whose startYear lies within one
    year of the Netflix year are scored with ``fuzz.ratio`` on the cleaned
    titles (all IMDB movies are candidates when the Netflix year is not
    numeric). The first candidate with the highest score wins, provided the
    score reaches ``similarity_threshold``.

    Args:
        netflix_titles: Netflix movie id -> {'title': ..., 'year': ...}.
        imdb_movies: frame with 'tconst', 'primaryTitle' and 'startYear'
            columns; it is copied, not modified.
        similarity_threshold: minimum ``fuzz.ratio`` score for a match.

    Returns:
        Netflix movie id -> IMDB tconst for every title that found a match.
    """
    print("Preparing dataframes...")

    if not netflix_titles:
        # An empty mapping yields a frame without 'year'/'title' columns.
        print("\nMatching complete. Found 0 matches.")
        return {}

    # Convert Netflix titles to DataFrame
    netflix_df = pd.DataFrame.from_dict(netflix_titles, orient='index')
    netflix_df['year'] = pd.to_numeric(netflix_df['year'], errors='coerce')
    netflix_df = netflix_df.reset_index().rename(columns={'index': 'netflix_id'})

    # Clean Netflix titles
    netflix_df['clean_title'] = netflix_df['title'].apply(clean_title)
    netflix_df = netflix_df.dropna(subset=['clean_title'])
    print(f"Netflix movies (after cleaning): {len(netflix_df)} of {len(netflix_titles)} total")

    # Clean up IMDB movies DataFrame
    imdb_df = imdb_movies.copy()
    imdb_df['startYear'] = pd.to_numeric(imdb_df['startYear'], errors='coerce')
    imdb_df['clean_title'] = imdb_df['primaryTitle'].apply(clean_title)
    imdb_df = imdb_df.dropna(subset=['clean_title'])
    print(f"IMDB movies (after cleaning): {len(imdb_df)} of {len(imdb_movies)} total")

    # Pull the columns out once: iterating DataFrame rows inside the nested
    # loop below is far slower than indexing plain arrays/lists.
    imdb_years = imdb_df['startYear'].to_numpy(dtype=float, na_value=np.nan)
    imdb_ids = imdb_df['tconst'].tolist()
    imdb_clean_titles = imdb_df['clean_title'].tolist()
    imdb_primary_titles = imdb_df['primaryTitle'].tolist()
    all_positions = range(len(imdb_ids))

    netflix_to_imdb = {}
    total = len(netflix_df)

    print("Starting matching process...")

    netflix_rows = zip(
        netflix_df['netflix_id'].tolist(),
        netflix_df['title'].tolist(),
        netflix_df['year'].tolist(),
        netflix_df['clean_title'].tolist(),
    )

    # Match each Netflix movie id to an IMDB movie id via fuzzy match
    for position, (netflix_id, raw_title, netflix_year, netflix_title) in enumerate(netflix_rows):
        # Progress is based on the row position, not the index label, so the
        # percentage stays correct after rows were dropped during cleaning.
        report = position % 100 == 0
        if report:
            print(f"Processed {position}/{total} titles ({(position/total)*100:.1f}%)")

        # Filter IMDB movies by year (±1 year) for faster lookup; positions
        # stay in original row order so ties resolve to the first candidate.
        if pd.notna(netflix_year):
            in_window = (imdb_years >= netflix_year - 1) & (imdb_years <= netflix_year + 1)
            candidate_positions = np.flatnonzero(in_window)
        else:
            candidate_positions = all_positions

        # Find best match among candidates
        best_position = None
        best_score = 0

        for candidate in candidate_positions:
            score = fuzz.ratio(netflix_title, imdb_clean_titles[candidate])

            if score > best_score and score >= similarity_threshold:
                best_score = score
                best_position = candidate
                if score == 100:
                    # Nothing can beat a perfect score; later ties never win.
                    break

        if best_position is not None:
            netflix_to_imdb[netflix_id] = imdb_ids[best_position]
            if report:
                matched_title = imdb_primary_titles[best_position]
                print(f"Matched: {raw_title} -> {matched_title} (score: {best_score})")

    print(f"\nMatching complete. Found {len(netflix_to_imdb)} matches.")
    return netflix_to_imdb

def print_sample_matches(netflix_titles, imdb_movies, netflix_to_imdb, n=5):
    """Print n sample matches to verify matching quality.

    Args:
        netflix_titles: Netflix movie id (int) -> {'title': ..., 'year': ...}.
        imdb_movies: frame with 'tconst', 'primaryTitle' and 'startYear' columns.
        netflix_to_imdb: Netflix movie id -> IMDB tconst. Keys may be ints or
            their string form (which is what a JSON round-trip produces).
        n: number of leading entries of ``netflix_to_imdb`` to print.
    """
    print("\nSample matches:")
    samples = list(netflix_to_imdb.items())[:n]

    for netflix_id, imdb_id in samples:
        netflix_info = netflix_titles.get(netflix_id)
        if netflix_info is None:
            # Mapping keys loaded from JSON are strings; retry with the int id.
            try:
                netflix_info = netflix_titles.get(int(netflix_id))
            except (TypeError, ValueError):
                netflix_info = None

        imdb_rows = imdb_movies[imdb_movies['tconst'] == imdb_id]
        if netflix_info is None or imdb_rows.empty:
            # Keep going instead of aborting the whole report on one bad pair.
            print(f"\nSkipping {netflix_id} -> {imdb_id}: not found in the provided data")
            continue

        imdb_info = imdb_rows.iloc[0]
        print(f"\nNetflix: {netflix_info['title']} ({netflix_info['year']})")
        print(f"IMDB:    {imdb_info['primaryTitle']} ({imdb_info['startYear']})")
        print(f"IMDB ID: {imdb_id}")


## Extract Features

In [None]:
print("Loading data...")
# Netflix Prize catalogue: movie id -> {'title', 'year'}.
netflix_titles = load_netflix_titles()
# IMDB tables under the keys 'names', 'titles', 'crew', 'principals' and 'movies'.
imdb_data = load_imdb_data()

The latest release year in the Netflix dataset is 2005 (computed in the next cell), so IMDB movies released after 2005 are pruned out later on.

In [None]:
# Latest release year in the Netflix catalogue; entries whose year is not
# made up of digits only are skipped.
max(int(info['year']) for info in netflix_titles.values() if info['year'].isdigit())

Create mapping between Netflix and IMDB movie id's

In [9]:
# Cached Netflix id -> IMDB tconst mapping.
# NOTE(review): this path is relative to the working directory, unlike the
# DATA_PATH-based paths used for the other files - confirm where the cache lives.
netflix_to_imdb_path = 'data/netflix_to_imdb.json'

if os.path.exists(netflix_to_imdb_path):
    # Import the precomputed mapping. JSON object keys are always strings, so
    # convert them back to the integer Netflix ids used by netflix_titles.
    with open(netflix_to_imdb_path, 'r') as f:
        netflix_to_imdb = {int(key): tconst for key, tconst in json.load(f).items()}
else:
    # No cache yet: run the slow fuzzy matching once and store the result.
    print("Matching titles...")
    netflix_to_imdb = {
        int(key): tconst
        for key, tconst in match_titles(netflix_titles, imdb_data['movies']).items()
    }
    os.makedirs(os.path.dirname(netflix_to_imdb_path), exist_ok=True)
    with open(netflix_to_imdb_path, 'w') as f:
        json.dump(netflix_to_imdb, f)
    print_sample_matches(netflix_titles, imdb_data['movies'], netflix_to_imdb)

print(f"Matched {len(netflix_to_imdb)} movies")

Use the id mappings to extract features for each movie (cast, director, writer, producer, composer, genre, decade, runtime) from the IMDB dataset

In [None]:
print("Extracting features...")
extractor = FeatureExtractor(imdb_data, netflix_to_imdb)

# Netflix id -> set of feature ids; movies that yield no features are left out.
total = len(netflix_to_imdb)
movie_features = {}
for idx, netflix_id in enumerate(netflix_to_imdb, start=1):
    # Progress line every 100 movies.
    if not idx % 100:
        print(f"Processing movie {idx}/{total} ({idx/total*100:.1f}%)")
    imdb_id = netflix_to_imdb[netflix_id]
    features = extractor.extract_features(imdb_id)
    if not features:
        continue
    movie_features[netflix_id] = features


## Prune Features
- Remove features below 20 occurrences
- Assign new sequential id's to the remaining features, replacing the provisional id's handed out during extraction

In [5]:
# Minimum occurrence count a feature needs to survive pruning; also part of
# the output file names.
MIN_OCCURRENCES = 20

In [None]:
print("Pruning features...")
features_to_keep = {
    feature for feature, count in extractor.feature_counts.items() 
    if count >= MIN_OCCURRENCES
}

# Rebuild the feature IDs with new sequential IDs (alphabetical by feature name)
feature_to_id = {
    feature: idx for idx, feature in enumerate(sorted(features_to_keep), start=1)
}

# Translate extractor ids straight to the new ids. Building this lookup once
# avoids scanning every known feature for every movie.
old_to_new_id = {
    extractor.feature_to_id[feature]: new_id
    for feature, new_id in feature_to_id.items()
}

# Netflix id -> sorted list of new feature ids; movies left with no features
# after pruning are dropped.
pruned_features = {}
for netflix_id, old_feature_ids in movie_features.items():
    kept_features = sorted(
        old_to_new_id[old_id] for old_id in old_feature_ids if old_id in old_to_new_id
    )
    if kept_features:
        pruned_features[netflix_id] = kept_features


## Save Features

In [None]:
# Make sure the target folder is there before writing anything into it.
output_dir = os.path.join(DATA_PATH, "processed")
os.makedirs(output_dir, exist_ok=True)

print("Saving results...")

# Feature name <-> id lookup tables, stored together in one pickle.
mapping_path = os.path.join(output_dir, f'feature_mapping_{MIN_OCCURRENCES}.pickle')
mapping_payload = {
    'feature_to_id': feature_to_id,
    'id_to_feature': {v: k for k, v in feature_to_id.items()}
}
with open(mapping_path, 'wb') as mapping_file:
    pickle.dump(mapping_payload, mapping_file)

# Netflix movie id -> sorted list of pruned feature ids.
movie_features_path = os.path.join(output_dir, f'movie_features_{MIN_OCCURRENCES}.pickle')
with open(movie_features_path, 'wb') as movie_features_file:
    pickle.dump(pruned_features, movie_features_file)

print(f"Done! Processed {len(pruned_features)} movies with {len(feature_to_id)} features")
print(f"Results saved in {output_dir}")


## Save Feature Counts

In [15]:
def save_feature_frequencies(extractor, feature_to_id, output_dir):
    """Pickle the occurrence count of every kept feature, keyed by its new id.

    Args:
        extractor: object whose ``feature_counts`` maps feature name -> count.
        feature_to_id: feature name -> new id; features missing from this
            mapping are not written.
        output_dir: folder that receives
            ``feature_frequencies_{MIN_OCCURRENCES}.pickle``.
    """
    feature_frequencies = {
        feature_to_id[name]: occurrences
        for name, occurrences in extractor.feature_counts.items()
        if name in feature_to_id
    }

    target_file = f'feature_frequencies_{MIN_OCCURRENCES}.pickle'
    with open(os.path.join(output_dir, target_file), 'wb') as handle:
        pickle.dump(feature_frequencies, handle)

# Write the counts of the features that survived pruning, keyed by their new ids.
save_feature_frequencies(extractor, feature_to_id, output_dir)


## Import the Data (Sanity Check)

In [8]:
# Read back the files written above.
feature_mapping = pd.read_pickle(f'{DATA_PATH}/processed/feature_mapping_{MIN_OCCURRENCES}.pickle')
feature_frequencies = pd.read_pickle(f'{DATA_PATH}/processed/feature_frequencies_{MIN_OCCURRENCES}.pickle')
# movie_features = pd.read_pickle(f'data/processed/movie_features_{MIN_OCCURRENCES}.pickle')

# Invert the name -> id table so ids can be shown as readable feature names.
feature_to_id = feature_mapping['feature_to_id']
id_to_feature = dict(zip(feature_to_id.values(), feature_to_id.keys()))

# List every feature, most frequent first.
sorted_features = sorted(feature_frequencies.items(), key=lambda pair: pair[1], reverse=True)
for feature_id, count in sorted_features:
    print(f"{id_to_feature[feature_id]}: {count} occurrences")

Runtime:90-120min: 6013 occurrences
Genre:Drama: 5308 occurrences
Genre:Comedy: 3473 occurrences
Decade:1990s: 2906 occurrences
Decade:2000s: 2782 occurrences
Genre:Romance: 2062 occurrences
Genre:Crime: 1748 occurrences
Runtime:60-90min: 1732 occurrences
Genre:Action: 1723 occurrences
Runtime:>120min: 1601 occurrences
Decade:1980s: 1380 occurrences
Genre:Thriller: 1223 occurrences
Genre:Adventure: 1112 occurrences
Genre:Horror: 931 occurrences
Decade:1970s: 863 occurrences
Genre:Mystery: 697 occurrences
Decade:1960s: 601 occurrences
Genre:Fantasy: 550 occurrences
Genre:Sci-Fi: 540 occurrences
Genre:Family: 491 occurrences
Genre:Documentary: 442 occurrences
Decade:1950s: 439 occurrences
Genre:Biography: 425 occurrences
Genre:Music: 365 occurrences
Genre:War: 297 occurrences
Genre:History: 293 occurrences
Decade:1940s: 270 occurrences
Genre:Musical: 261 occurrences
Genre:Animation: 210 occurrences
Genre:Sport: 201 occurrences
Genre:Western: 188 occurrences
Decade:1930s: 144 occurrences
