# EDA: Heatmaps
In this notebook, we use heatmaps to analyze correlations between features. 

**User Preference Features**<br>
Intuitively, we interpret inter-feature preference correlations as "users who like feature x are also likely to enjoy feature y."
- Strong preference correlation between genres like Sci-Fi, Thriller, and Action
- Composers show the strongest preference correlations among person features, especially Jerry Goldsmith with Action/Thriller movies. Composers James Newton Howard and John Williams show similarly strong correlations.
- Robert De Niro has the strongest preference correlation with movies longer than 120 minutes, Drama, and Thriller.
- Danny DeVito has the strongest preference correlation with 90-120-minute movies.

**Movie Features** <br>
Similarly, we interpret inter-feature correlations as "movies with feature x are also likely to contain feature y"
- Naturally, the strongest correlations are between runtime, genre, and release decade. Therefore, we analyze the correlations without these features
- We find strong correlations between famous producer-composer-actor collaborations such as John Wayne and Elmer Bernstein

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Root directory of the data files, relative to the notebook's working directory.
DATA_PATH = "../data"
# Value embedded in the names of the processed pickle files that are loaded.
# Presumably the minimum-occurrence threshold used when those files were generated — TODO confirm.
MIN_OCCURRENCES = 20

## Load Data

In [2]:
# Resolve the locations of both processed artifacts before reading anything.
feature_mapping_path = os.path.join(DATA_PATH, f"processed/feature_mapping_{MIN_OCCURRENCES}.pickle")
user_profiles_path = os.path.join(DATA_PATH, f"processed/user_profiles_{MIN_OCCURRENCES}.pickle")

# NOTE: pickle.load can execute arbitrary code; only point these paths at trusted files.

# Feature mapping: a pickled dict holding the two lookups extracted below.
with open(feature_mapping_path, 'rb') as mapping_file:
    feature_mapping = pickle.load(mapping_file)
feature_to_id = feature_mapping['feature_to_id']
id_to_feature = feature_mapping['id_to_feature']

# User profiles
with open(user_profiles_path, 'rb') as profiles_file:
    user_profiles = pickle.load(profiles_file)

## Utils

In [3]:
def get_redundant_pairs(df):
    '''Collect the label pairs on and below the diagonal of a square matrix.

    For every position i and every j <= i, the pair (columns[i], columns[j])
    is included. Removing these pairs from a flattened correlation matrix
    leaves each unordered pair of distinct labels exactly once.

    Args:
        df: Square DataFrame (e.g. a correlation matrix); labels are taken
            from df.columns and the number of positions from df.shape[0].

    Returns:
        A set of (label, label) tuples.
    '''
    labels = df.columns
    n_rows = df.shape[0]
    return {
        (labels[row], labels[col])
        for row in range(n_rows)
        for col in range(row + 1)
    }

def get_top_correlations(corr_matrix, n=5):
    '''Return the n largest pairwise correlations, highest first.

    The matrix is flattened into a Series keyed by label pairs; the diagonal
    and the mirrored duplicate of each pair are removed before sorting.

    Args:
        corr_matrix: Square correlation DataFrame.
        n: Number of pairs to return.

    Returns:
        A Series of correlation values indexed by (label, label) pairs.
    '''
    duplicate_pairs = get_redundant_pairs(corr_matrix)
    unique_pairs = corr_matrix.unstack().drop(labels=duplicate_pairs)
    return unique_pairs.sort_values(ascending=False).iloc[:n]

def get_bottom_correlations(corr_matrix, n=5):
    '''Return the n smallest pairwise correlations, lowest first.

    The matrix is flattened into a Series keyed by label pairs; the diagonal
    and the mirrored duplicate of each pair are removed before sorting.

    Args:
        corr_matrix: Square correlation DataFrame.
        n: Number of pairs to return.

    Returns:
        A Series of correlation values indexed by (label, label) pairs.
    '''
    duplicate_pairs = get_redundant_pairs(corr_matrix)
    unique_pairs = corr_matrix.unstack().drop(labels=duplicate_pairs)
    return unique_pairs.sort_values().iloc[:n]

def plot_heatmap(corr_matrix, title):
    '''Draw an annotated heatmap of a correlation matrix and display it.

    Args:
        corr_matrix: Matrix of values to draw; the color scale is fixed to [-1, 1].
        title: Text shown above the heatmap.
    '''
    _, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
    ax.set_title(title)
    plt.show()

def print_top_correlations(corr_matrix, n=20, exclude_negative=False, description=''):
    '''Print the n highest and the n lowest pairwise correlations.

    Args:
        corr_matrix: Square correlation DataFrame.
        n: Number of pairs requested for each of the two lists.
        exclude_negative: When True, entries whose sign does not match the
            list they appear in are skipped: values <= 0 in the positive list
            and values >= 0 in the negative list.
        description: Text inserted into both header lines before the colon.
    '''
    top_corrs = get_top_correlations(corr_matrix, n)
    bottom_corrs = get_bottom_correlations(corr_matrix, n)

    # (header, pairs, predicate flagging a value whose sign does not fit the list)
    sections = (
        (
            f"\nTop {n} strongest positive correlations {description}:",
            top_corrs,
            lambda value: value <= 0,
        ),
        (
            f"\nTop {n} strongest negative correlations {description}:",
            bottom_corrs,
            lambda value: value >= 0,
        ),
    )

    for header, pairs, has_wrong_sign in sections:
        print(header)
        for (feature1, feature2), corr_value in pairs.items():
            if exclude_negative and has_wrong_sign(corr_value):
                continue
            print(f"{feature1} & {feature2}: Correlation = {corr_value:.2f}")

# User Preference Feature Correlations

In [4]:
# One record per user: the user id plus every feature whose preference ratio is
# non-negative. Features missing from a record become NaN in the resulting frame.
# NOTE(review): negative ratios are dropped — presumably a "no data" sentinel; confirm.
rows = [
    {
        'user_id': user_id,
        **{
            feature_id: ratio
            for feature_id, ratio in profile['feature_preferences'].items()
            if ratio >= 0
        },
    }
    for user_id, profile in user_profiles.items()
]

user_data = pd.DataFrame(rows).set_index('user_id')

In [None]:
k = 20  # How many of the most frequently present features to keep

# Rank features by the number of users with a non-null preference value.
feature_counts = user_data.count()
top_features = feature_counts.sort_values(ascending=False).iloc[:k].index
filtered_data = user_data.loc[:, top_features]
corr_matrix = filtered_data.corr()

# Relabel both axes from feature IDs to readable feature names.
feature_names = [id_to_feature[feature_id] for feature_id in top_features]
corr_matrix.index = corr_matrix.columns = feature_names

plot_heatmap(corr_matrix, 'Correlations Among Most Popular Features')

# Strongest User Feature Preference Correlations

In [None]:
# Compute the full pairwise correlation matrix over every feature column.
# NOTE(review): DataFrame.corr uses pairwise-complete observations. user_data holds NaN
# wherever a user has no value for a feature, so a pair shared by only a handful of
# users can show an extreme correlation — consider `min_periods` (confirm against the data).
full_corr_matrix = user_data.corr()

# Get top N strongest correlations by absolute value, counting each unordered pair once
N = 20
au_corr = full_corr_matrix.unstack()
labels_to_drop = get_redundant_pairs(full_corr_matrix)
au_corr = au_corr.drop(labels=labels_to_drop)
strongest_pairs = au_corr.abs().sort_values(ascending=False).head(N)

# Collect the features involved, de-duplicated in order of first appearance
# (i.e. ranked by the strength of their strongest pair). An ordered list keeps the
# heatmap axis order reproducible; iterating a set would yield an arbitrary order.
feature_ids = list(dict.fromkeys(
    feature_id
    for pair in strongest_pairs.index
    for feature_id in pair
))

filtered_data = user_data[feature_ids]
corr_matrix = filtered_data.corr()

# Map feature IDs to names
feature_names = [id_to_feature[feature_id] for feature_id in corr_matrix.columns]
corr_matrix.index = feature_names
corr_matrix.columns = feature_names

plot_heatmap(corr_matrix, 'Heatmap of Strongest Feature Preference Correlations')

# Movie Feature Correlations 

In [7]:
movie_features_path = os.path.join(DATA_PATH, f"processed/movie_features_{MIN_OCCURRENCES}.pickle")
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(movie_features_path, 'rb') as movie_file:
    movie_features = pickle.load(movie_file)

# Build the movie-feature indicator matrix: one row per movie, one column per
# feature ID, holding 1 where the movie lists the feature and 0 otherwise.
movie_ids = list(movie_features.keys())
data = [dict.fromkeys(feature_ids, 1) for feature_ids in movie_features.values()]

movie_data = (
    pd.DataFrame.from_records(data, index=movie_ids)
    .fillna(0)
    .astype(int)
)

In [None]:
k = 20

# Rank features by how many movies carry them and keep the k most common.
feature_counts = movie_data.sum()
top_features = feature_counts.sort_values(ascending=False).iloc[:k].index
df_top = movie_data.loc[:, top_features]
corr_matrix = df_top.corr()

# Relabel both axes from feature IDs to readable feature names.
feature_names = [id_to_feature[feature_id] for feature_id in top_features]
corr_matrix.index = corr_matrix.columns = feature_names

# Show the heatmap, then list the k highest and k lowest pairs.
plot_heatmap(corr_matrix, 'Correlations among Movie Features')
print_top_correlations(corr_matrix, n=k)

## Without Genre, Decade, or Runtime

In [None]:
# Collect the IDs of every feature whose name starts with Runtime, Genre, or Decade.
excluded_prefixes = ('Runtime', 'Genre', 'Decade')
excluded_feature_ids = [
    feature_id
    for feature_id, feature_name in id_to_feature.items()
    if feature_name.startswith(excluded_prefixes)
]

# errors='ignore' tolerates excluded IDs that have no column in movie_data.
movie_data_filtered = movie_data.drop(columns=excluded_feature_ids, errors='ignore')

# Re-rank the remaining features by frequency and keep the k most common.
k = 15
feature_counts = movie_data_filtered.sum()
top_features = feature_counts.sort_values(ascending=False).iloc[:k].index
df_top = movie_data_filtered.loc[:, top_features]
corr_matrix = df_top.corr()

# Relabel both axes from feature IDs to readable feature names.
feature_names = [id_to_feature[feature_id] for feature_id in top_features]
corr_matrix.index = corr_matrix.columns = feature_names

plot_heatmap(corr_matrix, 'Correlations among Movie Features (Excluding Runtime, Genre, Decade)')
print_top_correlations(corr_matrix, n=20, description='(excluding Runtime, Genre, Decade)')