# EDA: User Preference Features
In this notebook we analyze the engineered user preference features, specifically the distribution of high and low ratings amongst features and the sparsity of the user preference feature vectors.

<!-- We find that the distribution of positive and negative ratings is evenly balanced amongst the most popular features. -->
We find that the user feature vectors are sufficiently dense: the median user has 420 of 460 features filled in. Many features have been rated by all of the top 10k users.

We rank features by the number of users with a high preference ratio (0.5 or above), as well as by the proportion of high-preference users among the users who have a preference ratio for that feature, across the 10k users.

In [None]:
import os
import pickle
import pandas as pd
import altair as alt

# Constants
# Number interpolated into the processed pickle file names loaded later in this notebook.
MIN_OCCURRENCES = 20
# Root data directory (relative path).
DATA_PATH = "../data"
# Dataset sub-folders located under DATA_PATH.
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")

## Load Data

In [50]:
def _read_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE: unpickling can execute arbitrary code; only point this at trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Locations of the processed artifacts for the configured MIN_OCCURRENCES
feature_mapping_path = os.path.join(
    DATA_PATH, f"processed/feature_mapping_{MIN_OCCURRENCES}.pickle"
)
user_profiles_path = os.path.join(
    DATA_PATH, f"processed/user_profiles_{MIN_OCCURRENCES}.pickle"
)

# Feature mapping: the pickle holds both lookup directions
feature_mapping = _read_pickle(feature_mapping_path)
feature_to_id = feature_mapping["feature_to_id"]
id_to_feature = feature_mapping["id_to_feature"]

# User profiles
user_profiles = _read_pickle(user_profiles_path)

## Plot Top 20 Features By User Preference

In [52]:
def plot_feature_popularity_distribution(
    user_profiles, id_to_feature, k=20, sort_by='count'
):
    """
    Display a stacked bar chart of low/high preference counts for the top-k features.

    For every feature, users are split into two bins by their preference ratio
    ("[0,0.5)" and "[0.5,1]"). Ratios equal to -1 are skipped and count toward
    neither bin. Features are ranked, the first `k` are kept, and an Altair
    stacked bar chart (one bar per feature, one segment per bin) is displayed.

    Parameters:
    - user_profiles: dict of user profiles; each profile's "feature_preferences"
      entry maps feature IDs to preference ratios
    - id_to_feature: dict mapping feature IDs to feature names (used as bar labels)
    - k: number of top-ranked features to plot
    - sort_by: 'count' ranks by the number of users in the "[0.5,1]" bin,
               'proportion' ranks by that number divided by the number of users
               with a non -1 ratio for the feature

    Returns:
    - None; the chart is shown via `chart.display()`.

    Raises:
    - ValueError: if sort_by is neither 'proportion' nor 'count'
    """
    low_bin, high_bin = "[0,0.5)", "[0.5,1]"

    # Tally, per feature, how many users land in each ratio bin.
    tallies = {}
    for profile in user_profiles.values():
        for feature_id, ratio in profile["feature_preferences"].items():
            if ratio == -1:
                # -1 entries are ignored entirely: no bin, no total.
                continue
            tally = tallies.get(feature_id)
            if tally is None:
                tally = tallies[feature_id] = {
                    low_bin: 0,
                    high_bin: 0,
                    "non_minus_one": 0,
                    "proportion_high_pref": 0,
                }
            tally["non_minus_one"] += 1
            if ratio >= 0.5:
                tally[high_bin] += 1
            elif ratio < 0.5:
                tally[low_bin] += 1

    # Share of high-preference users among users with a non -1 ratio.
    for tally in tallies.values():
        rated_users = tally["non_minus_one"]
        tally["proportion_high_pref"] = (
            tally[high_bin] / rated_users if rated_users > 0 else 0
        )

    # Pick the ranking field and the matching title fragment.
    if sort_by == 'count':
        rank_field = high_bin
        title_suffix = 'High Preference Count'
    elif sort_by == 'proportion':
        rank_field = "proportion_high_pref"
        title_suffix = 'High Preference Proportion'
    else:
        raise ValueError("sort_by must be 'proportion' or 'count'")

    # Stable descending sort: ties keep first-seen order. Then keep the top k.
    top_k_features = sorted(
        tallies.items(), key=lambda item: item[1][rank_field], reverse=True
    )[:k]

    # Long-format rows: one row per (feature, bin).
    plot_rows = [
        {
            "feature_id": feature_id,
            "feature_name": id_to_feature[feature_id],
            "ratio_bin": bin_label,
            "count": tally[bin_label],
        }
        for feature_id, tally in top_k_features
        for bin_label in (low_bin, high_bin)
    ]
    df_plot = pd.DataFrame(plot_rows)

    # Explicit x-axis order so bars follow the ranking rather than alphabetical order.
    feature_names_ordered = [
        id_to_feature[feature_id] for feature_id, _ in top_k_features
    ]

    # Encoding channels for the stacked bars.
    x_channel = alt.X(
        "feature_name:N",
        sort=feature_names_ordered,
        axis=alt.Axis(title="Feature"),
    )
    y_channel = alt.Y("count:Q", axis=alt.Axis(title="Number of Users"))
    color_channel = alt.Color(
        "ratio_bin:N", legend=alt.Legend(title="Preference Ratio Bin")
    )
    stack_order = alt.Order("ratio_bin", sort="ascending")
    hover_fields = [
        alt.Tooltip("feature_name:N", title="Feature"),
        alt.Tooltip("ratio_bin:N", title="Preference Ratio Bin"),
        alt.Tooltip("count:Q", title="Number of Users"),
    ]

    bars = alt.Chart(df_plot).mark_bar().encode(
        x=x_channel,
        y=y_channel,
        color=color_channel,
        order=stack_order,
        tooltip=hover_fields,
    )
    chart = bars.properties(
        width=800,
        height=400,
        title=f"Feature Popularity Distribution (Top {k} Features by {title_suffix})",
    ).configure_axis(labelAngle=-45)

    chart.display()

# Draw the chart twice: first ranked by the count of users with high preference
# ratios, then ranked by the proportion of high preference users
for ranking in ("count", "proportion"):
    plot_feature_popularity_distribution(
        user_profiles, id_to_feature, k=20, sort_by=ranking
    )

## Feature Vector Sparsity Distribution
- Distribution, across the 10k users, of the number of features per user that have a preference ratio (i.e. whose value is not -1)

In [53]:
# For each user, count the features whose preference ratio is not -1
non_minus_one_counts = [
    sum(1 for ratio in profile["feature_preferences"].values() if ratio != -1)
    for profile in user_profiles.values()
]

# Histogram table: how many users have exactly n such features, ordered by n
counts_series = pd.Series(non_minus_one_counts)
counts_per_n = counts_series.value_counts().sort_index()
df_plot2 = pd.DataFrame(
    {
        "num_features": counts_per_n.index,
        "num_users": counts_per_n.values,
    }
)

# Encoding channels for the distribution bars
features_axis = alt.X(
    "num_features:Q",
    axis=alt.Axis(title="Number of Features with Preference Ratio Across Top 10k Users"),
)
users_axis = alt.Y("num_users:Q", axis=alt.Axis(title="Number of Users"))

chart2 = (
    alt.Chart(df_plot2)
    .mark_bar()
    .encode(x=features_axis, y=users_axis)
    .properties(
        width=800,
        height=400,
        title="Number of Features with Preference Ratio",
    )
)

# Render the chart
chart2.display()