# EDA: User Preference Features

In [37]:
import os
import pickle
import pandas as pd
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
from scipy import stats

# Root data directory, given relative to the notebook's working directory.
DATA_PATH = "../data"
# Sub-folders of DATA_PATH; presumably the Netflix Prize and IMDb datasets
# (inferred from the names only — verify against the data layout).
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")
# Suffix embedded in the pickle file names that load_data() opens.
# NOTE(review): presumably the minimum-occurrence threshold used when the
# profiles were generated — confirm against the preprocessing step.
MIN_OCCURRENCES = 20

In [50]:
def load_data(data_path=None, min_occurrences=None):
    """Load the pickled user profiles and the feature-mapping table.

    Args:
        data_path: Directory that holds the pickles. When omitted, the
            notebook-level ``DATA_PATH`` is used.
        min_occurrences: Value embedded in both pickle file names. When
            omitted, the notebook-level ``MIN_OCCURRENCES`` is used.

    Returns:
        tuple: ``(user_profiles, feature_mapping)`` exactly as unpickled.

    Raises:
        FileNotFoundError: If either pickle file does not exist.
    """
    # Resolve the defaults at call time (not at definition time) so that
    # re-running the configuration cell is picked up without redefining
    # this function.
    if data_path is None:
        data_path = DATA_PATH
    if min_occurrences is None:
        min_occurrences = MIN_OCCURRENCES

    # NOTE(review): the profiles are read from the data root while the mapping
    # is read from its 'processed' sub-folder — confirm this is intentional.
    profiles_path = f'{data_path}/user_profiles_{min_occurrences}.pickle'
    mapping_path = f'{data_path}/processed/feature_mapping_{min_occurrences}.pickle'

    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted pipeline.
    with open(profiles_path, 'rb') as f:
        user_profiles = pickle.load(f)

    with open(mapping_path, 'rb') as f:
        feature_mapping = pickle.load(f)

    return user_profiles, feature_mapping

def create_ratings_distribution(user_profiles, top_k=50):
    """Build a stacked bar chart of high/low rating counts for the most active users.

    Args:
        user_profiles: Mapping of user id -> profile dict. Every profile is
            read for 'high_ratings', 'low_ratings' and 'total_ratings'.
        top_k: How many users to keep, ranked by 'total_ratings' descending.

    Returns:
        An Altair bar chart with one stacked bar per retained user. An empty
        ``user_profiles`` produces a chart over an empty table instead of
        raising ``KeyError`` while sorting.
    """
    columns = ['user_id', 'high_ratings', 'low_ratings', 'total_ratings']

    # Naming the columns explicitly keeps them present even when there are no
    # records, so the sort and melt below also work on empty input.
    ratings_data = pd.DataFrame(
        [
            {
                'user_id': user_id,
                'high_ratings': data['high_ratings'],
                'low_ratings': data['low_ratings'],
                'total_ratings': data['total_ratings']
            }
            for user_id, data in user_profiles.items()
        ],
        columns=columns
    )

    # 'total_ratings' is used only to rank users; the bars themselves are
    # built from the high and low counts.
    ratings_data = ratings_data.sort_values('total_ratings', ascending=False).head(top_k)

    # Altair wants long format: one row per (user, rating_type).
    ratings_long = pd.melt(
        ratings_data,
        id_vars=['user_id'],
        value_vars=['high_ratings', 'low_ratings'],
        var_name='rating_type',
        value_name='count'
    )

    chart = alt.Chart(ratings_long).mark_bar().encode(
        # sort='-y' orders users by descending y value; tick labels are hidden
        # because individual user ids are not meaningful on the axis.
        x=alt.X('user_id:N', sort='-y', axis=alt.Axis(labels=False, title='Users (sorted by total ratings)')),
        y=alt.Y('count:Q', stack='zero'),
        # Fixed domain/range pairing: green for high ratings, red for low.
        color=alt.Color('rating_type:N', scale=alt.Scale(
            domain=['high_ratings', 'low_ratings'],
            range=['#2ecc71', '#e74c3c']
        )),
        tooltip=['user_id', 'rating_type', 'count']
    ).properties(
        title='Ratings Distribution for Top Users',
        width=800,
        height=400
    )

    return chart

def _count_feature_ratios(user_profiles):
    """Tally, per feature, how many users fall in the high (>= 0.5) bucket and how many do not.

    Ratios equal to -1 are skipped. Both returned dicts share the same keys,
    in first-seen order.
    """
    high_counts = {}
    low_counts = {}
    for user_data in user_profiles.values():
        for feature_id, ratio in user_data['feature_preferences'].items():
            # NOTE(review): -1 is presumably the "no data" sentinel — confirm
            # against the code that builds the profiles.
            if ratio == -1:
                continue
            # Register the feature in both buckets so the dicts stay key-aligned.
            high_counts.setdefault(feature_id, 0)
            low_counts.setdefault(feature_id, 0)
            if ratio >= 0.5:
                high_counts[feature_id] += 1
            else:
                low_counts[feature_id] += 1
    return high_counts, low_counts


def _print_feature_table(df_features):
    """Print a fixed-width table with one row of counts per feature in ``df_features``."""
    header = (
        f"{'Feature ID':<10} {'Feature Name':<40} {'Total Count':<12} "
        f"{'High Ratio':<12} {'Low Ratio':<12} {'High/Low Ratio':<15}"
    )
    print("\nTop features by popularity:")
    print("\nFeature Statistics:")
    print(header)
    # Size the rule from the header so the two always line up.
    print("-" * len(header))

    for _, row in df_features.iterrows():
        feature_id = row['feature_id']
        feature_name = row['feature_name']
        high_ratio = row['high_ratio_count']
        low_ratio = row['low_ratio_count']
        total_count = row['total_counts']
        # A feature with no low-bucket users has an unbounded high/low ratio.
        ratio = high_ratio / low_ratio if low_ratio > 0 else float('inf')

        print(f"{feature_id:<10} {feature_name[:40]:<40} {total_count:<12} {high_ratio:<12} {low_ratio:<12} {ratio:.2f}")


def analyze_feature_popularity(user_profiles, feature_mapping, top_k=30):
    """Chart and tabulate the features that the most users have a ratio for.

    For every feature, users whose ratio is >= 0.5 are counted as "high" and
    all other users with a ratio other than -1 as "low". The ``top_k``
    features by combined count are charted as stacked bars and printed as a
    table.

    Args:
        user_profiles: Mapping whose values each provide a
            'feature_preferences' mapping of feature id -> ratio.
        feature_mapping: Object whose 'id_to_feature' entry maps a feature id
            to the name shown in the tooltip and the table.
        top_k: Number of features to keep, ranked by combined count.

    Returns:
        The bar chart layered with a text-mark chart (``chart + text``).
        When no feature has a usable ratio, the chart is built over an empty
        table and only the table header is printed.

    Raises:
        KeyError: If a counted feature id is absent from
            ``feature_mapping['id_to_feature']``.
    """
    high_counts, low_counts = _count_feature_ratios(user_profiles)

    feature_data = []
    for feature_id, high_count in high_counts.items():
        low_count = low_counts[feature_id]
        feature_name = feature_mapping['id_to_feature'][feature_id]
        feature_data.append({
            'feature_id': feature_id,
            'feature_name': feature_name,
            'feature_label': f'{feature_id}: {feature_name}',
            'high_ratio_count': high_count,
            'low_ratio_count': low_count,
            'total_counts': high_count + low_count
        })

    # Naming the columns explicitly keeps them present when nothing was
    # counted, so the sort and melt below do not raise on empty input.
    df_features = pd.DataFrame(
        feature_data,
        columns=[
            'feature_id', 'feature_name', 'feature_label',
            'high_ratio_count', 'low_ratio_count', 'total_counts'
        ]
    )
    df_features = df_features.sort_values('total_counts', ascending=False).head(top_k)

    # Altair wants long format: one row per (feature, ratio_type).
    features_long = pd.melt(
        df_features,
        id_vars=['feature_id', 'feature_name', 'feature_label'],
        value_vars=['high_ratio_count', 'low_ratio_count'],
        var_name='ratio_type',
        value_name='count'
    )

    chart = alt.Chart(features_long).mark_bar().encode(
        x=alt.X('feature_id:N',
                sort='-y',
                axis=alt.Axis(
                    title='Feature IDs',
                    labels=True,
                    labelAngle=45
                )),
        y=alt.Y('count:Q', stack='zero'),
        # Fixed domain/range pairing: green for the high bucket, red for low.
        color=alt.Color('ratio_type:N', scale=alt.Scale(
            domain=['high_ratio_count', 'low_ratio_count'],
            range=['#2ecc71', '#e74c3c']
        )),
        tooltip=[
            alt.Tooltip('feature_id:N', title='Feature ID'),
            alt.Tooltip('feature_name:N', title='Feature Name'),
            alt.Tooltip('ratio_type:N', title='Ratio Type'),
            alt.Tooltip('count:Q', title='Count')
        ]
    ).properties(
        title=alt.TitleParams(
            'Feature Popularity Distribution',
            subtitle='High ratio: 0.5-1.0, Low ratio: 0.0-0.5'
        ),
        width=800,
        height=400
    )

    # Text-mark layer positioned on the same stacked coordinates as the bars.
    text = alt.Chart(features_long).mark_text(
        angle=45,
        align='left',
        dx=5,
        dy=5
    ).encode(
        x=alt.X('feature_id:N', sort='-y'),
        y=alt.Y('count:Q', stack='zero'),
        # NOTE(review): the text channel is currently disabled, so this layer
        # has no label content bound to it — confirm whether that is intended.
        # text='feature_name:N',
        color=alt.value('black'),
        opacity=alt.condition(
            alt.datum.ratio_type == 'low_ratio_count',  # Only show text for one layer
            alt.value(0),
            alt.value(1)
        )
    )

    _print_feature_table(df_features)

    return chart + text

def analyze_feature_ratio_distribution(user_profiles):
    """Summarise how many features with a recorded ratio each user has.

    A feature counts for a user when its ratio is anything other than -1.
    Summary statistics and percentiles of the per-user counts are printed,
    and a bar chart of "number of users per feature count" is returned.

    Args:
        user_profiles: Mapping whose values each provide a
            'feature_preferences' mapping of feature id -> ratio.

    Returns:
        An Altair bar chart with feature count on x and user count on y.
        With no users, the statistics are skipped (they are undefined on
        empty input) and the chart is built over an empty table.
    """
    # One entry per user: the number of preferences that are not -1.
    user_feature_counts = [
        sum(1 for ratio in user_data['feature_preferences'].values() if ratio != -1)
        for user_data in user_profiles.values()
    ]

    print("\nFeature Ratio Distribution Statistics:")
    print(f"Total number of users: {len(user_feature_counts)}")

    if user_feature_counts:
        # Depending on the SciPy version, .mode is a scalar or a length-1
        # array; ravel()[0] yields the scalar in both cases.
        mode_value = np.ravel(stats.mode(user_feature_counts).mode)[0]

        print(f"Mean features per user: {np.mean(user_feature_counts):.2f}")
        print(f"Median features per user: {np.median(user_feature_counts)}")
        print(f"Mode features per user: {mode_value}")
        print(f"Standard deviation: {np.std(user_feature_counts):.2f}")
        print(f"Min features per user: {np.min(user_feature_counts)}")
        print(f"Max features per user: {np.max(user_feature_counts)}")

        percentiles = [10, 25, 50, 75, 90]
        print("\nPercentile Distribution:")
        for p in percentiles:
            value = np.percentile(user_feature_counts, p)
            print(f"{p}th percentile: {value:.1f} features")
    else:
        # min/max/percentile are undefined on an empty sample and would raise.
        print("No users found; skipping summary statistics.")

    # Collapse to one row per distinct feature count with its number of users.
    df_counts = pd.DataFrame(user_feature_counts, columns=['feature_count'])
    df_counts = df_counts.groupby('feature_count').size().reset_index(name='user_count')

    chart = alt.Chart(df_counts).mark_bar().encode(
        # bin=False: the data is already aggregated per exact count.
        x=alt.X('feature_count:Q', bin=False, title='Number of Non-negative Features'),
        y=alt.Y('user_count:Q', title='Number of Users'),
        tooltip=['feature_count', 'user_count']
    ).properties(
        title='Distribution of Non-negative Feature Counts per User',
        width=800,
        height=400
    )

    return chart

In [46]:
# Load both pickled inputs once; the cells below reuse these objects.
user_profiles, feature_mapping = load_data()

# Leaving the chart as the cell's last expression makes the notebook render it.
ratings_dist = create_ratings_distribution(user_profiles)
ratings_dist

In [51]:
# Prints the per-feature table as a side effect, then renders the returned chart.
feature_pop = analyze_feature_popularity(user_profiles, feature_mapping)
feature_pop


Top features by popularity:

Feature Statistics:
Feature ID Feature Name                             Total Count  High Ratio   Low Ratio    High/Low Ratio 
-----------------------------------------------------------------------------------------------
436        Runtime:90-120min                        9732         5204         4528         1.15
390        Genre:Drama                              9702         5143         4559         1.13
374        Decade:2000s                             8930         4213         4717         0.89
387        Genre:Comedy                             8719         4756         3963         1.20
373        Decade:1990s                             8552         4440         4112         1.08
383        Genre:Action                             8296         3535         4761         0.74
438        Runtime:>120min                          7972         3493         4479         0.78
384        Genre:Adventure                          7854         3809      

In [41]:
# Prints the summary statistics as a side effect, then renders the returned chart.
feature_ratio_dist = analyze_feature_ratio_distribution(user_profiles)
feature_ratio_dist


Feature Ratio Distribution Statistics:
Total number of users: 10000
Mean features per user: 50.39
Median features per user: 27.0
Mode features per user: 11
Standard deviation: 61.29
Min features per user: 3
Max features per user: 407

Percentile Distribution:
10th percentile: 11.0 features
25th percentile: 15.0 features
50th percentile: 27.0 features
75th percentile: 55.0 features
90th percentile: 124.0 features
