In [5]:
import pickle
import pandas as pd
import altair as alt
from collections import Counter
import re
import os

# Threshold that selects which processed pickle files are loaded; it is
# embedded in their file names.
MIN_OCCURRENCES = 20

# Dataset locations, given relative to the notebook's working directory.
DATA_PATH = "../data"
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")

In [3]:
def load_data(data_path, min_occurrences):
    """Read the pickled feature mapping and movie features from disk.

    Both files live under ``{data_path}/processed`` and carry
    ``min_occurrences`` in their file names.

    Args:
        data_path: Directory that contains the ``processed`` sub-folder.
        min_occurrences: Value embedded in the two pickle file names.

    Returns:
        A ``(feature_mapping, movie_features)`` tuple holding the two
        unpickled objects, in that order.

    Note:
        ``pickle.load`` can execute arbitrary code while deserialising, so
        only point this at files from a trusted source.
    """
    mapping_path = f'{data_path}/processed/feature_mapping_{min_occurrences}.pickle'
    features_path = f'{data_path}/processed/movie_features_{min_occurrences}.pickle'

    # The mapping is read (and its file closed) before the features file is opened.
    with open(mapping_path, 'rb') as mapping_file:
        feature_mapping = pickle.load(mapping_file)

    with open(features_path, 'rb') as features_file:
        movie_features = pickle.load(features_file)

    return feature_mapping, movie_features

def calculate_feature_frequencies(movie_features, feature_mapping):
    """Count how often each feature ID occurs across all movies.

    Args:
        movie_features: Mapping whose values are iterables of feature IDs
            (the keys are not used here).
        feature_mapping: Mapping with an ``'id_to_feature'`` entry that
            translates a feature ID into its name.

    Returns:
        A DataFrame with one row per distinct feature ID and the columns
        ``feature_id``, ``feature_name``, ``frequency`` and ``category``.
        ``category`` is the part of the name before the first ``':'``, or
        ``'Other'`` when the name contains no colon. When there are no
        features at all, an empty DataFrame with these same columns is
        returned instead of failing.

    Raises:
        KeyError: If ``feature_mapping`` has no ``'id_to_feature'`` entry, or
            a counted feature ID is missing from it (assuming a dict-like
            lookup table).
    """
    id_to_feature = feature_mapping['id_to_feature']

    # Count straight from a generator; no flattened intermediate list is needed.
    feature_counts = Counter(
        feat for feats in movie_features.values() for feat in feats
    )

    records = [
        {
            'feature_id': feat_id,
            'feature_name': id_to_feature[feat_id],
            'frequency': count,
        }
        for feat_id, count in feature_counts.items()
    ]

    # Naming the columns explicitly keeps the schema intact for empty input;
    # pd.DataFrame([]) has no columns, so the 'feature_name' lookup below
    # would otherwise raise KeyError.
    feature_freq_df = pd.DataFrame(
        records, columns=['feature_id', 'feature_name', 'frequency']
    )
    # An empty frame gets object-dtype columns; keep the counts numeric so
    # that numeric operations such as nlargest still work on it.
    feature_freq_df['frequency'] = feature_freq_df['frequency'].astype('int64')

    def _category_of(name):
        # The category is the text before the first colon, if there is one.
        return name.split(':', 1)[0] if ':' in name else 'Other'

    feature_freq_df['category'] = feature_freq_df['feature_name'].map(_category_of)

    return feature_freq_df

def plot_top_features(feature_freq_df, top_n=30):
    """Build a horizontal bar chart of the ``top_n`` highest-frequency features.

    Args:
        feature_freq_df: DataFrame with ``feature_name``, ``frequency`` and
            ``category`` columns.
        top_n: Number of rows to keep, selected by largest ``frequency``.

    Returns:
        An Altair chart with bars coloured by category and ordered by
        descending frequency.
    """
    most_frequent = feature_freq_df.nlargest(top_n, 'frequency')

    bar_encoding = {
        'x': alt.X('frequency:Q', title='Number of Movies'),
        # sort='-x' orders the feature names by descending bar length.
        'y': alt.Y('feature_name:N', sort='-x', title='Feature'),
        'color': 'category:N',
        'tooltip': ['feature_name', 'frequency', 'category'],
    }

    bars = alt.Chart(most_frequent).mark_bar().encode(**bar_encoding)
    return bars.properties(
        title=f'Top {top_n} Most Frequent Features',
        width=800,
        height=500,
    )

def plot_category_distributions(feature_freq_df):
    """Build a box plot of feature frequencies, one box per category.

    Args:
        feature_freq_df: DataFrame with ``frequency`` and ``category``
            columns.

    Returns:
        An Altair box-plot chart with frequency on a log-scaled x axis and
        one row (and colour) per category.
    """
    # Frequencies are drawn on a log axis, which cannot place values <= 0;
    # this assumes every frequency is a positive count.
    frequency_axis = alt.X(
        'frequency:Q',
        scale=alt.Scale(type='log'),
        title='Frequency (log scale)',
    )

    box_plot = alt.Chart(feature_freq_df).mark_boxplot().encode(
        x=frequency_axis,
        y=alt.Y('category:N', title='Feature Category'),
        color='category:N',
        tooltip=['category', 'frequency'],
    ).properties(
        title='Feature Frequency Distribution by Category',
        width=800,
        height=400,
    )

    return box_plot

def analyze_specific_roles(feature_freq_df, role, top_n=20):
    """Chart the most frequent features whose name starts with ``'<role>:'``.

    Args:
        feature_freq_df: DataFrame with ``feature_name`` and ``frequency``
            columns.
        role: Name prefix to filter on, e.g. ``'Writer'``, ``'Director'``
            or ``'Composer'``.
        top_n: Number of rows to keep, selected by largest ``frequency``.

    Returns:
        An Altair horizontal bar chart of the selected features, ordered by
        descending frequency.
    """
    # Keep only the features whose name carries this role's prefix.
    has_role_prefix = feature_freq_df['feature_name'].str.startswith(f'{role}:')
    top_contributors = feature_freq_df[has_role_prefix].nlargest(top_n, 'frequency')

    count_axis = alt.X('frequency:Q', title='Number of Movies')
    name_axis = alt.Y(
        'feature_name:N',
        sort='-x',
        title=f'Top {role}s',
        # Raise the label width limit so long names are not cut short.
        axis=alt.Axis(labelLimit=200),
    )

    bars = alt.Chart(top_contributors).mark_bar().encode(
        x=count_axis,
        y=name_axis,
        tooltip=['feature_name', 'frequency'],
    )
    return bars.properties(
        title=f'Top {top_n} Most Frequent {role}s',
        width=800,
        height=500,
    )

# Entry point that chains loading, counting and plotting.
def run_analysis(data_path, min_occurrences):
    """Run the full feature-frequency analysis and collect its outputs.

    Args:
        data_path: Directory handed to ``load_data``.
        min_occurrences: Threshold handed to ``load_data``.

    Returns:
        A dict with two entries: ``'feature_frequencies'`` (the frequency
        DataFrame) and ``'plots'`` (a dict of charts keyed ``'top_features'``,
        ``'category_distribution'``, ``'top_writers'``, ``'top_directors'``
        and ``'top_composers'``).
    """
    feature_mapping, movie_features = load_data(data_path, min_occurrences)
    feature_freq_df = calculate_feature_frequencies(movie_features, feature_mapping)

    # Overview charts covering all features.
    plots = {
        'top_features': plot_top_features(feature_freq_df),
        'category_distribution': plot_category_distributions(feature_freq_df),
    }

    # One bar chart per role, stored under its own key.
    role_plot_keys = (
        ('top_writers', 'Writer'),
        ('top_directors', 'Director'),
        ('top_composers', 'Composer'),
    )
    for plot_key, role in role_plot_keys:
        plots[plot_key] = analyze_specific_roles(feature_freq_df, role)

    return {
        'feature_frequencies': feature_freq_df,
        'plots': plots,
    }

In [6]:
results = run_analysis(DATA_PATH, MIN_OCCURRENCES)
feature_freq_df = results['feature_frequencies']

# Show each chart in turn: the two overviews first, then one per role.
for plot_name in (
    'top_features',
    'category_distribution',
    'top_writers',
    'top_directors',
    'top_composers',
):
    results['plots'][plot_name].show()