In [3]:
import pandas as pd
import numpy as np
import polars as pl
import lib.models.content_based_2 as content_based
import lib.eval as eval
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

import random
import time

import importlib
importlib.reload(eval)
importlib.reload(content_based)

<module 'lib.models.content_based_2' from '/Users/mathiasraa/Desktop/ntnu/recommender-systems/src/lib/models/content_based_2.py'>

In [4]:
# Load the train and dev behavior logs. The files are tab-separated and are read
# without a header row, so the column names are assigned explicitly afterwards.
behavior_polars_train = pl.read_csv("../data/MINDlarge_train/behaviors.tsv", separator='\t', has_header=False)
behavior_polars_dev = pl.read_csv("../data/MINDlarge_dev/behaviors.tsv", separator='\t', has_header=False)
# Both splits get the same five column names.
behavior_polars_train.columns = ['impression_id', 'user_id', 'time', 'history', 'impressions']
behavior_polars_dev.columns = ['impression_id', 'user_id', 'time', 'history', 'impressions']


In [5]:
# Load the train and dev news metadata (tab-separated, no header row).
# quote_char=None is passed so that quote characters inside titles/abstracts are
# read as ordinary text rather than as CSV quoting.
news_train = pl.read_csv("../data/MINDlarge_train/news.tsv", separator='\t', has_header=False, quote_char=None)
news_dev = pl.read_csv("../data/MINDlarge_dev/news.tsv", separator='\t', has_header=False, quote_char=None)
# Both splits get the same eight column names.
news_train.columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
news_dev.columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']

## Feature engineering

In [6]:
def preprocess_behaviors(behaviors_pl):
    """
    Flatten raw behavior logs into one row per shown article.

    Steps:
      1. parse the ``time`` strings into datetimes,
      2. drop rows whose ``impressions`` value is null or empty,
      3. split ``impressions`` on spaces and explode, so each token gets a row,
      4. split every token on "-" into ``news_id`` and an Int32 ``click`` flag,
      5. discard tokens for which either part is missing or not castable.

    Returns a polars frame with the columns
    ``user_id``, ``impression_id``, ``time``, ``news_id`` and ``click``.
    """
    # Reusable expression: the two halves of a single impression token.
    token_parts = pl.col("impression_list").str.split("-")

    timed = behaviors_pl.with_columns(
        pl.col("time").str.to_datetime("%m/%d/%Y %I:%M:%S %p")
    )

    non_empty = timed.filter(
        ~pl.col("impressions").is_null() & (pl.col("impressions") != "")
    )

    # One row per token of the space-separated impressions string.
    per_token = non_empty.with_columns(
        pl.col("impressions").str.split(by=" ").alias("impression_list")
    ).explode("impression_list")

    labelled = per_token.with_columns([
        token_parts.list.get(0).alias("news_id"),
        # strict=False turns unparsable click flags into nulls instead of raising.
        token_parts.list.get(1).cast(pl.Int32, strict=False).alias("click"),
    ])

    is_complete = ~pl.col("news_id").is_null() & ~pl.col("click").is_null()
    wanted_columns = ["user_id", "impression_id", "time", "news_id", "click"]

    return labelled.filter(is_complete).select(wanted_columns)


def create_interaction_matrix(interactions_df):
    """
    Build a sparse user x news click matrix from a pandas interactions frame.

    Only rows with ``click == 1`` contribute. Users and news items are numbered
    in order of first appearance among the clicked rows. Repeated clicks on the
    same (user, news) pair add up in the resulting cell.

    Returns ``(interaction_matrix, user_map, news_map)`` where the maps translate
    a user_id / news_id into its row / column index.
    """
    clicks = interactions_df.loc[interactions_df['click'] == 1]
    clicked_users = clicks['user_id']
    clicked_news = clicks['news_id']

    distinct_users = clicked_users.unique()
    distinct_news = clicked_news.unique()

    # id -> position lookup tables
    user_map = dict(zip(distinct_users, range(len(distinct_users))))
    news_map = dict(zip(distinct_news, range(len(distinct_news))))

    row_positions = clicked_users.map(user_map).values
    col_positions = clicked_news.map(news_map).values
    weights = np.ones(len(row_positions))

    matrix_shape = (len(distinct_users), len(distinct_news))
    interaction_matrix = csr_matrix(
        (weights, (row_positions, col_positions)), shape=matrix_shape
    )

    return interaction_matrix, user_map, news_map

def compute_item_similarity(interaction_matrix):
    """
    Return the item-item cosine similarity derived from user interactions.

    The matrix is transposed first so that each row describes one item by the
    users who interacted with it; cosine similarity is then taken between rows.
    """
    return cosine_similarity(interaction_matrix.T)

## Revised model

In [7]:
def create_hybrid_similarity_matrix(interaction_matrix, news_df, news_map,
                                    collab_weight=0.7, content_weight=0.3):
    """
    Create a hybrid item-item similarity matrix that blends collaborative
    signals with content-based (category / subcategory) similarity.

    Parameters
    ----------
    interaction_matrix : sparse matrix
        User x item matrix; its columns are indexed by the values of ``news_map``.
    news_df : pandas.DataFrame
        News metadata with at least ``news_id``, ``category`` and ``subcategory``.
        If a news_id occurs more than once, its first row is used.
    news_map : dict
        Maps news_id -> item index (column of ``interaction_matrix``).
    collab_weight : float, default 0.7
        Weight of the collaborative cosine similarity.
    content_weight : float, default 0.3
        Weight of the content cosine similarity.

    Returns
    -------
    Square item x item matrix equal to
    ``collab_weight * collaborative + content_weight * content``.

    Raises
    ------
    ValueError
        If ``interaction_matrix`` does not have one column per entry of
        ``news_map`` (checked up front, before the expensive similarity work).
    """
    n_news = len(news_map)
    if interaction_matrix.shape[1] != n_news:
        raise ValueError(
            f"interaction_matrix has {interaction_matrix.shape[1]} item columns "
            f"but news_map has {n_news} entries"
        )

    # One-hot vocabularies for categories and subcategories
    categories = news_df['category'].unique().tolist()
    subcategories = news_df['subcategory'].unique().tolist()

    cat_map = {cat: i for i, cat in enumerate(categories)}
    subcat_map = {subcat: i for i, subcat in enumerate(subcategories)}

    # Build news_id -> category / subcategory lookups once, instead of scanning
    # the whole frame for every item. keep='first' mirrors taking the first
    # matching row per news_id.
    first_rows = news_df.drop_duplicates(subset='news_id', keep='first')
    category_of = dict(zip(first_rows['news_id'], first_rows['category']))
    subcategory_of = dict(zip(first_rows['news_id'], first_rows['subcategory']))

    # Features: [category_oh, subcategory_oh]
    subcat_offset = len(categories)
    feat_matrix = np.zeros((n_news, len(categories) + len(subcategories)))

    for news_id, idx in news_map.items():
        # Items without metadata keep an all-zero feature row.
        if news_id not in category_of:
            continue

        cat = category_of[news_id]
        if cat in cat_map:
            feat_matrix[idx, cat_map[cat]] = 1.0

        subcat = subcategory_of[news_id]
        if subcat in subcat_map:
            feat_matrix[idx, subcat_offset + subcat_map[subcat]] = 1.0

    # Compute content-based similarity
    content_similarity = cosine_similarity(feat_matrix)
    # Compute collaborative similarity (rows of the transpose are items)
    collaborative_similarity = cosine_similarity(interaction_matrix.T)

    # Weighted hybrid similarity
    hybrid_similarity = (collab_weight * collaborative_similarity
                         + content_weight * content_similarity)

    return hybrid_similarity

In [8]:
def sample_and_evaluate_hybrid(test_interactions, interaction_matrix, similarity_matrix, 
                       user_map, news_map, sampled_users):
    """
    Score the test candidates of the sampled users with an item-item
    similarity matrix.

    For every distinct (impression_id, user_id, news_id) triple of the sampled
    users, the score is the sum over the user's training interactions of
    ``similarity_matrix[interacted_item, candidate] * interaction_weight``.
    Candidates are skipped when the news item or the user is missing from the
    maps, or when the user has no training interactions.

    Parameters
    ----------
    test_interactions : polars.DataFrame
        Frame with ``impression_id``, ``user_id`` and ``news_id`` columns.
    interaction_matrix : sparse matrix
        User x item training matrix indexed through ``user_map`` / ``news_map``.
    similarity_matrix : 2-D array-like
        Item x item similarities, indexed by ``news_map`` values.
    user_map, news_map : dict
        id -> index lookups.
    sampled_users : collection
        user_ids to keep from ``test_interactions``.

    Returns
    -------
    polars.DataFrame with the columns ``impression_id``, ``news_id`` and
    ``score``. The same three columns are present even when nothing was scored.
    """
    test_sample = test_interactions.filter(pl.col("user_id").is_in(sampled_users))
    candidates = test_sample.select(["impression_id", "user_id", "news_id"]).unique()

    # user_idx -> (indices of interacted items, their weights). Densifying a
    # sparse row is the expensive step, so do it once per user rather than once
    # per candidate.
    profile_cache = {}

    predictions = []
    for impression_id, user_id, news_id in candidates.iter_rows():
        # Skip candidates or users that were never seen in training
        if news_id not in news_map or user_id not in user_map:
            continue

        user_idx = user_map[user_id]
        news_idx = news_map[news_id]

        if user_idx not in profile_cache:
            user_interactions = interaction_matrix[user_idx].toarray().flatten()
            interacted = np.where(user_interactions > 0)[0]
            profile_cache[user_idx] = (interacted, user_interactions[interacted])
        interacted_indices, weights = profile_cache[user_idx]

        if len(interacted_indices) == 0:
            continue

        # Similarity-weighted sum over the user's interacted items
        score = 0.0
        for idx, weight in zip(interacted_indices, weights):
            score += similarity_matrix[idx, news_idx] * weight

        predictions.append({
            "impression_id": impression_id,
            "news_id": news_id,
            "score": float(score)
        })

    if not predictions:
        # Keep the expected columns so downstream code can still select them.
        return pl.DataFrame(schema={
            "impression_id": candidates.schema["impression_id"],
            "news_id": candidates.schema["news_id"],
            "score": pl.Float64,
        })

    return pl.DataFrame(predictions)

## Getting some results

### Predictions 

In [None]:
def sample_and_evaluate(test_interactions, interaction_matrix, similarity_matrix, 
                       user_map, news_map, sampled_users):
    """
    Score the test candidates of the sampled users with an item-item
    similarity matrix, printing progress along the way.

    For every distinct (impression_id, user_id, news_id) triple of the sampled
    users, the score is the sum over the user's training interactions of
    ``similarity_matrix[interacted_item, candidate] * interaction_weight``.
    Index pairs that fall outside ``similarity_matrix`` contribute nothing.
    Candidates are skipped when the news item or the user is missing from the
    maps, or when the user has no training interactions.

    Parameters
    ----------
    test_interactions : polars.DataFrame
        Frame with ``impression_id``, ``user_id`` and ``news_id`` columns.
    interaction_matrix : sparse matrix
        User x item training matrix indexed through ``user_map`` / ``news_map``.
    similarity_matrix : 2-D array-like
        Item x item similarities, indexed by ``news_map`` values.
    user_map, news_map : dict
        id -> index lookups.
    sampled_users : collection
        user_ids to keep from ``test_interactions``.

    Returns
    -------
    polars.DataFrame with the columns ``impression_id``, ``news_id`` and
    ``score``. The same three columns are present even when nothing was scored.
    """
    # Filter test data to only include sampled users
    test_sample = test_interactions.filter(pl.col("user_id").is_in(sampled_users))
    print(f"Sample contains {len(test_sample)} interactions")

    # Extract unique user-news pairs to score
    unique_pairs = test_sample.select(["impression_id", "user_id", "news_id"]).unique()
    print(f"Unique pairs to score: {len(unique_pairs)}")

    # Convert to pandas for easier processing
    pairs_pd = unique_pairs.to_pandas()

    n_sim_rows, n_sim_cols = similarity_matrix.shape[0], similarity_matrix.shape[1]

    # user_idx -> (indices of interacted items, their weights). Densifying a
    # sparse row is the expensive step, so do it once per user rather than once
    # per candidate.
    profile_cache = {}
    predictions = []

    batch_size = 1000
    num_batches = (len(pairs_pd) + batch_size - 1) // batch_size

    print("Generating predictions...")
    for batch_idx in range(num_batches):
        if batch_idx % 10 == 0:
            print(f"Processing batch {batch_idx+1}/{num_batches}")

        batch = pairs_pd.iloc[batch_idx * batch_size:(batch_idx + 1) * batch_size]

        for row in batch.itertuples(index=False):
            user_id = row.user_id
            news_id = row.news_id
            impression_id = row.impression_id

            # Skip if the news item or the user was not seen in training.
            # Without the user check an unseen user raises KeyError below.
            if news_id not in news_map or user_id not in user_map:
                continue

            user_idx = user_map[user_id]
            news_idx = news_map[news_id]

            if user_idx not in profile_cache:
                user_interactions = interaction_matrix[user_idx].toarray().flatten()
                interacted = np.where(user_interactions > 0)[0]
                profile_cache[user_idx] = (interacted, user_interactions[interacted])
            interacted_indices, weights = profile_cache[user_idx]

            if len(interacted_indices) == 0:
                continue

            # Similarity-weighted sum; out-of-range indices are ignored so the
            # candidate still gets a (possibly zero) score.
            score = 0.0
            if news_idx < n_sim_cols:
                for idx, weight in zip(interacted_indices, weights):
                    if idx < n_sim_rows:
                        score += similarity_matrix[idx, news_idx] * weight

            predictions.append({
                "impression_id": impression_id,
                "news_id": news_id,
                "score": float(score)
            })

    if not predictions:
        # Keep the expected columns so downstream code can still select them.
        return pl.DataFrame(schema={
            "impression_id": unique_pairs.schema["impression_id"],
            "news_id": unique_pairs.schema["news_id"],
            "score": pl.Float64,
        })

    return pl.DataFrame(predictions)

### Results

In [10]:
# pandas copies of the training news metadata and the raw training behaviors.
# NOTE(review): behaviors_df is not referenced by any other cell visible here.
news_df = news_train.to_pandas()
behaviors_df = behavior_polars_train.to_pandas()

# Explode both behavior logs into one row per (impression, news) candidate.
train_interactions = preprocess_behaviors(behavior_polars_train)
test_interactions = preprocess_behaviors(behavior_polars_dev)

# The click matrix and the id maps come from the training split only; the
# item-item cosine similarity is derived from that matrix.
interaction_matrix, user_map, news_map = create_interaction_matrix(train_interactions.to_pandas())
item_similarity = compute_item_similarity(interaction_matrix)

In [None]:
def get_sampled_users(test_interactions, user_map, sample_size=3000, seed=None):
    """
    Sample users who exist in both test and training data.

    Parameters
    ----------
    test_interactions : frame-like
        ``test_interactions["user_id"].unique().to_list()`` must yield the test
        user ids (e.g. a polars DataFrame).
    user_map : dict
        Training user_id -> index map; its keys are the training users.
    sample_size : int, default 3000
        Maximum number of users to draw; capped at the number of shared users.
    seed : int or None, default None
        When given, the draw uses a private ``random.Random(seed)`` and is
        reproducible. When None, the module-level ``random`` state is used.

    Returns
    -------
    list of user ids, without repetition.
    """
    test_users = set(test_interactions["user_id"].unique().to_list())
    train_users = set(user_map.keys())

    # Set iteration order is not stable across interpreter runs for strings,
    # so sort the pool; otherwise the same seed could give different samples.
    valid_users = sorted(test_users.intersection(train_users))

    sample_size = min(sample_size, len(valid_users))
    rng = random if seed is None else random.Random(seed)

    return rng.sample(valid_users, sample_size)

# Draw the evaluation cohort once; the same list is passed to both the baseline
# and the hybrid scoring cells below so the two models are compared on it.
sampled_users = get_sampled_users(test_interactions, user_map)

In [26]:
# Baseline: score the sampled users' dev candidates with the purely
# collaborative item-item similarity.
predictions_df = sample_and_evaluate(
    test_interactions,
    interaction_matrix,
    item_similarity,
    user_map,
    news_map,
    sampled_users
)

Sample contains 166836 interactions
Unique pairs to score: 166836
Generating predictions...
Processing batch 1/167
Processing batch 11/167
Processing batch 21/167
Processing batch 31/167
Processing batch 41/167
Processing batch 51/167
Processing batch 61/167
Processing batch 71/167
Processing batch 81/167
Processing batch 91/167
Processing batch 101/167
Processing batch 111/167
Processing batch 121/167
Processing batch 131/167
Processing batch 141/167
Processing batch 151/167
Processing batch 161/167


In [27]:
# Evaluate the baseline predictions against the dev behaviors of the sampled users.
# NOTE(review): evaluate_mind_predictions lives in lib.eval and is not visible
# here; presumably it returns a mapping of metric name -> value — confirm.
eval_results = eval.evaluate_mind_predictions(
    predictions_df,
    behaviors_df=behavior_polars_dev.filter(pl.col("user_id").is_in(sampled_users)),  # Only use the sampled test data
    metrics=["auc", "mrr", "ndcg@5", "ndcg@10"]
)

# Render the metrics as a one-row table (cell output).
pd.DataFrame(eval_results, index=[0])

Unnamed: 0,auc,mrr,ndcg@5,ndcg@10
0,0.502965,0.239347,0.245703,0.297866


### Results from the revised model

In [28]:
# Blend collaborative and category/subcategory similarity for the items in news_map.
hybrid_similarity = create_hybrid_similarity_matrix(interaction_matrix, news_df, news_map)

In [29]:
# Revised model: score the same sampled users, this time with the hybrid similarity.
hybrid_predictions = sample_and_evaluate_hybrid(
    test_interactions, 
    interaction_matrix, 
    hybrid_similarity,
    user_map, 
    news_map, 
    sampled_users
)

In [None]:
# Evaluate the hybrid predictions with the same metrics and the same sampled
# dev behaviors as the baseline. This reuses the name eval_results, so the
# baseline metrics held in that variable are replaced.
eval_results = eval.evaluate_mind_predictions(
    hybrid_predictions,
    behaviors_df=behavior_polars_dev.filter(pl.col("user_id").is_in(sampled_users)), 
    metrics=["auc", "mrr", "ndcg@5", "ndcg@10"]
)

# Render the metrics as a one-row table (cell output).
pd.DataFrame(eval_results, index=[0])

Unnamed: 0,auc,mrr,ndcg@5,ndcg@10
0,0.582429,0.289803,0.302928,0.349088
