In [1]:
import pandas as pd
import numpy as np
import polars as pl
import lib.models.content_based_2 as content_based
import lib.eval as eval
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt


import importlib
importlib.reload(eval)
importlib.reload(content_based)

<module 'lib.models.content_based_2' from '/Users/mathiasraa/Desktop/ntnu/recommender-systems/lib/models/content_based_2.py'>

In [2]:
behavior_polars_train = pl.read_csv("data/MINDlarge_train/behaviors.tsv", separator='\t', has_header=False)
behavior_polars_dev = pl.read_csv("data/MINDlarge_dev/behaviors.tsv", separator='\t', has_header=False)
behavior_polars_train.columns = ['impression_id', 'user_id', 'time', 'history', 'impressions']
behavior_polars_dev.columns = ['impression_id', 'user_id', 'time', 'history', 'impressions']


In [3]:
news_train = pl.read_csv("data/MINDlarge_train/news.tsv", separator='\t', has_header=False, quote_char=None)
news_dev = pl.read_csv("data/MINDlarge_dev/news.tsv", separator='\t', has_header=False, quote_char=None)
news_train.columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
news_dev.columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']

In [None]:
def preprocess_behaviors(behaviors_df):
    """
    Fast processing of behaviors data using Polars' vectorized operations.
    
    Args:
        behaviors_df: DataFrame with impression logs (pandas or polars)
        
    Returns:
        Polars DataFrame with user-news interactions
    """
    # Convert to polars if needed
    if not isinstance(behaviors_df, pl.DataFrame):
        behaviors_pl = pl.from_pandas(behaviors_df)
    else:
        behaviors_pl = behaviors_df
    
    # Convert time to datetime
    behaviors_pl = behaviors_pl.with_columns(
        pl.col("time").str.to_datetime("%m/%d/%Y %I:%M:%S %p")
    )
    
    # Filter rows with valid impressions
    behaviors_pl = behaviors_pl.filter(
        ~pl.col("impressions").is_null() & (pl.col("impressions") != "")
    )
    
    # Split the impressions string into a list column
    with_splits = behaviors_pl.with_columns(
        pl.col("impressions").str.split(by=" ").alias("impression_list")
    )
    
    # Explode the list column
    exploded = with_splits.explode("impression_list")
    
    # Extract news_id and click from impression string
    processed = exploded.with_columns([
        pl.col("impression_list").str.split("-").list.get(0).alias("news_id"),
        pl.col("impression_list").str.split("-").list.get(1).cast(pl.Int32, strict=False).alias("click"),
    ])
    
    # Select only valid entries and necessary columns
    result = processed.filter(
        ~pl.col("news_id").is_null() & ~pl.col("click").is_null()
    ).select([
        "user_id", "impression_id", "time", "news_id", "click"
    ])
    
    return result


def create_interaction_matrix(interactions_df):
    """
    Create a user-item interaction matrix.
    
    Args:
        interactions_df: DataFrame with user_id, news_id, and click
        
    Returns:
        User-item matrix, user_map, item_map
    """
    # Filter to only clicked items (click=1)
    clicked_interactions = interactions_df[interactions_df['click'] == 1]
    
    # Create mappings for users and items to matrix indices
    user_ids = clicked_interactions['user_id'].unique()
    news_ids = clicked_interactions['news_id'].unique()
    
    user_map = {user_id: idx for idx, user_id in enumerate(user_ids)}
    news_map = {news_id: idx for idx, news_id in enumerate(news_ids)}
    
    # Create the interaction matrix
    rows = clicked_interactions['user_id'].map(user_map).values
    cols = clicked_interactions['news_id'].map(news_map).values
    values = np.ones(len(rows))  # Binary interaction
    
    interaction_matrix = csr_matrix((values, (rows, cols)), 
                                   shape=(len(user_ids), len(news_ids)))
    
    return interaction_matrix, user_map, news_map

def compute_item_similarity(interaction_matrix):
    """
    Compute item-item similarity using cosine similarity.
    
    Args:
        interaction_matrix: User-item interaction matrix
        
    Returns:
        Item-item similarity matrix
    """
    # Transpose to get item features (users who interacted with each item)
    item_features = interaction_matrix.T
    
    # Compute cosine similarity between items
    item_similarity = cosine_similarity(item_features)
    
    return item_similarity

def compute_user_similarity(interaction_matrix):
    """
    Compute user-user similarity using cosine similarity.
    
    Args:
        interaction_matrix: User-item interaction matrix
        
    Returns:
        User-user similarity matrix
    """
    # Compute cosine similarity between users
    user_similarity = cosine_similarity(interaction_matrix)
    
    return user_similarity

def get_user_history(user_id, behaviors_df):
    """
    Get a user's news click history.
    
    Args:
        user_id: ID of the user
        behaviors_df: DataFrame with behavior data
        
    Returns:
        List of news IDs the user has clicked
    """
    # Get all history entries for the user
    user_behaviors = behaviors_df[behaviors_df['user_id'] == user_id]
    
    # Extract clicked news IDs from history
    clicked_news = set()
    for history in user_behaviors['history']:
        if isinstance(history, str) and history.strip():
            clicked_news.update(history.split())
    
    return list(clicked_news)

def generate_news_recommendations(user_id, interaction_matrix, item_similarity, 
                                user_map, news_map, n_recommendations=10):
    """
    Generate item-based collaborative filtering recommendations for news.
    
    Args:
        user_id: ID of the user to generate recommendations for
        interaction_matrix: User-item interaction matrix
        item_similarity: Item-item similarity matrix
        user_map: Mapping from user IDs to matrix indices
        news_map: Mapping from news IDs to matrix indices
        n_recommendations: Number of recommendations to generate
        
    Returns:
        List of recommended news IDs
    """
    # Check if user is in the training set
    if user_id not in user_map:
        return []
    
    # Map user_id to matrix index
    user_idx = user_map[user_id]
    
    # Get items the user has already interacted with
    user_interactions = interaction_matrix[user_idx].toarray().flatten()
    already_interacted = np.where(user_interactions > 0)[0]
    
    # If user hasn't interacted with any items, return empty list
    if len(already_interacted) == 0:
        return []
    
    # Reverse the news mapping to get original IDs
    reverse_news_map = {idx: news_id for news_id, idx in news_map.items()}
    
    # Calculate scores for each news item
    scores = np.zeros(interaction_matrix.shape[1])
    
    for item_idx in already_interacted:
        # Get similarity of this news to all other news
        similarity_scores = item_similarity[item_idx]
        # Add weighted similarities to scores
        scores += similarity_scores * user_interactions[item_idx]
    
    # Set scores of already interacted items to -1 to exclude them
    scores[already_interacted] = -1
    
    # Get top N recommendations
    top_items_idx = np.argsort(scores)[::-1][:n_recommendations]
    recommendations = [reverse_news_map[idx] for idx in top_items_idx]
    
    return recommendations


In [42]:
import random
import time


def create_hybrid_similarity_matrix(interaction_matrix, news_df, news_map):
    """
    Create a hybrid similarity matrix incorporating both collaborative signals
    and content-based similarity.
    """
    # Create content features from news categories and subcategories
    news_ids = list(news_map.keys())
    n_news = len(news_ids)
    
    # Create one-hot encoding for categories and subcategories
    categories = news_df['category'].unique().tolist()
    subcategories = news_df['subcategory'].unique().tolist()
    
    cat_map = {cat: i for i, cat in enumerate(categories)}
    subcat_map = {subcat: i for i, subcat in enumerate(subcategories)}
    
    # Initialize feature matrix
    # Features: [category_oh, subcategory_oh]
    feat_matrix = np.zeros((n_news, len(categories) + len(subcategories)))
    
    for news_id, idx in news_map.items():
        news_info = news_df[news_df['news_id'] == news_id]
        if len(news_info) > 0:
            # Add category one-hot
            cat = news_info.iloc[0]['category']
            if cat in cat_map:
                feat_matrix[idx, cat_map[cat]] = 1.0
                
            # Add subcategory one-hot
            subcat = news_info.iloc[0]['subcategory']
            if subcat in subcat_map:
                feat_matrix[idx, len(categories) + subcat_map[subcat]] = 1.0
    
    # Compute content-based similarity
    content_similarity = cosine_similarity(feat_matrix)
    
    # Compute collaborative similarity
    collaborative_similarity = cosine_similarity(interaction_matrix.T)
    
    # Create hybrid similarity (0.7 * collaborative + 0.3 * content)
    # Adjust these weights based on your dataset
    hybrid_similarity = 0.7 * collaborative_similarity + 0.3 * content_similarity
    
    return hybrid_similarity

def sample_and_evaluate(test_interactions, interaction_matrix, similarity_matrix, 
                       user_map, news_map, news_df, behaviors_df, sample_size=1000):
    """
    Sample users and evaluate the model
    """
    start_time = time.time()
    
    # Sample users who exist in both test and training
    test_users = set(test_interactions["user_id"].unique().to_list())
    train_users = set(user_map.keys())
    valid_users = list(test_users.intersection(train_users))
    
    sample_size = min(sample_size, len(valid_users))
    sampled_users = random.sample(valid_users, sample_size)
    
    # Filter test data to only include sampled users
    test_sample = test_interactions.filter(pl.col("user_id").is_in(sampled_users))

    test_actual_sample = behavior_polars_dev.filter(pl.col("user_id").is_in(sampled_users))
    
    # Extract unique impression IDs in the sample
    impression_ids = test_sample["impression_id"].unique()
    
    # Create a map to look up user_id by impression_id 
    impression_to_user = {row["impression_id"]: row["user_id"] 
                          for row in test_sample.select(["impression_id", "user_id"]).unique().iter_rows(named=True)}
    
    # Extract all news IDs that need to be scored
    news_to_score = test_sample.select(["impression_id", "news_id"]).unique()
    
    # Generate predictions
    predictions = []
    for row in news_to_score.iter_rows(named=True):
        impression_id = row["impression_id"]
        news_id = row["news_id"]
        
        # Skip if impression or news not in maps
        if impression_id not in impression_to_user or news_id not in news_map:
            continue
        
        user_id = impression_to_user[impression_id]
        if user_id not in user_map:
            continue
            
        user_idx = user_map[user_id]
        news_idx = news_map[news_id]
        
        # Get user interactions
        user_interactions = interaction_matrix[user_idx].toarray().flatten()
        interacted_indices = np.where(user_interactions > 0)[0]
        
        if len(interacted_indices) == 0:
            continue
            
        # Calculate score by combining collaborative and recency factors
        collab_score = 0.0
        for idx in interacted_indices:
            collab_score += similarity_matrix[idx, news_idx] * user_interactions[idx]
        
        # Add time decay for older news (if applicable)
        # This would require time information for each news article
        
        # Add prediction
        predictions.append({
            "impression_id": impression_id,
            "news_id": news_id,
            "score": float(collab_score)
        })
    
    # Create prediction DataFrame
    if predictions:
        predictions_df = pl.DataFrame(predictions)
        print(f"Generated {len(predictions_df)} predictions in {time.time() - start_time:.2f} seconds")
        
        # Evaluate predictions
        eval_results = eval.evaluate_mind_predictions(
            predictions_df,
            behaviors_df=test_actual_sample,
            metrics=["auc", "mrr", "ndcg@5", "ndcg@10"]
        )
        
        return eval_results
    else:
        print("No predictions generated")
        return {}

In [5]:
news_df = news_train.to_pandas()
behaviors_df = behavior_polars_train.to_pandas()


In [8]:
interactions_df = preprocess_behaviors(behavior_polars_train)

In [9]:
test_interactions = preprocess_behaviors(behavior_polars_dev)

In [10]:
train_interactions, test_interactions = interactions_df, test_interactions

In [12]:
# 4. Create interaction matrix from training data
interaction_matrix, user_map, news_map = create_interaction_matrix(train_interactions.to_pandas())

In [13]:
# 5. Compute news-news similarity
item_similarity = compute_item_similarity(interaction_matrix)

In [36]:
import numpy as np
import polars as pl
import random
import time

# Start timing
start_time = time.time()

# Sample 1000 users who exist in both test and training data
test_users = set(test_interactions["user_id"].unique().to_list())
train_users = set(user_map.keys())
valid_users = list(test_users.intersection(train_users))

print(f"Found {len(valid_users)} users that exist in both training and test data")

# Sample 1000 users or fewer if there aren't that many
sample_size = min(3000, len(valid_users))
sampled_users = random.sample(valid_users, sample_size)
print(f"Sampled {len(sampled_users)} users for evaluation")

# Filter test data to only include sampled users
test_sample = test_interactions.filter(pl.col("user_id").is_in(sampled_users))
print(f"Sample contains {len(test_sample)} interactions")

test_actual_sample = behavior_polars_dev.filter(pl.col("user_id").is_in(sampled_users))

# Extract unique user-news pairs to score
unique_pairs = test_sample.select(["impression_id", "user_id", "news_id"]).unique()
print(f"Unique pairs to score: {len(unique_pairs)}")

# Convert to pandas for easier processing
pairs_pd = unique_pairs.to_pandas()

# Initialize predictions list
predictions = []

# Process in batches for better progress tracking
batch_size = 1000
num_batches = (len(pairs_pd) + batch_size - 1) // batch_size

print("Generating predictions...")
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(pairs_pd))
    
    if batch_idx % 10 == 0:
        print(f"Processing batch {batch_idx+1}/{num_batches}")
    
    batch = pairs_pd.iloc[start_idx:end_idx]
    
    for _, row in batch.iterrows():
        user_id = row['user_id']
        news_id = row['news_id']
        impression_id = row['impression_id']
        
        # Skip if news not in training
        if news_id not in news_map:
            continue
            
        # Get user profile
        user_idx = user_map[user_id]
        news_idx = news_map[news_id]
        
        # Get user interactions
        user_interactions = interaction_matrix[user_idx].toarray().flatten()
        interacted_indices = np.where(user_interactions > 0)[0]
        
        if len(interacted_indices) == 0:
            continue
            
        # Calculate score
        score = 0.0
        for idx in interacted_indices:
            if idx < item_similarity.shape[0] and news_idx < item_similarity.shape[1]:
                score += item_similarity[idx, news_idx] * user_interactions[idx]
        
        # Add prediction
        predictions.append({
            "impression_id": impression_id,
            "news_id": news_id,
            "score": float(score)
        })

Found 216778 users that exist in both training and test data
Sampled 3000 users for evaluation
Sample contains 167173 interactions
Unique pairs to score: 167173
Generating predictions...
Processing batch 1/168
Processing batch 11/168
Processing batch 21/168
Processing batch 31/168
Processing batch 41/168
Processing batch 51/168
Processing batch 61/168
Processing batch 71/168
Processing batch 81/168
Processing batch 91/168
Processing batch 101/168
Processing batch 111/168
Processing batch 121/168
Processing batch 131/168
Processing batch 141/168
Processing batch 151/168
Processing batch 161/168


In [37]:
predictions_df = pd.DataFrame(predictions)

eval_results = eval.evaluate_mind_predictions(
    predictions_df,
    behaviors_df=test_actual_sample,  # Only use the sampled test data
    metrics=["auc", "mrr", "ndcg@5", "ndcg@10"]
)

pd.DataFrame(eval_results, index=[0])

Unnamed: 0,auc,mrr,ndcg@5,ndcg@10
0,0.509739,0.232029,0.240954,0.28879


In [41]:
hybrid_similarity = create_hybrid_similarity_matrix(interaction_matrix, news_df, news_map)



In [43]:
# Evaluate with the enhanced approach
results = sample_and_evaluate(
    test_interactions, 
    interaction_matrix, 
    hybrid_similarity,
    user_map, 
    news_map, 
    news_df,
    behaviors_df,
    sample_size=1000
)


Generated 39100 predictions in 2.28 seconds


In [45]:
pd.DataFrame(results, index=[0])

Unnamed: 0,auc,mrr,ndcg@5,ndcg@10
0,0.581852,0.281212,0.293306,0.345894
