In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Loading the datasets

In [21]:
news_data = pd.read_csv("data/MINDlarge_train/news.tsv", header=None, sep='\t')
news_data.columns = ['article_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']

news_data.head()

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [22]:
behavior_data = pd.read_csv("data/MINDlarge_train/behaviors.tsv", header=None, sep='\t')
behavior_data.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']

behavior_data.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


## Now we have to create the user-item matrix

In [23]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # Optional for progress tracking

def process_interactions_efficiently(behavior_data, batch_size=10000):
    """
    Process behavior data to extract user-item interactions efficiently using batching.
    """
    total_batches = (len(behavior_data) + batch_size - 1) // batch_size
    interaction_dfs = []
    
    for batch_num in range(total_batches):
        # Get a batch of the data
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(behavior_data))
        batch = behavior_data.iloc[start_idx:end_idx]
        
        # Filter rows with valid history
        valid_rows = batch[batch['history'].notna() & (batch['history'].str.strip() != '')]
        
        if len(valid_rows) > 0:
            # Apply vectorized operations
            temp_df = valid_rows[['user_id', 'history']].copy()
            temp_df['article_id'] = temp_df['history'].str.split()
            # Explode to create one row per user-article interaction
            temp_df = temp_df.explode('article_id')
            # Keep only the columns we need
            temp_df = temp_df[['user_id', 'article_id']]
            
            interaction_dfs.append(temp_df)
    
    # Combine all batches into final dataframe
    if interaction_dfs:
        interactions_df = pd.concat(interaction_dfs, ignore_index=True)
    else:
        interactions_df = pd.DataFrame(columns=['user_id', 'article_id'])
        
    return interactions_df

def create_sparse_matrices(interactions_df):
    """
    Create sparse user-item matrix and compute sparse item similarity matrix.
    Returns both matrices and mapping dictionaries.
    """
    # Create mappings from IDs to indices
    user_ids = interactions_df['user_id'].unique()
    article_ids = interactions_df['article_id'].unique()
    
    user_id_to_idx = {id: i for i, id in enumerate(user_ids)}
    article_id_to_idx = {id: i for i, id in enumerate(article_ids)}
    
    # Map the original IDs to matrix indices
    user_indices = interactions_df['user_id'].map(user_id_to_idx).values
    article_indices = interactions_df['article_id'].map(article_id_to_idx).values
    
    # Create interaction values (all 1s for implicit feedback)
    interaction_values = np.ones(len(interactions_df), dtype=np.float32)
    
    # Create the sparse user-item matrix
    sparse_user_item = csr_matrix(
        (interaction_values, (user_indices, article_indices)),
        shape=(len(user_ids), len(article_ids))
    )
    
    # Create item-item similarity matrix (cosine similarity between items)
    print("Computing item similarity matrix (this might take a while)...")
    sparse_item_similarity = cosine_similarity(sparse_user_item.T, dense_output=False)
    
    # Create reverse mappings to convert back to original IDs
    idx_to_user_id = {i: id for id, i in user_id_to_idx.items()}
    idx_to_article_id = {i: id for id, i in article_id_to_idx.items()}
    
    return (sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, 
            idx_to_user_id, idx_to_article_id)

def get_item_recommendations_sparse(user_id, 
                                  sparse_user_item, 
                                  sparse_item_similarity,
                                  user_id_to_idx, 
                                  article_id_to_idx,
                                  idx_to_article_id,
                                  top_n=5):
    """
    Generate top-n item recommendations for a given user using sparse matrices.
    """
    # Convert user_id to matrix index
    if user_id not in user_id_to_idx:
        return []
    
    user_idx = user_id_to_idx[user_id]
    
    # Get items the user has interacted with
    user_interactions = sparse_user_item[user_idx].toarray().flatten()
    interacted_item_indices = np.where(user_interactions > 0)[0]
    
    if len(interacted_item_indices) == 0:
        return []  # User has no interactions, cannot recommend
    
    # Initialize scores array for all items
    scores = np.zeros(sparse_item_similarity.shape[0])
    
    # For each item the user has interacted with
    for item_idx in interacted_item_indices:
        # Get similarity scores for this item with all other items
        similarity_scores = sparse_item_similarity[item_idx].toarray().flatten()
        # Add to accumulated scores
        scores += similarity_scores
    
    # Set scores of items the user has already interacted with to -1 (to exclude them)
    scores[interacted_item_indices] = -1
    
    # Get indices of top_n items with highest scores
    recommended_indices = np.argsort(scores)[::-1][:top_n]
    
    # Convert indices back to article IDs
    recommended_articles = [idx_to_article_id[idx] for idx in recommended_indices if scores[idx] > 0]
    
    return recommended_articles

def batch_generate_recommendations(user_ids, 
                                  sparse_user_item, 
                                  sparse_item_similarity,
                                  user_id_to_idx, 
                                  article_id_to_idx,
                                  idx_to_article_id,
                                  top_n=5):
    """
    Generate recommendations for multiple users efficiently.
    """
    recommendations = {}
    
    for user_id in tqdm(user_ids, desc="Generating recommendations"):
        recs = get_item_recommendations_sparse(
            user_id, 
            sparse_user_item, 
            sparse_item_similarity,
            user_id_to_idx, 
            article_id_to_idx,
            idx_to_article_id,
            top_n=top_n
        )
        recommendations[user_id] = recs
        
    return recommendations


def collaborative_filtering_pipeline(behavior_data, top_n=5, sample_users=None):
    """
    Complete pipeline for collaborative filtering.
    
    Parameters:
    -----------
    behavior_data : DataFrame
        DataFrame containing user_id and history columns
    top_n : int
        Number of recommendations per user
    sample_users : list or None
        List of specific user_ids to generate recommendations for,
        or None to use all users
        
    Returns:
    --------
    tuple: (recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id)
    """
    print("Processing interactions...")
    interactions_df = process_interactions_efficiently(behavior_data)
    
    print("Creating sparse matrices...")
    (sparse_user_item, sparse_item_similarity, 
     user_id_to_idx, article_id_to_idx, 
     idx_to_user_id, idx_to_article_id) = create_sparse_matrices(interactions_df)
    
    print(f"User-item matrix shape: {sparse_user_item.shape}")
    print(f"Density: {sparse_user_item.nnz / (sparse_user_item.shape[0] * sparse_user_item.shape[1]):.6f}")
    
    if sample_users is None:
        # Use all users (or first 100 for demonstration)
        sample_users = list(user_id_to_idx.keys())[:500]  # Limit for demonstration
    
    print(f"Generating recommendations for {len(sample_users)} users...")
    recommendations = batch_generate_recommendations(
        sample_users,
        sparse_user_item, 
        sparse_item_similarity,
        user_id_to_idx, 
        article_id_to_idx,
        idx_to_article_id,
        top_n=top_n
    )
    
    # Return all necessary variables for evaluation
    return (recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id)


## Evaluating

In [24]:
test_behavior_data = pd.read_csv("data/MINDlarge_dev/behaviors.tsv", header=None, sep='\t')
test_behavior_data.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']

test_behavior_data.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U134050,11/15/2019 8:55:22 AM,N12246 N128820 N119226 N4065 N67770 N33446 N10...,N91737-0 N30206-0 N54368-0 N117802-0 N18190-0 ...
1,2,U254959,11/15/2019 11:42:35 AM,N34011 N9375 N67397 N7936 N118985 N109453 N103...,N119999-0 N24958-0 N104054-0 N33901-0 N9250-0 ...
2,3,U499841,11/15/2019 9:08:21 AM,N63858 N26834 N6379 N85484 N15229 N65119 N1047...,N18190-0 N89764-0 N91737-0 N54368-0 N49978-1 N...
3,4,U107107,11/15/2019 5:50:31 AM,N12959 N8085 N18389 N3758 N9740 N90543 N129790...,N122944-1 N18190-0 N55801-0 N59297-0 N128045-0...
4,5,U492344,11/15/2019 5:02:25 AM,N109183 N48453 N85005 N45706 N98923 N46069 N35...,N64785-0 N82503-0 N32993-0 N122944-0 N29160-0 ...


In [25]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def generate_mind_predictions(test_behavior_data, 
                                         sparse_user_item, 
                                         sparse_item_similarity,
                                         user_id_to_idx, 
                                         article_id_to_idx,
                                         idx_to_article_id):
    """
    Generate prediction scores with compatible datatypes for evaluation.
    """
    predictions = []
    
    for idx, row in tqdm(test_behavior_data.iterrows(), total=len(test_behavior_data)):
        # Get impression_id (use row index if not in columns)
        impression_id = row.get('impression_id', idx)
        user_id = row['user_id']
        
        if pd.isna(row['impressions']):
            continue
            
        # Parse impression articles
        for imp in row['impressions'].split():
            parts = imp.split('-')
            article_id = parts[0]
            
            # Calculate score (using default 0.5 for simplicity in this example)
            score = 0.5
            
            # If user and article exist in our model, calculate real score
            if user_id in user_id_to_idx and article_id in article_id_to_idx:
                user_idx = user_id_to_idx[user_id]
                article_idx = article_id_to_idx[article_id]
                
                # Get user's interaction history
                user_interactions = sparse_user_item[user_idx].toarray().flatten()
                interacted_item_indices = np.where(user_interactions > 0)[0]
                
                if len(interacted_item_indices) > 0:
                    # Calculate similarity-based score
                    total_sim = 0
                    
                    for item_idx in interacted_item_indices:
                        # Get similarity safely
                        sim_matrix = sparse_item_similarity[item_idx].toarray().flatten()
                        if article_idx < len(sim_matrix):
                            total_sim += sim_matrix[article_idx]
                    
                    # Normalize to 0-1 range (simple approach)
                    # This is a simplified normalization - adjust if needed
                    score = min(1.0, max(0.0, total_sim / max(1, len(interacted_item_indices))))
            
            # Append to predictions list
            predictions.append({
                'impression_id': impression_id,
                'news_id': article_id,
                'score': float(score)  # Ensure float
            })
    
    # Create DataFrame with explicit dtypes
    predictions_df = pd.DataFrame(predictions)
    
    # Ensure correct datatypes for Polars conversion
    predictions_df['impression_id'] = predictions_df['impression_id'].astype(str)
    predictions_df['news_id'] = predictions_df['news_id'].astype(str)
    predictions_df['score'] = predictions_df['score'].astype(float)
    
    
    
    return predictions_df

# Example usage:
# This code should be placed after running the collaborative_filtering_pipeline

(recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id) = collaborative_filtering_pipeline(behavior_data, top_n=5)

sample_size = 30000  # Adjust this based on your needs
test_sample = test_behavior_data.sample(n=sample_size, random_state=42)

# Generate predictions for test data
predictions_df = generate_mind_predictions(
    test_sample,
    sparse_user_item, 
    sparse_item_similarity,
    user_id_to_idx, 
    article_id_to_idx,
    idx_to_article_id
)

# You can then save predictions to a CSV file for submission or evaluation
predictions_df.to_csv('mind_predictions.csv', index=False)

print(f"Generated {len(predictions_df)} predictions")
predictions_df.head()

Processing interactions...
Creating sparse matrices...
Computing item similarity matrix (this might take a while)...
User-item matrix shape: (698365, 79546)
Density: 0.000237
Generating recommendations for 500 users...


Generating recommendations: 100%|██████████| 500/500 [00:04<00:00, 106.56it/s]
100%|██████████| 30000/30000 [03:27<00:00, 144.57it/s]


Generated 1128901 predictions


Unnamed: 0,impression_id,news_id,score
0,181577,N83707,0.5
1,181577,N26122,0.5
2,181577,N32993,0.5
3,181577,N80770,0.5
4,181577,N86609,0.5


In [26]:
predictions_df['impression_id'] = predictions_df['impression_id'].astype('int64')  # i64
predictions_df['news_id'] = predictions_df['news_id'].astype(str)                 # str
predictions_df['score'] = predictions_df['score'].astype('float64')  

In [27]:
predictions_df.head()

Unnamed: 0,impression_id,news_id,score
0,181577,N83707,0.5
1,181577,N26122,0.5
2,181577,N32993,0.5
3,181577,N80770,0.5
4,181577,N86609,0.5


In [28]:
import lib.eval as eval
import importlib

importlib.reload(eval)

eval.evaluate_mind_predictions(predictions_df, test_behavior_data)

{'auc': np.float64(0.5122522263430606),
 'mrr': np.float64(0.2454378407400277),
 'ndcg@5': np.float64(0.24792856661957594),
 'ndcg@10': np.float64(0.31242379386645075)}

In [29]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, diags
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def process_interactions_efficiently(behavior_data, batch_size=10000):
    """
    Process behavior data to extract user-item interactions efficiently using batching.
    Added weighting based on position in history (recency).
    """
    total_batches = (len(behavior_data) + batch_size - 1) // batch_size
    interaction_dfs = []
    
    for batch_num in range(total_batches):
        # Get a batch of the data
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(behavior_data))
        batch = behavior_data.iloc[start_idx:end_idx]
        
        # Filter rows with valid history
        valid_rows = batch[batch['history'].notna() & (batch['history'].str.strip() != '')]
        
        if len(valid_rows) > 0:
            # Apply vectorized operations
            temp_df = valid_rows[['user_id', 'history']].copy()
            
            # Process each user's history to add position weights
            weighted_interactions = []
            
            for _, row in temp_df.iterrows():
                user_id = row['user_id']
                articles = row['history'].split()
                
                # Calculate position-based weights (more recent = higher weight)
                # This captures the temporal aspect of news consumption
                if len(articles) > 0:
                    # Exponential weighting for stronger recency bias
                    weights = np.exp(np.linspace(0, 1, len(articles))) / np.exp(1)
                    
                    for i, article_id in enumerate(articles):
                        weighted_interactions.append({
                            'user_id': user_id,
                            'article_id': article_id,
                            'weight': weights[i]
                        })
            
            if weighted_interactions:
                interaction_df = pd.DataFrame(weighted_interactions)
                interaction_dfs.append(interaction_df)
    
    # Combine all batches into final dataframe
    if interaction_dfs:
        interactions_df = pd.concat(interaction_dfs, ignore_index=True)
        
        # Aggregate weights for duplicate user-item pairs (user may have clicked same article multiple times)
        interactions_df = interactions_df.groupby(['user_id', 'article_id'])['weight'].sum().reset_index()
    else:
        interactions_df = pd.DataFrame(columns=['user_id', 'article_id', 'weight'])
        
    return interactions_df

def create_sparse_matrices(interactions_df):
    """
    Create sparse user-item matrix and compute improved item similarity matrix.
    Uses TF-IDF weighting and normalization to improve recommendation quality.
    """
    # Create mappings from IDs to indices
    user_ids = interactions_df['user_id'].unique()
    article_ids = interactions_df['article_id'].unique()
    
    user_id_to_idx = {id: i for i, id in enumerate(user_ids)}
    article_id_to_idx = {id: i for i, id in enumerate(article_ids)}
    
    # Map the original IDs to matrix indices
    user_indices = interactions_df['user_id'].map(user_id_to_idx).values
    article_indices = interactions_df['article_id'].map(article_id_to_idx).values
    
    # Use weights instead of just 1s for interaction values
    interaction_values = interactions_df['weight'].values
    
    # Create the sparse user-item matrix
    sparse_user_item = csr_matrix(
        (interaction_values, (user_indices, article_indices)),
        shape=(len(user_ids), len(article_ids))
    )
    
    # Apply TF-IDF weighting to reduce popularity bias
    # First, compute IDF (inverse document frequency)
    item_frequencies = np.array((sparse_user_item > 0).sum(axis=0)).flatten()
    idf = np.log(sparse_user_item.shape[0] / (item_frequencies + 1))
    
    # Create a diagonal matrix with IDF values
    idf_diag = diags(idf, 0)
    
    # Apply IDF weighting to user-item matrix
    weighted_user_item = sparse_user_item.dot(idf_diag)
    
    # Normalize user vectors to unit length for better cosine similarity calculation
    user_norm = np.sqrt(weighted_user_item.power(2).sum(axis=1))
    user_norm[user_norm == 0] = 1  # Avoid division by zero
    
    row_diag = diags(1 / user_norm.A.flatten(), 0)
    normalized_user_item = row_diag.dot(weighted_user_item)
    
    # Compute item popularity
    item_popularity = item_frequencies / item_frequencies.sum()
    
    # Create item-item similarity matrix (cosine similarity between items)
    print("Computing item similarity matrix (this might take a while)...")
    sparse_item_similarity = cosine_similarity(normalized_user_item.T, dense_output=False)
    
    # Create reverse mappings to convert back to original IDs
    idx_to_user_id = {i: id for id, i in user_id_to_idx.items()}
    idx_to_article_id = {i: id for id, i in article_id_to_idx.items()}
    
    return (normalized_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, 
            idx_to_user_id, idx_to_article_id,
            item_popularity)

def get_item_recommendations_sparse(user_id, 
                                  sparse_user_item, 
                                  sparse_item_similarity,
                                  user_id_to_idx, 
                                  article_id_to_idx,
                                  idx_to_article_id,
                                  item_popularity,
                                  top_n=5,
                                  diversity_factor=0.2,
                                  popularity_penalty=0.1):
    """
    Generate top-n item recommendations for a given user using sparse matrices.
    Incorporates diversity and reduces popularity bias.
    """
    # Convert user_id to matrix index
    if user_id not in user_id_to_idx:
        return []
    
    user_idx = user_id_to_idx[user_id]
    
    # Get items the user has interacted with
    user_vector = sparse_user_item[user_idx].toarray().flatten()
    interacted_item_indices = np.where(user_vector > 0)[0]
    
    if len(interacted_item_indices) == 0:
        return []  # User has no interactions, cannot recommend
    
    # Initialize scores array for all items
    scores = np.zeros(sparse_item_similarity.shape[0])
    
    # Apply a damping function to reduce popularity bias
    popularity_penalty_factor = np.power(item_popularity, popularity_penalty)
    
    # Keep track of item similarity to already selected items for diversity
    selected_items_similarity = np.zeros(sparse_item_similarity.shape[0])
    
    # For each item the user has interacted with, weighted by interaction strength
    for i, item_idx in enumerate(interacted_item_indices):
        # Get similarity scores for this item with all other items
        similarity_scores = sparse_item_similarity[item_idx].toarray().flatten()
        
        # Weight by the user's interaction strength with this item
        interaction_weight = user_vector[item_idx]
        weighted_scores = similarity_scores * interaction_weight
        
        # Add to accumulated scores
        scores += weighted_scores
    
    # Set scores of items the user has already interacted with to -1 (to exclude them)
    scores[interacted_item_indices] = -1
    
    # Apply penalty for very popular items
    scores = scores * (1 - popularity_penalty_factor)
    
    # Get final scores
    valid_items = np.where(scores > 0)[0]
    
    if len(valid_items) == 0:
        return []
    
    # Get recommendations balancing relevance and diversity
    recommended_articles = []
    remaining_items = valid_items.copy()
    
    for _ in range(min(top_n, len(valid_items))):
        if len(remaining_items) == 0:
            break
            
        # Compute diversity score
        diversity_scores = np.zeros(len(remaining_items))
        
        if len(recommended_articles) > 0:
            for i, item_idx in enumerate(remaining_items):
                # Average similarity to already selected items
                sim_to_selected = np.mean([
                    sparse_item_similarity[item_idx, rec_idx].toarray()[0, 0]
                    for rec_idx in recommended_articles
                ])
                # Lower similarity means more diversity
                diversity_scores[i] = 1 - sim_to_selected
        
        # Get scores for remaining items
        item_scores = np.array([scores[idx] for idx in remaining_items])
        
        # Combine relevance and diversity
        if len(recommended_articles) > 0:
            combined_scores = (1 - diversity_factor) * item_scores + diversity_factor * diversity_scores
        else:
            combined_scores = item_scores
        
        # Select the best item
        best_idx = np.argmax(combined_scores)
        selected_item = remaining_items[best_idx]
        
        # Add to recommendations
        recommended_articles.append(selected_item)
        
        # Remove from remaining items
        remaining_items = np.delete(remaining_items, best_idx)
    
    # Convert indices back to article IDs
    return [idx_to_article_id[idx] for idx in recommended_articles]

def generate_mind_predictions(test_behavior_data, 
                             sparse_user_item, 
                             sparse_item_similarity,
                             user_id_to_idx, 
                             article_id_to_idx,
                             idx_to_article_id,
                             item_popularity):
    """
    Generate prediction scores for each impression-article pair in the test dataset.
    Optimized for MIND dataset evaluation.
    """
    predictions = []
    
    for idx, row in tqdm(test_behavior_data.iterrows(), 
                        total=len(test_behavior_data), 
                        desc="Generating predictions"):
        
        impression_id = row['impression_id'] if 'impression_id' in row else idx
        user_id = row['user_id']
        
        # Skip if no impressions
        if pd.isna(row['impressions']):
            continue
            
        # Parse impressions to get article IDs
        impression_articles = []
        for imp in row['impressions'].split():
            parts = imp.split('-')
            if len(parts) > 0:
                article_id = parts[0]
                impression_articles.append(article_id)
        
        # Skip if user not in the training data - use a default approach
        if user_id not in user_id_to_idx:
            # Use item popularity as a fallback strategy for cold-start users
            for article_id in impression_articles:
                if article_id in article_id_to_idx:
                    article_idx = article_id_to_idx[article_id]
                    pop_score = item_popularity[article_idx]
                    # Scale popularity to a reasonable range
                    score = min(1.0, pop_score * 100)
                else:
                    score = 0.5  # Default score
                
                predictions.append({
                    'impression_id': impression_id,
                    'news_id': article_id,
                    'score': score
                })
            continue
        
        # Get user index
        user_idx = user_id_to_idx[user_id]
        
        # Get items the user has interacted with
        user_vector = sparse_user_item[user_idx].toarray().flatten()
        interacted_item_indices = np.where(user_vector > 0)[0]
        
        # Skip if user has no interactions - use popularity
        if len(interacted_item_indices) == 0:
            for article_id in impression_articles:
                if article_id in article_id_to_idx:
                    article_idx = article_id_to_idx[article_id]
                    score = item_popularity[article_idx] * 100
                else:
                    score = 0.5
                
                predictions.append({
                    'impression_id': impression_id,
                    'news_id': article_id,
                    'score': score
                })
            continue
        
        # Initialize scores array for all items
        all_scores = np.zeros(sparse_item_similarity.shape[0])
        
        # For each item the user has interacted with
        for item_idx in interacted_item_indices:
            # Weight by the user's interaction strength with this item
            interaction_weight = user_vector[item_idx]
            
            # Get similarity scores for this item with all other items
            similarity_scores = sparse_item_similarity[item_idx].toarray().flatten()
            
            # Apply weighted contribution
            all_scores += similarity_scores * interaction_weight
        
        # Process each impression article
        for article_id in impression_articles:
            if article_id in article_id_to_idx:
                article_idx = article_id_to_idx[article_id]
                raw_score = all_scores[article_idx]
                
                # Apply logarithmic scaling to prevent extremely small/large values
                if raw_score > 0:
                    score = min(1.0, 0.5 + 0.5 * np.log1p(raw_score) / np.log(10))
                else:
                    # If no similarity, use item popularity as a fallback
                    score = 0.5 * item_popularity[article_idx] * 100
            else:
                # Article not in training data
                score = 0.5  # Default score
            
            predictions.append({
                'impression_id': impression_id,
                'news_id': article_id,
                'score': score
            })
    
    # Create DataFrame from predictions
    predictions_df = pd.DataFrame(predictions)
    
    # Ensure correct datatypes
    predictions_df['impression_id'] = predictions_df['impression_id'].astype('int64')
    predictions_df['news_id'] = predictions_df['news_id'].apply(lambda x: f'"{x}"')
    predictions_df['score'] = predictions_df['score'].astype('float64')
    
    return predictions_df

def collaborative_filtering_pipeline(behavior_data, test_behavior_data, top_n=5):
    """
    Complete pipeline for improved collaborative filtering.
    
    Parameters:
    -----------
    behavior_data : DataFrame
        Training data containing user_id and history columns
    test_behavior_data : DataFrame
        Test data for generating predictions
    top_n : int
        Number of recommendations per user
        
    Returns:
    --------
    predictions_df : DataFrame
        DataFrame with impression_id, news_id, and score columns
    """
    print("Processing interactions...")
    interactions_df = process_interactions_efficiently(behavior_data)
    
    print("Creating sparse matrices...")
    (sparse_user_item, sparse_item_similarity, 
     user_id_to_idx, article_id_to_idx, 
     idx_to_user_id, idx_to_article_id,
     item_popularity) = create_sparse_matrices(interactions_df)
    
    print(f"User-item matrix shape: {sparse_user_item.shape}")
    print(f"Density: {sparse_user_item.nnz / (sparse_user_item.shape[0] * sparse_user_item.shape[1]):.6f}")
    
    print("Generating predictions...")
    predictions_df = generate_mind_predictions(
        test_behavior_data,
        sparse_user_item, 
        sparse_item_similarity,
        user_id_to_idx, 
        article_id_to_idx,
        idx_to_article_id,
        item_popularity
    )
    
    return predictions_df

sample_size = 30000  # Adjust this based on your needs
test_sample = test_behavior_data.sample(n=sample_size, random_state=42)
predictions_df = collaborative_filtering_pipeline(behavior_data, test_sample)
predictions_df.to_csv('improved_cf_predictions.csv', index=False)

Processing interactions...
Creating sparse matrices...
Computing item similarity matrix (this might take a while)...
User-item matrix shape: (698365, 79546)
Density: 0.000237
Generating predictions...


Generating predictions: 100%|██████████| 30000/30000 [02:15<00:00, 222.03it/s]


In [33]:
predictions_df = predictions_df.reset_index(drop=True)

In [34]:
import lib.eval as eval
import importlib

importlib.reload(eval)

eval.evaluate_mind_predictions(predictions_df, test_behavior_data)

{'error': 'No matching impression_id and news_id between predictions and ground truth'}

In [32]:
predictions_df.tail()


Unnamed: 0,impression_id,news_id,score
1128896,349492,"""N72977""",0.5
1128897,349492,"""N54368""",0.5
1128898,349492,"""N46894""",0.5
1128899,349492,"""N29160""",0.5
1128900,349492,"""N122944""",0.5
