In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Loading the datasets

In [9]:
news_data = pd.read_csv("data/MINDlarge_train/news.tsv", header=None, sep='\t')
news_data.columns = ['article_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']

news_data.head()

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [10]:
behavior_data = pd.read_csv("data/MINDlarge_train/behaviors.tsv", header=None, sep='\t')
behavior_data.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']

behavior_data.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


## Now we have to create the user-item matrix

In [11]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # Optional for progress tracking

def process_interactions_efficiently(behavior_data, batch_size=10000):
    """
    Process behavior data to extract user-item interactions efficiently using batching.
    """
    total_batches = (len(behavior_data) + batch_size - 1) // batch_size
    interaction_dfs = []
    
    for batch_num in range(total_batches):
        # Get a batch of the data
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(behavior_data))
        batch = behavior_data.iloc[start_idx:end_idx]
        
        # Filter rows with valid history
        valid_rows = batch[batch['history'].notna() & (batch['history'].str.strip() != '')]
        
        if len(valid_rows) > 0:
            # Apply vectorized operations
            temp_df = valid_rows[['user_id', 'history']].copy()
            temp_df['article_id'] = temp_df['history'].str.split()
            # Explode to create one row per user-article interaction
            temp_df = temp_df.explode('article_id')
            # Keep only the columns we need
            temp_df = temp_df[['user_id', 'article_id']]
            
            interaction_dfs.append(temp_df)
    
    # Combine all batches into final dataframe
    if interaction_dfs:
        interactions_df = pd.concat(interaction_dfs, ignore_index=True)
    else:
        interactions_df = pd.DataFrame(columns=['user_id', 'article_id'])
        
    return interactions_df

def create_sparse_matrices(interactions_df):
    """
    Create sparse user-item matrix and compute sparse item similarity matrix.
    Returns both matrices and mapping dictionaries.
    """
    # Create mappings from IDs to indices
    user_ids = interactions_df['user_id'].unique()
    article_ids = interactions_df['article_id'].unique()
    
    user_id_to_idx = {id: i for i, id in enumerate(user_ids)}
    article_id_to_idx = {id: i for i, id in enumerate(article_ids)}
    
    # Map the original IDs to matrix indices
    user_indices = interactions_df['user_id'].map(user_id_to_idx).values
    article_indices = interactions_df['article_id'].map(article_id_to_idx).values
    
    # Create interaction values (all 1s for implicit feedback)
    interaction_values = np.ones(len(interactions_df), dtype=np.float32)
    
    # Create the sparse user-item matrix
    sparse_user_item = csr_matrix(
        (interaction_values, (user_indices, article_indices)),
        shape=(len(user_ids), len(article_ids))
    )
    
    # Create item-item similarity matrix (cosine similarity between items)
    print("Computing item similarity matrix (this might take a while)...")
    sparse_item_similarity = cosine_similarity(sparse_user_item.T, dense_output=False)
    
    # Create reverse mappings to convert back to original IDs
    idx_to_user_id = {i: id for id, i in user_id_to_idx.items()}
    idx_to_article_id = {i: id for id, i in article_id_to_idx.items()}
    
    return (sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, 
            idx_to_user_id, idx_to_article_id)

def get_item_recommendations_sparse(user_id, 
                                  sparse_user_item, 
                                  sparse_item_similarity,
                                  user_id_to_idx, 
                                  article_id_to_idx,
                                  idx_to_article_id,
                                  top_n=5):
    """
    Generate top-n item recommendations for a given user using sparse matrices.
    """
    # Convert user_id to matrix index
    if user_id not in user_id_to_idx:
        return []
    
    user_idx = user_id_to_idx[user_id]
    
    # Get items the user has interacted with
    user_interactions = sparse_user_item[user_idx].toarray().flatten()
    interacted_item_indices = np.where(user_interactions > 0)[0]
    
    if len(interacted_item_indices) == 0:
        return []  # User has no interactions, cannot recommend
    
    # Initialize scores array for all items
    scores = np.zeros(sparse_item_similarity.shape[0])
    
    # For each item the user has interacted with
    for item_idx in interacted_item_indices:
        # Get similarity scores for this item with all other items
        similarity_scores = sparse_item_similarity[item_idx].toarray().flatten()
        # Add to accumulated scores
        scores += similarity_scores
    
    # Set scores of items the user has already interacted with to -1 (to exclude them)
    scores[interacted_item_indices] = -1
    
    # Get indices of top_n items with highest scores
    recommended_indices = np.argsort(scores)[::-1][:top_n]
    
    # Convert indices back to article IDs
    recommended_articles = [idx_to_article_id[idx] for idx in recommended_indices if scores[idx] > 0]
    
    return recommended_articles

def batch_generate_recommendations(user_ids, 
                                  sparse_user_item, 
                                  sparse_item_similarity,
                                  user_id_to_idx, 
                                  article_id_to_idx,
                                  idx_to_article_id,
                                  top_n=5):
    """
    Generate recommendations for multiple users efficiently.
    """
    recommendations = {}
    
    for user_id in tqdm(user_ids, desc="Generating recommendations"):
        recs = get_item_recommendations_sparse(
            user_id, 
            sparse_user_item, 
            sparse_item_similarity,
            user_id_to_idx, 
            article_id_to_idx,
            idx_to_article_id,
            top_n=top_n
        )
        recommendations[user_id] = recs
        
    return recommendations


def collaborative_filtering_pipeline(behavior_data, top_n=5, sample_users=None):
    """
    Complete pipeline for collaborative filtering.
    
    Parameters:
    -----------
    behavior_data : DataFrame
        DataFrame containing user_id and history columns
    top_n : int
        Number of recommendations per user
    sample_users : list or None
        List of specific user_ids to generate recommendations for,
        or None to use all users
        
    Returns:
    --------
    tuple: (recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id)
    """
    print("Processing interactions...")
    interactions_df = process_interactions_efficiently(behavior_data)
    
    print("Creating sparse matrices...")
    (sparse_user_item, sparse_item_similarity, 
     user_id_to_idx, article_id_to_idx, 
     idx_to_user_id, idx_to_article_id) = create_sparse_matrices(interactions_df)
    
    print(f"User-item matrix shape: {sparse_user_item.shape}")
    print(f"Density: {sparse_user_item.nnz / (sparse_user_item.shape[0] * sparse_user_item.shape[1]):.6f}")
    
    if sample_users is None:
        # Use all users (or first 100 for demonstration)
        sample_users = list(user_id_to_idx.keys())[:100]  # Limit for demonstration
    
    print(f"Generating recommendations for {len(sample_users)} users...")
    recommendations = batch_generate_recommendations(
        sample_users,
        sparse_user_item, 
        sparse_item_similarity,
        user_id_to_idx, 
        article_id_to_idx,
        idx_to_article_id,
        top_n=top_n
    )
    
    # Return all necessary variables for evaluation
    return (recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id)


## Evaluating

In [12]:
test_behavior_data = pd.read_csv("data/MINDlarge_dev/behaviors.tsv", header=None, sep='\t')
test_behavior_data.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']

test_behavior_data.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U134050,11/15/2019 8:55:22 AM,N12246 N128820 N119226 N4065 N67770 N33446 N10...,N91737-0 N30206-0 N54368-0 N117802-0 N18190-0 ...
1,2,U254959,11/15/2019 11:42:35 AM,N34011 N9375 N67397 N7936 N118985 N109453 N103...,N119999-0 N24958-0 N104054-0 N33901-0 N9250-0 ...
2,3,U499841,11/15/2019 9:08:21 AM,N63858 N26834 N6379 N85484 N15229 N65119 N1047...,N18190-0 N89764-0 N91737-0 N54368-0 N49978-1 N...
3,4,U107107,11/15/2019 5:50:31 AM,N12959 N8085 N18389 N3758 N9740 N90543 N129790...,N122944-1 N18190-0 N55801-0 N59297-0 N128045-0...
4,5,U492344,11/15/2019 5:02:25 AM,N109183 N48453 N85005 N45706 N98923 N46069 N35...,N64785-0 N82503-0 N32993-0 N122944-0 N29160-0 ...


In [15]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def generate_mind_predictions(test_behavior_data, 
                              sparse_user_item, 
                              sparse_item_similarity,
                              user_id_to_idx, 
                              article_id_to_idx,
                              idx_to_article_id,
                              default_score=0.5):
    """
    Generate prediction scores with improved debugging and robustness.
    """
    predictions = []
    matches_found = 0
    default_used = 0
    
    for idx, row in tqdm(test_behavior_data.iterrows(), total=len(test_behavior_data)):
        impression_id = row['impression_id'] if 'impression_id' in row else idx
        user_id = row['user_id']
        
        if pd.isna(row['impressions']):
            continue
            
        # Parse impression articles
        impression_articles = []
        for imp in row['impressions'].split():
            parts = imp.split('-')
            article_id = parts[0]
            impression_articles.append(article_id)
        
        # Check if this user exists in training data
        if user_id not in user_id_to_idx:
            # Default scores for unknown user
            for article_id in impression_articles:
                predictions.append({
                    'impression_id': impression_id,
                    'news_id': article_id,
                    'score': default_score
                })
            default_used += len(impression_articles)
            continue
        
        # Get user interactions
        user_idx = user_id_to_idx[user_id]
        user_interactions = sparse_user_item[user_idx].toarray().flatten()
        interacted_item_indices = np.where(user_interactions > 0)[0]
        
        if len(interacted_item_indices) == 0:
            # Default scores for users with no interactions
            for article_id in impression_articles:
                predictions.append({
                    'impression_id': impression_id,
                    'news_id': article_id,
                    'score': default_score
                })
            default_used += len(impression_articles)
            continue
        
        # Calculate raw similarity scores
        raw_scores = {}
        valid_scores_found = False
        
        for article_id in impression_articles:
            if article_id in article_id_to_idx:
                article_idx = article_id_to_idx[article_id]
                score = 0
                
                # Sum similarities from user's history
                for item_idx in interacted_item_indices:
                    sim = sparse_item_similarity[item_idx, article_idx]
                    score += sim if sim else 0
                
                raw_scores[article_id] = score
                if score > 0:
                    valid_scores_found = True
            else:
                raw_scores[article_id] = 0
        
        # Normalize scores only if we found valid similarities
        if valid_scores_found:
            # Find min and max for normalization
            scores_list = list(raw_scores.values())
            min_score = min(scores_list)
            max_score = max(scores_list)
            
            if max_score > min_score:
                # Normalize
                for article_id in impression_articles:
                    norm_score = (raw_scores[article_id] - min_score) / (max_score - min_score)
                    predictions.append({
                        'impression_id': impression_id,
                        'news_id': article_id,
                        'score': norm_score
                    })
                matches_found += len(impression_articles)
            else:
                # All scores identical - use defaults
                for article_id in impression_articles:
                    predictions.append({
                        'impression_id': impression_id,
                        'news_id': article_id,
                        'score': default_score
                    })
                default_used += len(impression_articles)
        else:
            # No valid scores - use defaults
            for article_id in impression_articles:
                predictions.append({
                    'impression_id': impression_id,
                    'news_id': article_id,
                    'score': default_score
                })
            default_used += len(impression_articles)
    
    predictions_df = pd.DataFrame(predictions)
    
    # Print stats
    print(f"Total predictions: {len(predictions_df)}")
    print(f"Matches found: {matches_found} ({matches_found/len(predictions_df)*100:.1f}%)")
    print(f"Default scores used: {default_used} ({default_used/len(predictions_df)*100:.1f}%)")
    
    return predictions_df

# Example usage:
# This code should be placed after running the collaborative_filtering_pipeline

(recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id) = collaborative_filtering_pipeline(behavior_data, top_n=5)

sample_size = 10000  # Adjust this based on your needs
test_sample = test_behavior_data.sample(n=sample_size, random_state=42)

# Generate predictions for test data
predictions_df = generate_mind_predictions(
    test_sample,
    sparse_user_item, 
    sparse_item_similarity,
    user_id_to_idx, 
    article_id_to_idx,
    idx_to_article_id
)

# You can then save predictions to a CSV file for submission or evaluation
predictions_df.to_csv('mind_predictions.csv', index=False)

print(f"Generated {len(predictions_df)} predictions")
predictions_df.head()

Processing interactions...
Creating sparse matrices...
Computing item similarity matrix (this might take a while)...
User-item matrix shape: (698365, 79546)
Density: 0.000237
Generating recommendations for 100 users...


Generating recommendations: 100%|██████████| 100/100 [00:00<00:00, 111.92it/s]
100%|██████████| 10000/10000 [00:18<00:00, 544.20it/s]


Total predictions: 375460
Matches found: 298856 (79.6%)
Default scores used: 76604 (20.4%)
Generated 375460 predictions


Unnamed: 0,impression_id,news_id,score
0,181577,N83707,0.5
1,181577,N26122,0.5
2,181577,N32993,0.5
3,181577,N80770,0.5
4,181577,N86609,0.5
