In [11]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import time

In [12]:
# MIND news catalogue: tab-separated file without a header row, so the
# column labels are attached after reading.
news_columns = [
    'article_id',
    'category',
    'subcategory',
    'title',
    'abstract',
    'url',
    'title_entities',
    'abstract_entities',
]
news_data = pd.read_csv("data/MINDlarge_train/news.tsv", sep='\t', header=None)
news_data.columns = news_columns

news_data.head()

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [13]:
# Training impression log: tab-separated file without a header row, so the
# column labels are attached after reading.
behavior_columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']
behavior_data = pd.read_csv("data/MINDlarge_train/behaviors.tsv", sep='\t', header=None)
behavior_data.columns = behavior_columns

behavior_data.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


In [None]:
# Dev-split impression log, read and labelled the same way as the training log.
test_behavior_columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']
test_behavior_data = pd.read_csv("data/MINDlarge_dev/behaviors.tsv", sep='\t', header=None)
test_behavior_data.columns = test_behavior_columns


: 

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import time

def process_interactions(behavior_data, use_timestamps=True, decay_rate=0.1, position_decay=0.8):
    """
    Process behavior data to extract user-item interactions with time weighting.

    Every article in a row's `history` becomes one interaction whose weight is
    the product of:
      * a position weight that rises linearly from `1 - position_decay` for the
        first history entry to 1.0 for the last one, and
      * a time weight `exp(-decay_rate * days)` where `days` is the distance of
        the row's timestamp from the latest timestamp in `behavior_data`
        (1.0 when timestamps are unavailable or unparseable).
    Weights of repeated (user, article) pairs are summed.

    The input DataFrame is never modified; timestamps are parsed into a
    separate Series.

    Args:
        behavior_data: DataFrame with user_id, history, and optionally timestamp columns
        use_timestamps: Whether to use timestamp information for weighting
        decay_rate: Exponential decay applied per day of age of the row
        position_decay: Fraction of the weight removed from the first history entry

    Returns:
        DataFrame with user_id, article_id, and weight columns (the columns are
        present even when no interaction could be extracted)
    """
    print("Processing interactions...")
    start_time = time.time()

    has_timestamp = use_timestamps and 'timestamp' in behavior_data.columns

    if has_timestamp:
        timestamps = behavior_data['timestamp']
        # Parse into a local Series instead of writing back into the caller's frame
        if not pd.api.types.is_datetime64_any_dtype(timestamps):
            timestamps = pd.to_datetime(timestamps, errors='coerce')
        # Reference point for the decay: the latest timestamp in the dataset
        reference_time = timestamps.max()
    else:
        # All-NaT placeholder so the loop below needs no special casing
        timestamps = pd.Series(pd.NaT, index=behavior_data.index)
        reference_time = pd.NaT

    users, articles, weights = [], [], []

    # Iterate the needed columns directly; much cheaper than DataFrame.iterrows()
    rows = zip(behavior_data['user_id'], behavior_data['history'], timestamps)
    for user_id, raw_history, timestamp in tqdm(rows, total=len(behavior_data), desc="Extracting interactions"):
        if pd.isna(raw_history) or raw_history.strip() == "":
            continue

        history = raw_history.split()

        # Rows with a missing/unparseable timestamp keep full weight
        time_weight = 1.0
        if pd.notna(timestamp):
            days_diff = (reference_time - timestamp).total_seconds() / (24 * 3600)
            time_weight = np.exp(-decay_rate * days_diff)

        last_position = len(history) - 1
        span = max(1, last_position)  # avoids 0/0 for single-item histories
        for position, article_id in enumerate(history):
            # Later positions in the history get the higher weight
            position_weight = 1.0 - position_decay * (last_position - position) / span
            users.append(user_id)
            articles.append(article_id)
            weights.append(position_weight * time_weight)

    # Explicit columns/dtype so an empty result is still usable downstream
    interactions_df = pd.DataFrame({
        'user_id': users,
        'article_id': articles,
        'weight': np.asarray(weights, dtype=float),
    })

    # Aggregate weights for duplicate user-item pairs
    if not interactions_df.empty:
        interactions_df = interactions_df.groupby(['user_id', 'article_id'])['weight'].sum().reset_index()

    end_time = time.time()
    print(f"Processed {len(interactions_df)} interactions in {end_time - start_time:.2f} seconds")

    return interactions_df

def create_matrices(interactions_df, max_items=None):
    """
    Build the sparse user-item matrix and its ID/index lookups.

    When `max_items` is given and the data contains more distinct articles
    than that, only the `max_items` articles with the largest total weight are
    kept. Row/column indices are assigned in order of first appearance of each
    user/article in the (possibly filtered) interactions.

    Args:
        interactions_df: DataFrame with user_id, article_id, and weight columns
        max_items: Maximum number of items to keep (most popular)

    Returns:
        Tuple of
        (user_item_matrix, user_id_to_idx, article_id_to_idx,
         idx_to_user_id, idx_to_article_id, item_popularity)
        where item_popularity is each article's column sum divided by the
        total of all column sums.
    """
    print("Creating matrices...")
    started = time.time()

    # Optionally restrict the catalogue to the heaviest-weighted articles
    if max_items and interactions_df['article_id'].nunique() > max_items:
        weight_per_article = interactions_df.groupby('article_id')['weight'].sum().reset_index()
        ranked = weight_per_article.sort_values('weight', ascending=False)
        kept_articles = ranked.head(max_items)['article_id'].values
        keep_mask = interactions_df['article_id'].isin(kept_articles)
        interactions_df = interactions_df[keep_mask]
        print(f"Filtered to top {max_items} items")

    # Distinct IDs in order of first appearance define the matrix axes
    user_ids = interactions_df['user_id'].unique()
    article_ids = interactions_df['article_id'].unique()

    user_id_to_idx = dict(zip(user_ids, range(len(user_ids))))
    article_id_to_idx = dict(zip(article_ids, range(len(article_ids))))

    # Translate every interaction into (row, column, value) coordinates
    row_positions = interactions_df['user_id'].map(user_id_to_idx).values
    col_positions = interactions_df['article_id'].map(article_id_to_idx).values
    cell_values = interactions_df['weight'].values

    matrix_shape = (len(user_ids), len(article_ids))
    user_item_matrix = csr_matrix(
        (cell_values, (row_positions, col_positions)),
        shape=matrix_shape,
    )

    # Share of the total interaction weight per article (used for cold-start scoring)
    column_totals = np.array(user_item_matrix.sum(axis=0)).flatten()
    item_popularity = column_totals / np.sum(column_totals)

    # Index -> ID lookups, the inverse of the dictionaries above
    idx_to_user_id = {position: user for user, position in user_id_to_idx.items()}
    idx_to_article_id = {position: article for article, position in article_id_to_idx.items()}

    finished = time.time()
    print(f"Matrix creation completed in {finished - started:.2f} seconds")

    return (
        user_item_matrix,
        user_id_to_idx,
        article_id_to_idx,
        idx_to_user_id,
        idx_to_article_id,
        item_popularity,
    )

def create_item_similarity_matrix(user_item_matrix, batch_size=1000):
    """
    Create the item-item cosine similarity matrix, computed in row batches.

    Item vectors (columns of `user_item_matrix`) are L2-normalised once, so the
    cosine similarity between two items is simply the dot product of their
    normalised vectors. Self-similarities (the diagonal) are left out of the
    result so an item never counts as its own neighbour.

    Args:
        user_item_matrix: Sparse user-item matrix (users x items)
        batch_size: Number of item rows whose similarities are computed at once

    Returns:
        Sparse CSR matrix of shape (n_items, n_items) with no diagonal entries

    Raises:
        ValueError: If batch_size is smaller than 1
    """
    print("Computing item similarity matrix...")
    start_time = time.time()

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Transpose to get item-user matrix
    item_user_matrix = user_item_matrix.T.tocsr()

    # Unit-length item vectors: dot products below are cosine similarities
    normalized_item_matrix = normalize(item_user_matrix, norm='l2', axis=1)

    n_items = normalized_item_matrix.shape[0]
    n_batches = (n_items + batch_size - 1) // batch_size

    # Right-hand operand shared by every batch (users x items)
    normalized_user_item = normalized_item_matrix.T.tocsr()

    # Coordinates are collected per batch and assembled once at the end;
    # repeatedly adding sparse matrices would re-copy the growing result each time.
    row_parts, col_parts, value_parts = [], [], []

    for batch_number, start_idx in enumerate(range(0, n_items, batch_size), start=1):
        end_idx = min(start_idx + batch_size, n_items)

        # COO exposes .row/.col, which CSR (the product's native format) does not
        batch_similarity = normalized_item_matrix[start_idx:end_idx].dot(normalized_user_item).tocoo()

        # Shift batch-local rows to global item indices and drop the diagonal
        global_rows = batch_similarity.row + start_idx
        off_diagonal = global_rows != batch_similarity.col

        row_parts.append(global_rows[off_diagonal])
        col_parts.append(batch_similarity.col[off_diagonal])
        value_parts.append(batch_similarity.data[off_diagonal])

        print(f"Processed batch {batch_number}/{n_batches}")

    if value_parts:
        item_similarities = csr_matrix(
            (np.concatenate(value_parts),
             (np.concatenate(row_parts), np.concatenate(col_parts))),
            shape=(n_items, n_items)
        )
    else:
        # No items at all: np.concatenate would reject the empty lists
        item_similarities = csr_matrix((n_items, n_items))

    end_time = time.time()
    print(f"Similarity matrix computation completed in {end_time - start_time:.2f} seconds")

    return item_similarities

def generate_predictions(test_behavior_data,
                        user_item_matrix,
                        item_similarities,
                        user_id_to_idx,
                        article_id_to_idx,
                        idx_to_article_id,
                        item_popularity):
    """
    Score every candidate article of every test impression.

    Raw score per candidate (before the final sigmoid):
      * article unknown to the training data: 0.5
      * unknown (cold-start) user: 0.3 + 0.7 * sqrt(popularity)
      * known user who already interacted with the article: 0.3
      * known user, at least one positive similarity between the candidate and
        the row's history: 0.8 * mean(top-3 similarities) + 0.2 * popularity
      * known user, no positive similarity (or no usable history):
        0.3 + 0.5 * popularity
    The raw score is then passed through 1 / (1 + exp(-5 * (score - 0.5))).

    Rows whose `impressions` value is missing are skipped.

    Args:
        test_behavior_data: DataFrame containing test data
        user_item_matrix: Sparse user-item matrix
        item_similarities: Sparse item-item similarity matrix
        user_id_to_idx: Mapping from user IDs to matrix indices
        article_id_to_idx: Mapping from article IDs to matrix indices
        idx_to_article_id: Mapping from matrix indices to article IDs
            (accepted for interface compatibility; not used by the scoring)
        item_popularity: Array of item popularity scores

    Returns:
        DataFrame with impression_id, news_id, and score columns; the columns
        are present even when there is nothing to score
    """
    print("Generating predictions...")
    start_time = time.time()

    predictions = []

    # Process each row in test data
    for idx, row in tqdm(test_behavior_data.iterrows(),
                        total=len(test_behavior_data),
                        desc="Processing test data"):
        impression_id = row.get('impression_id', idx)
        user_id = row['user_id']

        if pd.isna(row['impressions']):
            continue

        # Candidates look like "<article_id>-<label>"; keep only the ID
        impression_articles = [imp.split('-')[0] for imp in row['impressions'].split()]

        # Get user's history
        user_history = []
        if pd.notna(row['history']) and row['history'].strip() != "":
            user_history = row['history'].split()

        # Per-row lookups, identical for every candidate of this impression
        user_idx = user_id_to_idx.get(user_id)
        history_indices = [article_id_to_idx[hist_id]
                           for hist_id in user_history
                           if hist_id in article_id_to_idx]

        for article_id in impression_articles:
            # Default score for articles never seen in training
            score = 0.5

            article_idx = article_id_to_idx.get(article_id)
            if article_idx is not None:
                popularity = item_popularity[article_idx]

                if user_idx is None:
                    # Cold-start user: popularity only, square root dampens
                    # the advantage of very popular items
                    score = 0.3 + 0.7 * (popularity ** 0.5)
                elif user_item_matrix[user_idx, article_idx] > 0:
                    # Lower score for already read articles
                    score = 0.3
                else:
                    # Positive similarities between the candidate and the history
                    sim_scores = []
                    for hist_idx in history_indices:
                        sim = item_similarities[hist_idx, article_idx]
                        if sim > 0:
                            sim_scores.append(sim)

                    if sim_scores:
                        # Mean of the three strongest matches, blended with popularity
                        top_sims = sorted(sim_scores, reverse=True)[:3]
                        score = 0.8 * np.mean(top_sims) + 0.2 * popularity
                    else:
                        # No usable history or no overlap: popularity fallback
                        score = 0.3 + 0.5 * popularity

            # Sigmoid centred on 0.5 spreads the raw scores apart
            score = 1.0 / (1.0 + np.exp(-5 * (score - 0.5)))

            predictions.append({
                'impression_id': impression_id,
                'news_id': article_id,
                'score': float(score)
            })

    # Explicit columns keep the schema intact when `predictions` is empty
    predictions_df = pd.DataFrame(predictions, columns=['impression_id', 'news_id', 'score'])

    # Convert datatypes for evaluation; non-numeric impression IDs stay strings
    try:
        predictions_df['impression_id'] = predictions_df['impression_id'].astype('int64')
    except (ValueError, TypeError):
        predictions_df['impression_id'] = predictions_df['impression_id'].astype(str)

    predictions_df['news_id'] = predictions_df['news_id'].astype(str)
    predictions_df['score'] = predictions_df['score'].astype(float)

    end_time = time.time()
    print(f"Prediction generation completed in {end_time - start_time:.2f} seconds")

    return predictions_df

def mind_optimized_recommender(behavior_data, test_behavior_data, max_items=50000, batch_size=1000):
    """
    Run the whole recommender pipeline end to end.

    Stages, in order: extract weighted interactions from the training log,
    build the user-item matrix and lookups, compute item-item similarities,
    then score the test impressions.

    Args:
        behavior_data: DataFrame containing training data
        test_behavior_data: DataFrame containing test data
        max_items: Maximum number of items to keep
        batch_size: Batch size for similarity computation

    Returns:
        DataFrame with impression_id, news_id, and score columns
    """
    print("Starting MIND-optimized recommender...")
    pipeline_started = time.time()

    # Stage 1: weighted (user, article) interactions from the training log
    weighted_interactions = process_interactions(behavior_data)

    # Stage 2: sparse matrix plus ID <-> index lookups and popularity
    matrices = create_matrices(weighted_interactions, max_items=max_items)
    (user_item, user_lookup, article_lookup,
     _user_by_idx, article_by_idx, popularity) = matrices

    # Stage 3: item-item similarities
    similarities = create_item_similarity_matrix(user_item, batch_size=batch_size)

    # Stage 4: score every candidate in the test impressions
    scored = generate_predictions(
        test_behavior_data,
        user_item,
        similarities,
        user_lookup,
        article_lookup,
        article_by_idx,
        popularity,
    )

    elapsed = time.time() - pipeline_started
    print(f"Pipeline completed in {elapsed:.2f} seconds")
    print(f"Generated {len(scored)} predictions")

    return scored

# Example usage: fit on the training log and score the dev-set impressions
predictions_df = mind_optimized_recommender(
    behavior_data,
    test_behavior_data,
    max_items=50000,
    batch_size=1000,
)
# predictions_df.to_csv('mind_optimized_predictions.csv', index=False)

Starting MIND-optimized recommender...
Processing interactions...


Extracting interactions:  99%|█████████▉| 2205937/2232748 [01:21<00:01, 23123.02it/s]

In [None]:
# Reload the project's `lib.eval` module before use so that edits made to it
# are picked up without restarting the kernel.
# NOTE(review): the alias `eval` shadows Python's built-in eval() for the rest
# of the session; it is kept because other cells may rely on this name.
import importlib

import lib.eval as eval

importlib.reload(eval)

eval.evaluate_mind_predictions(predictions_df, test_behavior_data)