In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Loading the datasets

In [2]:
news_data = pd.read_csv("data/MINDlarge_train/news.tsv", header=None, sep='\t')
news_data.columns = ['article_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']

news_data.head()

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [3]:
behavior_data = pd.read_csv("data/MINDlarge_train/behaviors.tsv", header=None, sep='\t')
behavior_data.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']

behavior_data.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


## Now we have to create the user-item matrix

In [4]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # Optional for progress tracking

def process_interactions_efficiently(behavior_data, batch_size=10000):
    """
    Process behavior data to extract user-item interactions efficiently using batching.
    """
    total_batches = (len(behavior_data) + batch_size - 1) // batch_size
    interaction_dfs = []
    
    for batch_num in range(total_batches):
        # Get a batch of the data
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(behavior_data))
        batch = behavior_data.iloc[start_idx:end_idx]
        
        # Filter rows with valid history
        valid_rows = batch[batch['history'].notna() & (batch['history'].str.strip() != '')]
        
        if len(valid_rows) > 0:
            # Apply vectorized operations
            temp_df = valid_rows[['user_id', 'history']].copy()
            temp_df['article_id'] = temp_df['history'].str.split()
            # Explode to create one row per user-article interaction
            temp_df = temp_df.explode('article_id')
            # Keep only the columns we need
            temp_df = temp_df[['user_id', 'article_id']]
            
            interaction_dfs.append(temp_df)
    
    # Combine all batches into final dataframe
    if interaction_dfs:
        interactions_df = pd.concat(interaction_dfs, ignore_index=True)
    else:
        interactions_df = pd.DataFrame(columns=['user_id', 'article_id'])
        
    return interactions_df

def create_sparse_matrices(interactions_df):
    """
    Create sparse user-item matrix and compute sparse item similarity matrix.
    Returns both matrices and mapping dictionaries.
    """
    # Create mappings from IDs to indices
    user_ids = interactions_df['user_id'].unique()
    article_ids = interactions_df['article_id'].unique()
    
    user_id_to_idx = {id: i for i, id in enumerate(user_ids)}
    article_id_to_idx = {id: i for i, id in enumerate(article_ids)}
    
    # Map the original IDs to matrix indices
    user_indices = interactions_df['user_id'].map(user_id_to_idx).values
    article_indices = interactions_df['article_id'].map(article_id_to_idx).values
    
    # Create interaction values (all 1s for implicit feedback)
    interaction_values = np.ones(len(interactions_df), dtype=np.float32)
    
    # Create the sparse user-item matrix
    sparse_user_item = csr_matrix(
        (interaction_values, (user_indices, article_indices)),
        shape=(len(user_ids), len(article_ids))
    )
    
    # Create item-item similarity matrix (cosine similarity between items)
    print("Computing item similarity matrix (this might take a while)...")
    sparse_item_similarity = cosine_similarity(sparse_user_item.T, dense_output=False)
    
    # Create reverse mappings to convert back to original IDs
    idx_to_user_id = {i: id for id, i in user_id_to_idx.items()}
    idx_to_article_id = {i: id for id, i in article_id_to_idx.items()}
    
    return (sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, 
            idx_to_user_id, idx_to_article_id)

def get_item_recommendations_sparse(user_id, 
                                  sparse_user_item, 
                                  sparse_item_similarity,
                                  user_id_to_idx, 
                                  article_id_to_idx,
                                  idx_to_article_id,
                                  top_n=5):
    """
    Generate top-n item recommendations for a given user using sparse matrices.
    """
    # Convert user_id to matrix index
    if user_id not in user_id_to_idx:
        return []
    
    user_idx = user_id_to_idx[user_id]
    
    # Get items the user has interacted with
    user_interactions = sparse_user_item[user_idx].toarray().flatten()
    interacted_item_indices = np.where(user_interactions > 0)[0]
    
    if len(interacted_item_indices) == 0:
        return []  # User has no interactions, cannot recommend
    
    # Initialize scores array for all items
    scores = np.zeros(sparse_item_similarity.shape[0])
    
    # For each item the user has interacted with
    for item_idx in interacted_item_indices:
        # Get similarity scores for this item with all other items
        similarity_scores = sparse_item_similarity[item_idx].toarray().flatten()
        # Add to accumulated scores
        scores += similarity_scores
    
    # Set scores of items the user has already interacted with to -1 (to exclude them)
    scores[interacted_item_indices] = -1
    
    # Get indices of top_n items with highest scores
    recommended_indices = np.argsort(scores)[::-1][:top_n]
    
    # Convert indices back to article IDs
    recommended_articles = [idx_to_article_id[idx] for idx in recommended_indices if scores[idx] > 0]
    
    return recommended_articles

def batch_generate_recommendations(user_ids, 
                                  sparse_user_item, 
                                  sparse_item_similarity,
                                  user_id_to_idx, 
                                  article_id_to_idx,
                                  idx_to_article_id,
                                  top_n=5):
    """
    Generate recommendations for multiple users efficiently.
    """
    recommendations = {}
    
    for user_id in tqdm(user_ids, desc="Generating recommendations"):
        recs = get_item_recommendations_sparse(
            user_id, 
            sparse_user_item, 
            sparse_item_similarity,
            user_id_to_idx, 
            article_id_to_idx,
            idx_to_article_id,
            top_n=top_n
        )
        recommendations[user_id] = recs
        
    return recommendations


def collaborative_filtering_pipeline(behavior_data, top_n=5, sample_users=None):
    """
    Complete pipeline for collaborative filtering.
    
    Parameters:
    -----------
    behavior_data : DataFrame
        DataFrame containing user_id and history columns
    top_n : int
        Number of recommendations per user
    sample_users : list or None
        List of specific user_ids to generate recommendations for,
        or None to use all users
        
    Returns:
    --------
    tuple: (recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id)
    """
    print("Processing interactions...")
    interactions_df = process_interactions_efficiently(behavior_data)
    
    print("Creating sparse matrices...")
    (sparse_user_item, sparse_item_similarity, 
     user_id_to_idx, article_id_to_idx, 
     idx_to_user_id, idx_to_article_id) = create_sparse_matrices(interactions_df)
    
    print(f"User-item matrix shape: {sparse_user_item.shape}")
    print(f"Density: {sparse_user_item.nnz / (sparse_user_item.shape[0] * sparse_user_item.shape[1]):.6f}")
    
    if sample_users is None:
        # Use all users (or first 100 for demonstration)
        sample_users = list(user_id_to_idx.keys())[:500]  # Limit for demonstration
    
    print(f"Generating recommendations for {len(sample_users)} users...")
    recommendations = batch_generate_recommendations(
        sample_users,
        sparse_user_item, 
        sparse_item_similarity,
        user_id_to_idx, 
        article_id_to_idx,
        idx_to_article_id,
        top_n=top_n
    )
    
    # Return all necessary variables for evaluation
    return (recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id)


## Evaluating

In [5]:
test_behavior_data = pd.read_csv("data/MINDlarge_dev/behaviors.tsv", header=None, sep='\t')
test_behavior_data.columns = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']

test_behavior_data.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U134050,11/15/2019 8:55:22 AM,N12246 N128820 N119226 N4065 N67770 N33446 N10...,N91737-0 N30206-0 N54368-0 N117802-0 N18190-0 ...
1,2,U254959,11/15/2019 11:42:35 AM,N34011 N9375 N67397 N7936 N118985 N109453 N103...,N119999-0 N24958-0 N104054-0 N33901-0 N9250-0 ...
2,3,U499841,11/15/2019 9:08:21 AM,N63858 N26834 N6379 N85484 N15229 N65119 N1047...,N18190-0 N89764-0 N91737-0 N54368-0 N49978-1 N...
3,4,U107107,11/15/2019 5:50:31 AM,N12959 N8085 N18389 N3758 N9740 N90543 N129790...,N122944-1 N18190-0 N55801-0 N59297-0 N128045-0...
4,5,U492344,11/15/2019 5:02:25 AM,N109183 N48453 N85005 N45706 N98923 N46069 N35...,N64785-0 N82503-0 N32993-0 N122944-0 N29160-0 ...


In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def generate_mind_predictions(test_behavior_data, 
                                         sparse_user_item, 
                                         sparse_item_similarity,
                                         user_id_to_idx, 
                                         article_id_to_idx,
                                         idx_to_article_id):
    """
    Generate prediction scores with compatible datatypes for evaluation.
    """
    predictions = []
    
    for idx, row in tqdm(test_behavior_data.iterrows(), total=len(test_behavior_data)):
        # Get impression_id (use row index if not in columns)
        impression_id = row.get('impression_id', idx)
        user_id = row['user_id']
        
        if pd.isna(row['impressions']):
            continue
            
        # Parse impression articles
        for imp in row['impressions'].split():
            parts = imp.split('-')
            article_id = parts[0]
            
            # Calculate score (using default 0.5 for simplicity in this example)
            score = 0.5
            
            # If user and article exist in our model, calculate real score
            if user_id in user_id_to_idx and article_id in article_id_to_idx:
                user_idx = user_id_to_idx[user_id]
                article_idx = article_id_to_idx[article_id]
                
                # Get user's interaction history
                user_interactions = sparse_user_item[user_idx].toarray().flatten()
                interacted_item_indices = np.where(user_interactions > 0)[0]
                
                if len(interacted_item_indices) > 0:
                    # Calculate similarity-based score
                    total_sim = 0
                    
                    for item_idx in interacted_item_indices:
                        # Get similarity safely
                        sim_matrix = sparse_item_similarity[item_idx].toarray().flatten()
                        if article_idx < len(sim_matrix):
                            total_sim += sim_matrix[article_idx]
                    
                    # Normalize to 0-1 range (simple approach)
                    # This is a simplified normalization - adjust if needed
                    score = min(1.0, max(0.0, total_sim / max(1, len(interacted_item_indices))))
            
            # Append to predictions list
            predictions.append({
                'impression_id': impression_id,
                'news_id': article_id,
                'score': float(score)  # Ensure float
            })
    
    # Create DataFrame with explicit dtypes
    predictions_df = pd.DataFrame(predictions)
    
    # Ensure correct datatypes for Polars conversion
    predictions_df['impression_id'] = predictions_df['impression_id'].astype(str)
    predictions_df['news_id'] = predictions_df['news_id'].astype(str)
    predictions_df['score'] = predictions_df['score'].astype(float)
    
    
    
    return predictions_df

# Example usage:
# This code should be placed after running the collaborative_filtering_pipeline

(recommendations, sparse_user_item, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, idx_to_user_id, idx_to_article_id) = collaborative_filtering_pipeline(behavior_data, top_n=5)

sample_size = 30000  # Adjust this based on your needs
test_sample = test_behavior_data.sample(n=sample_size, random_state=42)

# Generate predictions for test data
predictions_df = generate_mind_predictions(
    test_sample,
    sparse_user_item, 
    sparse_item_similarity,
    user_id_to_idx, 
    article_id_to_idx,
    idx_to_article_id
)

# You can then save predictions to a CSV file for submission or evaluation
predictions_df.to_csv('mind_predictions.csv', index=False)

print(f"Generated {len(predictions_df)} predictions")
predictions_df.head()

Processing interactions...
Creating sparse matrices...
Computing item similarity matrix (this might take a while)...
User-item matrix shape: (698365, 79546)
Density: 0.000237
Generating recommendations for 500 users...


Generating recommendations: 100%|██████████| 500/500 [00:05<00:00, 95.21it/s] 
100%|██████████| 30000/30000 [04:01<00:00, 124.09it/s]


Generated 1128901 predictions


Unnamed: 0,impression_id,news_id,score
0,181577,N83707,0.5
1,181577,N26122,0.5
2,181577,N32993,0.5
3,181577,N80770,0.5
4,181577,N86609,0.5


In [7]:
predictions_df['impression_id'] = predictions_df['impression_id'].astype('int64')  # i64
predictions_df['news_id'] = predictions_df['news_id'].astype(str)                 # str
predictions_df['score'] = predictions_df['score'].astype('float64')  

In [8]:
predictions_df.head()

Unnamed: 0,impression_id,news_id,score
0,181577,N83707,0.5
1,181577,N26122,0.5
2,181577,N32993,0.5
3,181577,N80770,0.5
4,181577,N86609,0.5


In [9]:
import lib.eval as eval
import importlib

importlib.reload(eval)

eval.evaluate_mind_predictions(predictions_df, test_behavior_data)

{'auc': np.float64(0.5122522263430606),
 'mrr': np.float64(0.24543784074002772),
 'ndcg@5': np.float64(0.2479285666195759),
 'ndcg@10': np.float64(0.31242379386645075)}

## Collaborative filtering with ALS

## TG-IDF implementation

In [13]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, diags
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def process_interactions_efficiently(behavior_data, batch_size=10000):
    """
    Process behavior data to extract user-item interactions efficiently using batching.
    """
    total_batches = (len(behavior_data) + batch_size - 1) // batch_size
    interaction_dfs = []
    
    for batch_num in range(total_batches):
        # Get a batch of the data
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(behavior_data))
        batch = behavior_data.iloc[start_idx:end_idx]
        
        # Filter rows with valid history
        valid_rows = batch[batch['history'].notna() & (batch['history'].str.strip() != '')]
        
        if len(valid_rows) > 0:
            # Apply vectorized operations
            temp_df = valid_rows[['user_id', 'history']].copy()
            temp_df['article_id'] = temp_df['history'].str.split()
            # Explode to create one row per user-article interaction
            temp_df = temp_df.explode('article_id')
            # Keep only the columns we need
            temp_df = temp_df[['user_id', 'article_id']]
            
            interaction_dfs.append(temp_df)
    
    # Combine all batches into final dataframe
    if interaction_dfs:
        interactions_df = pd.concat(interaction_dfs, ignore_index=True)
    else:
        interactions_df = pd.DataFrame(columns=['user_id', 'article_id'])
        
    return interactions_df

def create_tfidf_matrices(interactions_df):
    """
    Create sparse user-item matrix with TF-IDF weighting and compute item similarity matrix.
    """
    # Create mappings from IDs to indices
    user_ids = interactions_df['user_id'].unique()
    article_ids = interactions_df['article_id'].unique()
    
    user_id_to_idx = {id: i for i, id in enumerate(user_ids)}
    article_id_to_idx = {id: i for i, id in enumerate(article_ids)}
    
    # Map the original IDs to matrix indices
    user_indices = interactions_df['user_id'].map(user_id_to_idx).values
    article_indices = interactions_df['article_id'].map(article_id_to_idx).values
    
    # Create interaction values (all 1s for implicit feedback)
    interaction_values = np.ones(len(interactions_df), dtype=np.float32)
    
    # Create the basic sparse user-item matrix
    sparse_user_item = csr_matrix(
        (interaction_values, (user_indices, article_indices)),
        shape=(len(user_ids), len(article_ids))
    )
    
    # Compute TF (Term Frequency) - normalized by user interaction count
    # This represents how important an article is to a user
    user_interaction_counts = np.array(sparse_user_item.sum(axis=1)).flatten()
    user_interaction_counts[user_interaction_counts == 0] = 1.0  # Avoid division by zero
    tf_diag = diags(1.0 / user_interaction_counts, 0)
    tf_matrix = tf_diag.dot(sparse_user_item)
    
    # Compute IDF (Inverse Document Frequency)
    # This downweights popular articles that many users have interacted with
    item_interaction_counts = np.array(sparse_user_item.sum(axis=0)).flatten()
    n_users = sparse_user_item.shape[0]
    idf = np.log(n_users / (item_interaction_counts + 1.0))
    idf_diag = diags(idf, 0)
    
    # Create TF-IDF weighted user-item matrix
    tfidf_matrix = tf_matrix.dot(idf_diag)
    
    # Compute item popularity (for potential use in recommendations)
    item_popularity = item_interaction_counts / np.sum(item_interaction_counts)
    
    # Create item-item similarity matrix using the TF-IDF weighted matrix
    print("Computing item similarity matrix with TF-IDF weighting (this might take a while)...")
    sparse_item_similarity = cosine_similarity(tfidf_matrix.T, dense_output=False)
    
    # Create reverse mappings to convert back to original IDs
    idx_to_user_id = {i: id for id, i in user_id_to_idx.items()}
    idx_to_article_id = {i: id for id, i in article_id_to_idx.items()}
    
    return (tfidf_matrix, sparse_item_similarity, 
            user_id_to_idx, article_id_to_idx, 
            idx_to_user_id, idx_to_article_id,
            item_popularity)

def get_item_recommendations_with_tfidf(user_id, 
                                     tfidf_matrix, 
                                     sparse_item_similarity,
                                     user_id_to_idx, 
                                     article_id_to_idx,
                                     idx_to_article_id,
                                     item_popularity,
                                     top_n=5):
    """
    Generate top-n item recommendations for a given user using TF-IDF weighted similarity.
    """
    # Convert user_id to matrix index
    if user_id not in user_id_to_idx:
        return []
    
    user_idx = user_id_to_idx[user_id]
    
    # Get items the user has interacted with (from TF-IDF matrix)
    user_interactions = tfidf_matrix[user_idx].toarray().flatten()
    interacted_item_indices = np.where(user_interactions > 0)[0]
    
    if len(interacted_item_indices) == 0:
        return []  # User has no interactions, cannot recommend
    
    # Initialize scores array for all items
    scores = np.zeros(sparse_item_similarity.shape[0])
    
    # For each item the user has interacted with
    for item_idx in interacted_item_indices:
        # Get similarity scores for this item with all other items
        similarity_scores = sparse_item_similarity[item_idx].toarray().flatten()
        
        # Weight by the user's TF-IDF score for this item
        # This gives more importance to items that are more significant to the user
        item_weight = user_interactions[item_idx]
        scores += similarity_scores * item_weight
    
    # Set scores of items the user has already interacted with to -1 (to exclude them)
    scores[interacted_item_indices] = -1
    
    # Get indices of top_n items with highest scores
    recommended_indices = np.argsort(scores)[::-1][:top_n]
    
    # Convert indices back to article IDs
    recommended_articles = [idx_to_article_id[idx] for idx in recommended_indices if scores[idx] > 0]
    
    return recommended_articles

def generate_mind_predictions_tfidf(test_behavior_data,
                                  tfidf_matrix,
                                  sparse_item_similarity,
                                  user_id_to_idx,
                                  article_id_to_idx,
                                  idx_to_article_id,
                                  item_popularity):
    """
    Generate prediction scores using TF-IDF weighted collaborative filtering.
    """
    predictions = []
    
    for idx, row in tqdm(test_behavior_data.iterrows(), total=len(test_behavior_data)):
        # Get impression_id (use row index if not in columns)
        impression_id = row.get('impression_id', idx)
        user_id = row['user_id']
        
        if pd.isna(row['impressions']):
            continue
            
        # Parse impression articles
        for imp in row['impressions'].split():
            parts = imp.split('-')
            article_id = parts[0]
            
            # Default score
            score = 0.5
            
            # If user and article exist in our model, calculate real score
            if user_id in user_id_to_idx and article_id in article_id_to_idx:
                user_idx = user_id_to_idx[user_id]
                article_idx = article_id_to_idx[article_id]
                
                # Get user's interaction history from TF-IDF matrix
                user_interactions = tfidf_matrix[user_idx].toarray().flatten()
                interacted_item_indices = np.where(user_interactions > 0)[0]
                
                if len(interacted_item_indices) > 0:
                    # Calculate TF-IDF weighted similarity score
                    total_sim = 0
                    weight_sum = 0
                    
                    for item_idx in interacted_item_indices:
                        # Get similarity between this item and target article
                        sim_score = sparse_item_similarity[item_idx, article_idx]
                        if sim_score:
                            # Weight by the user's TF-IDF score for this item
                            item_weight = user_interactions[item_idx]
                            total_sim += sim_score * item_weight
                            weight_sum += item_weight
                    
                    # Compute final score
                    if weight_sum > 0:
                        raw_score = total_sim / weight_sum
                        # Scale to [0, 1] range
                        score = min(1.0, max(0.0, raw_score))
                    elif article_idx < len(item_popularity):
                        # Fallback to item popularity
                        score = 0.4 + 0.2 * item_popularity[article_idx] / max(item_popularity)
            
            # Append to predictions list
            predictions.append({
                'impression_id': impression_id,
                'news_id': article_id,
                'score': float(score)
            })
    
    # Create DataFrame with explicit dtypes
    predictions_df = pd.DataFrame(predictions)
    
    # Ensure correct datatypes for evaluation
    try:
        predictions_df['impression_id'] = predictions_df['impression_id'].astype('int64')
    except:
        predictions_df['impression_id'] = predictions_df['impression_id'].astype(str)
    
    predictions_df['news_id'] = predictions_df['news_id'].astype(str)
    predictions_df['score'] = predictions_df['score'].astype(float)
    
    return predictions_df

def tfidf_collaborative_filtering_pipeline(behavior_data, test_behavior_data, top_n=5):
    """
    Complete pipeline for TF-IDF enhanced collaborative filtering.
    
    Parameters:
    -----------
    behavior_data : DataFrame
        Training data containing user_id and history columns
    test_behavior_data : DataFrame
        Test data for generating predictions
    top_n : int
        Number of recommendations per user
        
    Returns:
    --------
    predictions_df : DataFrame
        DataFrame with impression_id, news_id, and score columns
    """
    print("Processing interactions...")
    interactions_df = process_interactions_efficiently(behavior_data)
    
    print("Creating TF-IDF matrices...")
    (tfidf_matrix, sparse_item_similarity, 
     user_id_to_idx, article_id_to_idx, 
     idx_to_user_id, idx_to_article_id,
     item_popularity) = create_tfidf_matrices(interactions_df)
    
    print(f"User-item matrix shape: {tfidf_matrix.shape}")
    print(f"Density: {tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1]):.6f}")
    
    print("Generating predictions...")
    predictions_df = generate_mind_predictions_tfidf(
        test_behavior_data,
        tfidf_matrix, 
        sparse_item_similarity,
        user_id_to_idx, 
        article_id_to_idx,
        idx_to_article_id,
        item_popularity
    )
    
    return predictions_df

# Example usage:
sample_size = 30000  # Adjust based on your needs
test_sample = test_behavior_data.sample(n=sample_size, random_state=42)
predictions_df = tfidf_collaborative_filtering_pipeline(behavior_data, test_sample)

Processing interactions...
Creating TF-IDF matrices...
Computing item similarity matrix with TF-IDF weighting (this might take a while)...
User-item matrix shape: (698365, 79546)
Density: 0.000237
Generating predictions...


100%|██████████| 30000/30000 [01:10<00:00, 423.50it/s]


In [14]:

importlib.reload(eval)

eval.evaluate_mind_predictions(predictions_df, test_behavior_data)

{'auc': np.float64(0.5123415509984646),
 'mrr': np.float64(0.24546161311513062),
 'ndcg@5': np.float64(0.24797189151305932),
 'ndcg@10': np.float64(0.31238071121777455)}

## New METHOD

In [None]:
importlib.reload(eval)

eval.evaluate_mind_predictions(predictions_df, test_behavior_data)

## New Approach

## New Approach

In [26]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, diags
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import time

def process_interactions_efficiently(behavior_data, max_articles_per_user=20, batch_size=10000):
    """
    Process behavior data to extract user-item interactions efficiently.
    Limits the number of articles per user for efficiency and to focus on recent behavior.
    """
    print("Processing interactions...")
    total_batches = (len(behavior_data) + batch_size - 1) // batch_size
    interaction_dfs = []
    
    for batch_num in range(total_batches):
        # Get a batch of the data
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(behavior_data))
        batch = behavior_data.iloc[start_idx:end_idx]
        
        # Filter rows with valid history
        valid_rows = batch[batch['history'].notna() & (batch['history'].str.strip() != '')]
        
        if len(valid_rows) > 0:
            user_interactions = []
            
            for _, row in valid_rows.iterrows():
                user_id = row['user_id']
                history = row['history'].split()
                
                # Take only the most recent articles if there are too many
                if len(history) > max_articles_per_user:
                    history = history[-max_articles_per_user:]
                
                # Assign higher weights to more recent articles
                for i, article_id in enumerate(history):
                    # Position-based weight (recent items get higher weight)
                    position = i / len(history)
                    weight = 0.5 + 0.5 * position  # Ranges from 0.5 to 1.0
                    
                    user_interactions.append({
                        'user_id': user_id,
                        'article_id': article_id,
                        'weight': weight
                    })
            
            if user_interactions:
                interaction_df = pd.DataFrame(user_interactions)
                interaction_dfs.append(interaction_df)
    
    # Combine all batches into final dataframe
    if interaction_dfs:
        interactions_df = pd.concat(interaction_dfs, ignore_index=True)
        
        # Aggregate weights for duplicate user-item pairs
        interactions_df = interactions_df.groupby(['user_id', 'article_id'])['weight'].sum().reset_index()
    else:
        interactions_df = pd.DataFrame(columns=['user_id', 'article_id', 'weight'])
        
    print(f"Processed {len(interactions_df)} interactions from {interactions_df['user_id'].nunique()} users")
    return interactions_df

def create_optimized_item_similarity(interactions_df, max_items=30000):
    """
    Create optimized item similarity matrix focusing only on the most popular items.
    """
    print("Creating optimized item similarity matrix...")
    start_time = time.time()
    
    # Limit to top items if needed
    if max_items and len(interactions_df['article_id'].unique()) > max_items:
        item_counts = interactions_df.groupby('article_id')['weight'].sum().reset_index()
        top_items = item_counts.sort_values('weight', ascending=False).head(max_items)['article_id'].values
        interactions_df = interactions_df[interactions_df['article_id'].isin(top_items)]
        print(f"Limited to top {max_items} items")
    
    # Create mappings from IDs to indices
    user_ids = interactions_df['user_id'].unique()
    article_ids = interactions_df['article_id'].unique()
    
    user_id_to_idx = {id: i for i, id in enumerate(user_ids)}
    article_id_to_idx = {id: i for i, id in enumerate(article_ids)}
    
    # Map the original IDs to matrix indices
    user_indices = interactions_df['user_id'].map(user_id_to_idx).values
    article_indices = interactions_df['article_id'].map(article_id_to_idx).values
    
    # Use weights from interactions
    interaction_values = interactions_df['weight'].values
    
    # Create the sparse user-item matrix
    sparse_user_item = csr_matrix(
        (interaction_values, (user_indices, article_indices)),
        shape=(len(user_ids), len(article_ids))
    )
    
    # Calculate item popularity
    item_popularity = np.array(sparse_user_item.sum(axis=0)).flatten()
    item_popularity = item_popularity / (np.sum(item_popularity) or 1)
    
    # Apply IDF weighting to reduce popularity bias
    # This gives more weight to rare items and less to common ones
    n_users = sparse_user_item.shape[0]
    item_user_counts = np.array((sparse_user_item > 0).sum(axis=0)).flatten()
    idf = np.log(n_users / (item_user_counts + 1))
    idf_diag = diags(idf, 0)
    
    # Apply IDF weighting to user-item matrix
    weighted_user_item = sparse_user_item.dot(idf_diag)
    
    # Normalize the user vectors for better similarity calculation
    user_norms = np.sqrt(weighted_user_item.power(2).sum(axis=1))
    user_norms[user_norms == 0] = 1  # Avoid division by zero
    row_normalizer = diags(1.0 / user_norms.A.flatten(), 0)
    normalized_matrix = row_normalizer.dot(weighted_user_item)
    
    # Compute item similarity
    print("Computing item similarity (this might take a minute or two)...")
    item_similarity = cosine_similarity(normalized_matrix.T, dense_output=False)
    
    # Create reverse mappings
    idx_to_user_id = {i: id for id, i in user_id_to_idx.items()}
    idx_to_article_id = {i: id for id, i in article_id_to_idx.items()}
    
    end_time = time.time()
    print(f"Matrix creation completed in {end_time - start_time:.2f} seconds")
    
    return (sparse_user_item, item_similarity, 
            user_id_to_idx, article_id_to_idx, 
            idx_to_user_id, idx_to_article_id,
            item_popularity)

def generate_optimized_predictions(test_behavior_data,
                                 sparse_user_item,
                                 item_similarity,
                                 user_id_to_idx,
                                 article_id_to_idx,
                                 idx_to_article_id,
                                 item_popularity,
                                 max_history_items=10):
    """
    Generate prediction scores using optimized item-based collaborative filtering.
    Uses only the most recent history items for efficiency.
    """
    print("Generating predictions...")
    start_time = time.time()
    predictions = []
    
    for idx, row in tqdm(test_behavior_data.iterrows(), total=len(test_behavior_data)):
        impression_id = row.get('impression_id', idx)
        user_id = row['user_id']
        
        if pd.isna(row['impressions']):
            continue
            
        # Parse impression articles
        impression_articles = []
        for imp in row['impressions'].split():
            parts = imp.split('-')
            article_id = parts[0]
            impression_articles.append(article_id)
        
        # Get user's history
        history_indices = []
        
        if user_id in user_id_to_idx and pd.notna(row['history']) and row['history'].strip() != "":
            user_idx = user_id_to_idx[user_id]
            history = row['history'].split()
            
            # Limit to most recent items for efficiency
            if len(history) > max_history_items:
                history = history[-max_history_items:]
            
            # Convert history articles to indices
            for article_id in history:
                if article_id in article_id_to_idx:
                    history_indices.append(article_id_to_idx[article_id])
        
        # Score the impression articles
        for article_id in impression_articles:
            # Default score
            score = 0.5
            
            if article_id in article_id_to_idx:
                article_idx = article_id_to_idx[article_id]
                
                if history_indices:
                    # Calculate weighted similarity to history articles
                    total_sim = 0
                    
                    # Apply recency weighting to history items
                    history_weights = np.linspace(0.6, 1.0, len(history_indices))
                    
                    for i, hist_idx in enumerate(history_indices):
                        # Get similarity to this history item
                        sim = item_similarity[hist_idx, article_idx]
                        if sim:
                            # Apply recency weight
                            weight = history_weights[i]
                            total_sim += sim * weight
                    
                    # Normalize by number of history items
                    if len(history_indices) > 0:
                        score = total_sim / sum(history_weights)
                else:
                    # No history - use item popularity
                    score = 0.4 + 0.2 * (item_popularity[article_idx] / max(0.001, np.max(item_popularity)))
            
            # Ensure score is in [0, 1] range
            score = min(1.0, max(0.0, score))
            
            # Add to predictions
            predictions.append({
                'impression_id': impression_id,
                'news_id': article_id,
                'score': float(score)
            })
    
    # Create DataFrame
    predictions_df = pd.DataFrame(predictions)
    
    # Ensure correct datatypes for evaluation
    try:
        predictions_df['impression_id'] = predictions_df['impression_id'].astype('int64')
    except:
        predictions_df['impression_id'] = predictions_df['impression_id'].astype(str)
        
    predictions_df['news_id'] = predictions_df['news_id'].astype(str)
    predictions_df['score'] = predictions_df['score'].astype(float)
    
    end_time = time.time()
    print(f"Prediction generation completed in {end_time - start_time:.2f} seconds")
    
    return predictions_df

def optimized_item_cf_pipeline(behavior_data, test_behavior_data, 
                            max_articles_per_user=20, 
                            max_items=30000,
                            max_history_items=10):
    """
    Complete pipeline for optimized item-based collaborative filtering.
    
    Parameters:
    -----------
    behavior_data : DataFrame
        Training data containing user_id and history columns
    test_behavior_data : DataFrame
        Test data for generating predictions
    max_articles_per_user : int
        Maximum number of articles to consider per user
    max_items : int
        Maximum number of items to include in the model
    max_history_items : int
        Maximum number of history items to use when generating predictions
        
    Returns:
    --------
    predictions_df : DataFrame
        DataFrame with impression_id, news_id, and score columns
    """
    print(f"Starting optimized item-based collaborative filtering pipeline...")
    overall_start = time.time()
    
    # Process interactions
    interactions_df = process_interactions_efficiently(
        behavior_data, 
        max_articles_per_user=max_articles_per_user
    )
    
    # Create matrices
    (sparse_user_item, item_similarity, 
     user_id_to_idx, article_id_to_idx, 
     idx_to_user_id, idx_to_article_id,
     item_popularity) = create_optimized_item_similarity(
        interactions_df, 
        max_items=max_items
    )
    
    # Generate predictions
    predictions_df = generate_optimized_predictions(
        test_behavior_data,
        sparse_user_item, 
        item_similarity,
        user_id_to_idx, 
        article_id_to_idx,
        idx_to_article_id,
        item_popularity,
        max_history_items=max_history_items
    )
    
    overall_end = time.time()
    print(f"Pipeline completed in {overall_end - overall_start:.2f} seconds")
    print(f"Generated {len(predictions_df)} predictions")
    
    return predictions_df

# Example usage:
sample_size = 30000  # Adjust based on your needs
test_sample = test_behavior_data.sample(n=sample_size, random_state=42)
predictions_df = optimized_item_cf_pipeline(
    behavior_data, 
    test_sample,
    max_articles_per_user=100,  # Focus on most recent articles per user
    max_items=180000,           # Focus on most popular articles
    max_history_items=80       # Use only most recent history when predicting
)

Starting optimized item-based collaborative filtering pipeline...
Processing interactions...
Processed 12695274 interactions from 698365 users
Creating optimized item similarity matrix...
Computing item similarity (this might take a minute or two)...
Matrix creation completed in 9.26 seconds
Generating predictions...


100%|██████████| 30000/30000 [00:41<00:00, 723.67it/s]


Prediction generation completed in 41.93 seconds
Pipeline completed in 152.11 seconds
Generated 1128901 predictions


In [27]:
importlib.reload(eval)

eval.evaluate_mind_predictions(predictions_df, test_behavior_data)

{'auc': np.float64(0.5136020804347559),
 'mrr': np.float64(0.24575208262118636),
 'ndcg@5': np.float64(0.24858198129150502),
 'ndcg@10': np.float64(0.3130225646546266)}

## New Approach