In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Loading the datasets

In [2]:
# The file is read with header=None, so the column labels are assigned explicitly
# right after loading.
news_columns = [
    'article_id',
    'category',
    'subcategory',
    'title',
    'abstract',
    'url',
    'title_entities',
    'abstract_entities',
]
news_data = pd.read_csv("data/MINDlarge_train/news.tsv", sep='\t', header=None)
news_data.columns = news_columns

news_data.head()

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [3]:
# The file is read with header=None, so the column labels are assigned explicitly
# right after loading.
behavior_columns = [
    'impression_id',
    'user_id',
    'timestamp',
    'history',
    'impressions',
]
behavior_data = pd.read_csv("data/MINDlarge_train/behaviors.tsv", sep='\t', header=None)
behavior_data.columns = behavior_columns

behavior_data.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


## Build the user-item matrix from the click histories

In [5]:
# Build one (user_id, article_id) row per article listed in each row's history.
# `history` is a whitespace-separated string of article IDs; rows whose history
# is missing or blank contribute nothing.
#
# This uses vectorized pandas operations instead of a Python loop over
# `iterrows()` that appends one dict per click: the loop is very slow and
# memory-hungry when behavior_data has many rows. Row order and duplicate
# (user, article) pairs are kept exactly as the loop would have produced them.
interactions_df = (
    behavior_data[['user_id', 'history']]
    .dropna(subset=['history'])
    # astype(str) keeps the .str accessor usable even if no history survives dropna
    .assign(article_id=lambda frame: frame['history'].astype(str).str.split())
    .drop(columns='history')
    .explode('article_id')           # one row per article; empty lists become NaN
    .dropna(subset=['article_id'])   # discard rows that came from blank histories
    .reset_index(drop=True)
)
print(interactions_df.head())

  user_id article_id
0  U87243      N8668
1  U87243     N39081
2  U87243     N65259
3  U87243     N79529
4  U87243     N73408


In [6]:
# Every recorded click counts as one unit of implicit feedback.
interactions_df['interaction'] = 1

# Users become rows and articles become columns; user/article pairs with no
# recorded interaction are filled with 0.
# NOTE(review): the result is a dense users x articles frame, which may need a
# lot of memory on a large dataset -- confirm it fits before running.
user_item_matrix = pd.pivot_table(
    interactions_df,
    values='interaction',
    index='user_id',
    columns='article_id',
    fill_value=0,
)

# Peek at the first few users
user_item_matrix.head()

  user_item_matrix = interactions_df.pivot_table(index='user_id',


: 

In [None]:
# Item-item cosine similarity.
# After transposing, each row is one article described by its per-user
# interaction values, so the pairwise similarities are between articles.
item_vectors = user_item_matrix.T
article_ids = item_vectors.index  # same labels as user_item_matrix.columns
item_similarity = cosine_similarity(item_vectors)

# Label both axes of the similarity matrix with the article IDs
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=article_ids,
    columns=article_ids,
)

# Show the first few rows of the similarity matrix
item_similarity_df.head()

In [None]:
def get_item_recommendations(user_id, user_item_matrix, item_similarity_df, top_n=5):
    """
    Generate top-n item recommendations for a given user using item-item collaborative filtering.

    Each candidate item is scored by summing its similarity to every item the
    user has interacted with; items the user already interacted with are
    excluded from the result.

    Parameters:
      user_id: The user identifier as in the user_item_matrix index.
      user_item_matrix: DataFrame where rows are users and columns are items.
      item_similarity_df: DataFrame with precomputed cosine similarity between items
        (items on both the index and the columns).
      top_n: Number of recommendations to return.

    Returns:
      A list of recommended article IDs, best first. The list is empty when the
      user has no interactions, and may be shorter than top_n when there are
      fewer candidate items.

    Raises:
      KeyError: if user_id is not in user_item_matrix.index, or if an item the
        user interacted with is not a column of item_similarity_df.
    """
    # Get the items the user has interacted with
    user_data = user_item_matrix.loc[user_id]
    interacted_items = user_data[user_data > 0].index.tolist()

    # No history means there is nothing to base a recommendation on
    if not interacted_items:
        return []

    # Sum, for every candidate (row), its similarity to each interacted item
    # (column). This replaces a nested Python loop that relied on
    # Series.iteritems(), which was removed in pandas 2.0.
    scores = item_similarity_df[interacted_items].sum(axis=1)

    # Exclude items the user already interacted with
    scores = scores.drop(index=interacted_items, errors='ignore')

    # Stable sort so tied candidates keep the order of item_similarity_df's index
    ranked = scores.sort_values(ascending=False, kind='mergesort')

    # Return the top-n article IDs
    return ranked.index[:top_n].tolist()


In [None]:
# Use the first user in the matrix index as a demonstration
sample_user = user_item_matrix.index[0]

# Ask for the five highest-scoring articles for that user
recommended_articles = get_item_recommendations(
    user_id=sample_user,
    user_item_matrix=user_item_matrix,
    item_similarity_df=item_similarity_df,
    top_n=5,
)

print(f"Recommendations for user {sample_user}:")
print(recommended_articles)
