In [53]:
import numpy as np
import pandas as pd
import os
import sys
import zipfile
import subprocess

from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from tqdm.notebook import tqdm
from copy import deepcopy

import json

In [54]:
# Name of the dataset folder.
DATASET = 'MIND_large'
# Raw files are read from data/<DATASET>, four directory levels above the working directory.
RAW_PATH = os.path.join(*(['..'] * 4), 'data', DATASET)

# Notebook-level constants.
RANDOM_SEED = 0
NEG_ITEMS = 99

# Load data

1. Load interaction data and item metadata
2. Filter out users and items with fewer than 5 interactions
3. Calculate basic statistics

In [55]:
# Please download the training and validation set from https://msnews.github.io/
# and copy MINDlarge.zip and MINDlarge_dev.zip to the *MIND_large* dir
print('Unzip files...')
# The context manager closes the archive even when an extraction fails part-way.
with zipfile.ZipFile(os.path.join(RAW_PATH,'MINDlarge.zip'),'r') as f:
    train_dir = os.path.join(RAW_PATH,'train')
    os.makedirs(train_dir,exist_ok=True)
    # Extract member by member so each file name is echoed as it is written.
    for file in f.namelist():
        print("Extract %s"%(file))
        f.extract(file,train_dir)

Unzip files...
Extract entity_embedding.vec
Extract news.tsv
Extract relation_embedding.vec
Extract behaviors.tsv


In [56]:
# Unpack the validation archive into <RAW_PATH>/dev.
# The context manager closes the archive even when an extraction fails part-way.
with zipfile.ZipFile(os.path.join(RAW_PATH,'MINDlarge_dev.zip'),'r') as f:
    dev_dir = os.path.join(RAW_PATH,'dev')
    os.makedirs(dev_dir,exist_ok=True)
    # Extract member by member so each file name is echoed as it is written.
    for file in f.namelist():
        print("Extract %s"%(file))
        f.extract(file,dev_dir)

Extract entity_embedding.vec
Extract news.tsv
Extract relation_embedding.vec
Extract behaviors.tsv


In [57]:
# NOTE(review): this re-import rebinds `tqdm` from the notebook widget imported in the
# first cell to the console progress bar for every later cell — confirm this is intended.
from tqdm import tqdm
import numpy as np

chunk_size = 5000  # Increase chunk size to reduce iterations
interactions = []  # one [session_id, user_id, time_str, news_id, label] row per impression
user_freq, item_freq = dict(), dict()  # click counts (label == 1) per user / per item

max_rows = 1000000  # Limit the number of rows to process

for d in [os.path.join(RAW_PATH, 'train'), os.path.join(RAW_PATH, 'dev')]:
    file = os.path.join(d, "behaviors.tsv")

    # Count lines for the progress bar. Binary mode skips text decoding, and the
    # context manager closes the handle (a bare open() here was never closed).
    with open(file, 'rb') as line_source:
        total_lines = min(sum(1 for _ in line_source), max_rows)

    with tqdm(total=total_lines, desc=f"Processing {file}", unit="lines", mininterval=1.0) as pbar:
        for chunk in pd.read_csv(file, sep="\t", header=None, chunksize=chunk_size, nrows=max_rows):
            # itertuples yields plain 5-tuples and avoids building a Series per row.
            # The fourth column is read but not used.
            for sid, uid, time_str, _, impressions in chunk.itertuples(index=False, name=None):
                # Each impression token has the form "<news_id>-<label>".
                impressions = [imp.split("-") for imp in impressions.split(" ")]

                # Update interactions
                interactions.extend([sid, uid, time_str, iid, label] for iid, label in impressions)

                # Update frequencies for positive interactions
                for iid, label in impressions:
                    if int(label) == 1:
                        user_freq[uid] = user_freq.get(uid, 0) + 1
                        item_freq[iid] = item_freq.get(iid, 0) + 1

            # Update progress bar for processed chunk size
            pbar.update(len(chunk))

# Sampling a fraction of interactions if needed
SAMPLE_FRACTION = 0.01  # Process only 1% of data to reduce memory usage
# Seeded generator so the sample is reproducible across runs.
rng = np.random.default_rng(RANDOM_SEED)
sample_size = int(len(interactions) * SAMPLE_FRACTION)
sample_idx = rng.choice(len(interactions), sample_size, replace=False)
# Index the Python list directly instead of round-tripping through one large string
# array; rows therefore keep their parsed types (session ids are no longer turned into str).
interactions = [interactions[idx] for idx in sample_idx]

# Filter to retain only the top MAX_USERS and MAX_ITEMS by frequency
MAX_USERS = 1000  # Limit the number of users
MAX_ITEMS = 300  # Limit the number of items

# Retain only the top users and items based on frequency
top_users = set(sorted(user_freq, key=user_freq.get, reverse=True)[:MAX_USERS])
top_items = set(sorted(item_freq, key=item_freq.get, reverse=True)[:MAX_ITEMS])

# Filter interactions to include only top users and items
filtered_interactions = [
    interaction for interaction in interactions
    if interaction[1] in top_users and interaction[3] in top_items
]

# Update interactions and frequencies with filtered data
interactions = filtered_interactions
# NOTE(review): these counts were accumulated over the full, unsampled data, so they
# overstate the clicks that remain in `interactions`; the k-core cell that follows
# starts from them. Confirm whether they should be recomputed from `interactions`.
user_freq = {u: user_freq[u] for u in top_users}
item_freq = {i: item_freq[i] for i in top_items}


Processing ..\..\..\..\data\MIND_large\train\behaviors.tsv: 100%|██████████| 1000000/1000000 [01:16<00:00, 13000.33lines/s]
Processing ..\..\..\..\data\MIND_large\dev\behaviors.tsv: 100%|██████████| 1000000/1000000 [01:54<00:00, 8709.15lines/s]
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001380384C880>>
Traceback (most recent call last):
  File "C:\Users\Max\Experiment-Design-Project\venv\lib\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
KeyboardInterrupt: 


In [58]:
# Shallow snapshot: a new outer list whose row objects are shared with `interactions`.
interactions_original = list(interactions)

In [59]:
# Iterative k-core filtering on click counts (k = MIN_INTERACTIONS)
MIN_INTERACTIONS = 5  # minimum number of clicks a user / an item needs in order to be kept


def _keys_meeting_threshold(freq):
    """Return the keys of `freq` whose count is at least MIN_INTERACTIONS, in dict order."""
    return [key for key in freq if freq[key] >= MIN_INTERACTIONS]


# Step 1: users and items that already satisfy the threshold
select_uid = _keys_meeting_threshold(user_freq)
select_iid = _keys_meeting_threshold(item_freq)

print("User: %d/%d, Item: %d/%d" % (len(select_uid), len(user_freq), len(select_iid), len(item_freq)))

# Step 2: keep pruning until a pass drops neither a user nor an item.
# When every counted user and item already passes, the loop body never runs and
# `interactions` is left exactly as it was.
while not (len(select_uid) >= len(user_freq) and len(select_iid) >= len(item_freq)):
    # Sets give constant-time membership tests in the scan below
    kept_users, kept_items = set(select_uid), set(select_iid)

    # Counts and surviving rows are rebuilt from scratch on every pass
    user_freq, item_freq = {}, {}
    interactions_5core = []

    for line in tqdm(interactions):
        uid, iid, label = line[1], line[3], line[-1]
        if uid not in kept_users or iid not in kept_items:
            continue
        interactions_5core.append(line)
        # Only clicks (label 1) count towards the threshold
        if int(label) == 1:
            user_freq[uid] = user_freq.get(uid, 0) + 1
            item_freq[iid] = item_freq.get(iid, 0) + 1

    # The surviving rows become the input of the next pass
    interactions = interactions_5core

    select_uid = _keys_meeting_threshold(user_freq)
    select_iid = _keys_meeting_threshold(item_freq)

    print("User: %d/%d, Item: %d/%d" % (len(select_uid), len(user_freq), len(select_iid), len(item_freq)))

# Final summary of filtered interactions
print("Selected Interactions: %d, Users: %d, Items: %d" % (len(interactions), len(select_uid), len(select_iid)))


User: 1000/1000, Item: 300/300
Selected Interactions: 3186, Users: 1000, Items: 300


In [60]:
# Trim malformed rows in place: any row with more than five fields loses its last field
interactions[:] = [row[:-1] if len(row) > 5 else row for row in interactions]

In [61]:
# Parse each row's time string (field 2) into a datetime and a POSIX timestamp
format_t = '%m/%d/%Y %I:%M:%S %p'
ts, time = [], []
for row in tqdm(interactions):
    parsed = datetime.strptime(row[2], format_t)
    ts.append(parsed)
    # NOTE(review): `parsed` is a naive datetime, so timestamp() interprets it in the
    # machine's local timezone — confirm this is acceptable for the time column.
    time.append(parsed.timestamp())

100%|██████████| 3186/3186 [00:00<00:00, 105663.51it/s]


In [62]:
# Build the interaction table and derive calendar context from each timestamp
interaction_df = pd.DataFrame(
    interactions,
    columns = ["session_id","user_id","time_str","news_id","label"],
).assign(time=time, timestamp=ts)

# One derived column per extractor, added in this order: hour, weekday, date
for context_col, extract in (
    ('hour', lambda stamp: stamp.hour),
    ('weekday', lambda stamp: stamp.weekday()),
    ('date', lambda stamp: stamp.date()),
):
    interaction_df[context_col] = interaction_df['timestamp'].apply(extract)

def get_time_range(hour): # according to the Britannica dictionary
    """Map an hour of the day to a period code.

    Codes by hour: 5-8 -> 0, 9-10 -> 1, 11-12 -> 2, 13-15 -> 3, 16-17 -> 4,
    18-19 -> 5, 20-21 -> 6, 22 and later -> 7, anything else (0-4 am) -> 8.
    """
    # https://www.britannica.com/dictionary/eb/qa/parts-of-the-day-early-morning-late-morning-etc
    if 5 <= hour <= 8:
        period = 0
    elif 8 < hour < 11:
        period = 1
    elif 11 <= hour <= 12:
        period = 2
    elif 12 < hour <= 15:
        period = 3
    elif 15 < hour <= 17:
        period = 4
    elif 18 <= hour <= 19:
        period = 5
    elif 19 < hour <= 21:
        period = 6
    elif hour > 21:
        period = 7
    else:
        period = 8 # 0-4 am
    return period

# Bucket each hour into a period code, then express each date as days since the earliest date
interaction_df['period'] = interaction_df['hour'].apply(get_time_range)
min_date = interaction_df['date'].min()
day_offsets = interaction_df['date'] - min_date
interaction_df['day'] = day_offsets.apply(lambda delta: delta.days)

In [63]:
# Save the filtered interactions; the file name embeds MIN_INTERACTIONS
# (e.g. "interaction_5core.csv" when MIN_INTERACTIONS is 5).
interaction_df.to_csv(f"interaction_{MIN_INTERACTIONS}core.csv",index=False)

----
# Prepare data for CTR & Reranking task

1. Rename and organize all interaction features
2. Split dataset into training, validation, and test; Save interaction files (same time indicates same impression)
3. Organize item metadata

In [64]:
# Output folder for the CTR / reranking files; created if it does not exist yet.
CTR_PATH = './MINDCTR/'
os.makedirs(name=CTR_PATH, exist_ok=True)

In [65]:
# Copy the interaction table, rename the context columns, and re-id users and items.
# rename() returns a new frame, so `interaction_df` itself is left untouched.
interaction_ctr = interaction_df.rename(
    columns={'hour':'c_hour_c','weekday':'c_weekday_c','period':'c_period_c','day':'c_day_f',
             'user_id':'original_user_id'})

# New ids are consecutive integers starting at 1, assigned in sorted order of the original ids.
user2newid_ctr = dict(zip(sorted(interaction_ctr.original_user_id.unique()),
                      range(1,interaction_ctr.original_user_id.nunique()+1)))
interaction_ctr['user_id'] = interaction_ctr.original_user_id.map(user2newid_ctr)

item2newid_ctr = dict(zip(sorted(interaction_ctr.news_id.unique()),
                      range(1,interaction_ctr.news_id.nunique()+1)))
interaction_ctr['item_id'] = interaction_ctr['news_id'].map(item2newid_ctr)
interaction_ctr = interaction_ctr.sort_values(by=['user_id','time']).reset_index(drop=True)

# Write the id mappings; `with` guarantees each file is flushed and closed
# (passing a bare open() to json.dump left the handles open).
with open(os.path.join(CTR_PATH,"user2newid.json"),'w') as mapping_file:
    json.dump(user2newid_ctr,mapping_file)
with open(os.path.join(CTR_PATH,"item2newid.json"),'w') as mapping_file:
    json.dump(item2newid_ctr,mapping_file)

In [66]:
# Report the number of distinct values of the id columns and of every context ("c_") column
count_columns = [
    col for col in interaction_ctr.columns
    if col.startswith('c_') or col in ('user_id','item_id')
]
for col in count_columns:
    print(col, interaction_ctr[col].nunique())

c_hour_c 24
c_weekday_c 6
c_period_c 9
c_day_f 6
user_id 912
item_id 299


In [67]:
# Split by day: rows up to day `split_time1` form the training set; later rows are
# divided by session into validation (earlier half) and test (later half).
split_time1 = 5
train = interaction_ctr.loc[interaction_ctr.c_day_f<=split_time1].copy()
val_test = interaction_ctr.loc[interaction_ctr.c_day_f>split_time1].sort_values(by='time')

# Unique session ids in order of first appearance. De-duplicating globally (rather than
# only against the previous row) ensures a session whose rows are interleaved with
# another session at the same timestamp is listed once and cannot fall into both halves.
sessionbyTime = list(dict.fromkeys(val_test.session_id))
half = len(sessionbyTime)//2
val = val_test.loc[val_test.session_id.isin(sessionbyTime[:half])].copy()
test = val_test.loc[val_test.session_id.isin(sessionbyTime[half:])].copy()

# Validation and test rows are kept even when their user or item does not occur
# in the training set (no filtering is applied for this task).
train_u, train_i = set(train.user_id.unique()), set(train.item_id.unique())
val_sel = val.copy()  # Do not filter validation set
test_sel = test.copy()  # Do not filter test set
if val_sel.empty or test_sel.empty:
    # An empty split otherwise only shows up as zeros in the summary below.
    print("WARNING: empty validation or test split (rows after day %d: %d, sessions: %d)"
          % (split_time1, len(val_test), len(sessionbyTime)))
print("Train user: %d, item: %d"%(len(train_u),len(train_i)))
print("Validation user: %d, item:%d"%(val_sel.user_id.nunique(),val_sel.item_id.nunique()))
print("Test user: %d, item:%d"%(test_sel.user_id.nunique(),test_sel.item_id.nunique()))

Train user: 912, item: 299
Validation user: 0, item:0
Test user: 0, item:0


In [68]:
# Write the three CTR splits as tab-separated files with the same column selection
select_columns = ['user_id','item_id','time','label','c_hour_c','c_weekday_c','c_period_c','c_day_f']
for split_df, file_name in ((train,'train.csv'),(val_sel,'dev.csv'),(test_sel,'test.csv')):
    split_df[select_columns].to_csv(os.path.join(CTR_PATH,file_name),sep="\t",index=False)

In [69]:
# Build the item metadata table for the CTR task and save it.
# The news file has no header row, so column names are assigned after loading.
news_path = os.path.join(RAW_PATH,'train',"news.tsv")
item_meta_train = pd.read_csv(news_path,sep="\t",header=None)
item_meta_train.columns = ['news_id','category','subcategory','title','abstract','url','title_entitiy','abstract_entity']

# Keep only news that occur in the CTR interactions and attach their new item ids
used_news = interaction_ctr.news_id.unique()
item_select = item_meta_train.loc[item_meta_train.news_id.isin(used_news)].copy()
item_select['item_id'] = item_select.news_id.apply(lambda news: item2newid_ctr[news])

# Encode category and subcategory as consecutive integers starting at 1, in sorted name order
category_names = sorted(item_select.category.unique())
category2id = dict(zip(category_names,range(1,item_select.category.nunique()+1)))
subcategory_names = sorted(item_select.subcategory.unique())
subcategory2id = dict(zip(subcategory_names,range(1,item_select.subcategory.nunique()+1)))
item_select['i_category_c'] = item_select['category'].apply(lambda name: category2id[name])
item_select['i_subcategory_c'] = item_select['subcategory'].apply(lambda name: subcategory2id[name])

meta_columns = ['item_id','i_category_c','i_subcategory_c']
item_select[meta_columns].to_csv(os.path.join(CTR_PATH,'item_meta.csv'),sep="\t",index=False)

# Prepare data for Top-k Recommendation Task
1. Rename all interaction features
2. Split dataset into training, validation, and test
3. Re-assign IDs to user, item, and context; Save interaction files
4. Organize item metadata

In [70]:
# Output folder for the top-k recommendation files; created if it does not exist yet.
TOPK_PATH = './MINDTOPK/'
os.makedirs(name=TOPK_PATH, exist_ok=True)

In [71]:
# Reload the table saved earlier in this notebook. The name is built from MIN_INTERACTIONS,
# the same way it was written, so changing the threshold cannot silently load a stale file.
interaction_df = pd.read_csv(f"interaction_{MIN_INTERACTIONS}core.csv")

In [72]:
# Preview the first two rows of the reloaded table.
interaction_df.head(2)

Unnamed: 0,session_id,user_id,time_str,news_id,label,time,timestamp,hour,weekday,date,period,day
0,171474,U140470,11/13/2019 8:32:50 PM,N5287,0,1573674000.0,2019-11-13 20:32:50,20,2,2019-11-13,6,4
1,708259,U96447,11/12/2019 8:10:46 PM,N122819,0,1573586000.0,2019-11-12 20:10:46,20,1,2019-11-12,6,3


In [73]:
# Keep only clicked rows (label == 1) and rename the context columns.
# rename() returns a new frame, so `interaction_df` itself is not modified.
clicked_mask = interaction_df.label==1
interaction_pos = interaction_df.loc[clicked_mask].rename(
    columns={'hour':'c_hour_c','weekday':'c_weekday_c','period':'c_period_c','day':'c_day_f',
             'user_id':'original_user_id'})

In [74]:
# Split by day: rows up to day `split_time1` form the training set; later rows are
# divided by session into validation (earlier half) and test (later half).
split_time1 = 5
train = interaction_pos.loc[interaction_pos.c_day_f<=split_time1].copy()
val_test = interaction_pos.loc[interaction_pos.c_day_f>split_time1].sort_values(by='time')

# Unique session ids in order of first appearance. De-duplicating globally (rather than
# only against the previous row) ensures a session whose rows are interleaved with
# another session at the same timestamp is listed once and cannot fall into both halves.
sessionbyTime = list(dict.fromkeys(val_test.session_id))
half = len(sessionbyTime)//2
val = val_test.loc[val_test.session_id.isin(sessionbyTime[:half])].copy()
test = val_test.loc[val_test.session_id.isin(sessionbyTime[half:])].copy()

# Drop validation/test rows whose user or item does not occur in the training set
train_u, train_i = set(train.original_user_id.unique()), set(train.news_id.unique())
val_sel = val.loc[(val.original_user_id.isin(train_u))&(val.news_id.isin(train_i))].copy()
test_sel = test.loc[(test.original_user_id.isin(train_u))&(test.news_id.isin(train_i))].copy()
if val_sel.empty or test_sel.empty:
    # An empty split otherwise only shows up as zeros in the summary below.
    print("WARNING: empty validation or test split (rows after day %d: %d, sessions: %d)"
          % (split_time1, len(val_test), len(sessionbyTime)))
print("Train user: %d, item: %d"%(len(train_u),len(train_i)))
print("Validation user: %d, item:%d"%(val_sel.original_user_id.nunique(),val_sel.news_id.nunique()))
print("Test user: %d, item:%d"%(test_sel.original_user_id.nunique(),test_sel.news_id.nunique()))

Train user: 245, item: 172
Validation user: 0, item:0
Test user: 0, item:0


In [75]:
# Assign ids for users and items (to generate continuous ids).
# New ids are consecutive integers starting at 1, in sorted order of the original ids
# found in the union of the three splits.
all_df = pd.concat([train,val_sel,test_sel],axis=0)
user2newid_topk = dict(zip(sorted(all_df.original_user_id.unique()),
                      range(1,all_df.original_user_id.nunique()+1)))
item2newid_topk = dict(zip(sorted(all_df.news_id.unique()),
                      range(1,all_df.news_id.nunique()+1)))

# Attach the new ids to every split and to the combined frame
for df in [train,val_sel,test_sel,all_df]:
    df['user_id'] = df.original_user_id.map(user2newid_topk)
    df['item_id'] = df['news_id'].map(item2newid_topk)

# Write the id mappings; `with` guarantees each file is flushed and closed
# (passing a bare open() to json.dump left the handles open).
with open(os.path.join(TOPK_PATH,"user2newid.json"),'w') as mapping_file:
    json.dump(user2newid_topk,mapping_file)
with open(os.path.join(TOPK_PATH,"item2newid.json"),'w') as mapping_file:
    json.dump(item2newid_topk,mapping_file)

In [76]:
# generate negative items
def generate_negative(data_df,all_items,clicked_item_set,random_seed,neg_item_num=5):
    """Sample `neg_item_num` distinct negative items for every row of `data_df`.

    A negative item for a row is an element of `all_items` that is not in
    `clicked_item_set[user_id]` for that row's user and is not repeated within the row.

    Args:
        data_df: DataFrame with a 'user_id' column; one list of negatives is produced per row.
        all_items: 1-D array-like of candidate item ids.
        clicked_item_set: dict mapping a user id to the set of item ids excluded for that user.
        random_seed: seed passed to np.random.seed before sampling.
        neg_item_num: number of negatives per row.

    Returns:
        A list with len(data_df) entries, each a list of `neg_item_num` item ids.

    Raises:
        KeyError: if a user id in `data_df` is missing from `clicked_item_set`.
        ValueError: if a user has fewer than `neg_item_num` non-clicked candidates;
            the rejection loop below could otherwise never terminate.
    """
    np.random.seed(random_seed)
    all_items = np.asarray(all_items)
    candidate_pool = set(all_items.tolist())
    # Draw everything at once, then repair the entries that collide.
    neg_items = np.random.choice(all_items, (len(data_df),neg_item_num))
    for i, uid in tqdm(enumerate(data_df['user_id'].values)):
        user_clicked = clicked_item_set[uid]
        available = len(candidate_pool - user_clicked)
        if available < neg_item_num:
            raise ValueError(
                "user %r has only %d candidate negative items, %d requested"
                % (uid, available, neg_item_num))
        # Items that may not be used for this row: the user's clicks plus the
        # negatives already accepted at earlier positions of the row.
        taken = set(user_clicked)
        for j in range(neg_item_num):
            while neg_items[i][j] in taken:
                # Scalar draw: assigning a 1-element array into a scalar slot is
                # deprecated in recent NumPy releases.
                neg_items[i][j] = np.random.choice(all_items)
            taken.add(neg_items[i][j])
    return neg_items.tolist()

# Per user, the set of item ids that user has in `all_df`; these are excluded
# from that user's negatives.
clicked_item_set = {
    user_id: set(seq_df['item_id'].values.tolist())
    for user_id, seq_df in all_df.groupby('user_id')
}
all_items = all_df.item_id.unique()

# Different seeds for the two splits; the default number of negatives per row is used
val_sel['neg_items'] = generate_negative(val_sel,all_items,clicked_item_set,random_seed=1)
test_sel['neg_items'] = generate_negative(test_sel,all_items,clicked_item_set,random_seed=2)

0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [77]:
# Write the top-k splits as tab-separated files; dev and test also carry their negatives
select_columns = ['user_id','item_id','time','c_hour_c','c_weekday_c','c_period_c','c_day_f']
train[select_columns].to_csv(os.path.join(TOPK_PATH,'train.csv'),sep="\t",index=False)
for split_df, file_name in ((val_sel,'dev.csv'),(test_sel,'test.csv')):
    split_df[select_columns+['neg_items']].to_csv(os.path.join(TOPK_PATH,file_name),sep="\t",index=False)

In [78]:
# Build the item metadata table for the top-k task and save it.
# The news file has no header row, so column names are assigned after loading.
item_meta_train = pd.read_csv(os.path.join(RAW_PATH,'train',"news.tsv"),sep="\t",header=None)
item_meta_train.columns = ['news_id','category','subcategory','title','abstract','url','title_entitiy','abstract_entity']

# Select exactly the news that received a top-k item id and map them with the TOP-K
# mapping. The CTR mapping was used here before, which produced item ids that do not
# match the ones written to the top-k train/dev/test files.
item_select = item_meta_train.loc[item_meta_train.news_id.isin(list(item2newid_topk))].copy()
item_select['item_id'] = item_select.news_id.map(item2newid_topk)

# Encode category and subcategory as consecutive integers starting at 1, in sorted name order
category2id = dict(zip(sorted(item_select.category.unique()),range(1,item_select.category.nunique()+1)))
subcategory2id = dict(zip(sorted(item_select.subcategory.unique()),range(1,item_select.subcategory.nunique()+1)))
item_select['i_category_c'] = item_select['category'].apply(lambda x: category2id[x])
item_select['i_subcategory_c'] = item_select['subcategory'].apply(lambda x: subcategory2id[x])
item_select[['item_id','i_category_c','i_subcategory_c']].to_csv(
    os.path.join(TOPK_PATH,'item_meta.csv'),sep="\t",index=False)