In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Load the training-split behaviour log (tab-separated, no header row) and
# label its five columns.
behavior_data = pd.read_csv(
    "data/MINDlarge_train/behaviors.tsv",
    sep='\t',
    header=None,
).set_axis(
    ['impression_id', 'user_id', 'timestamp', 'history', 'impressions'],
    axis=1,
)

In [3]:
# Sanity check: (rows, columns) of the training behaviour log.
behavior_data.shape

(2232748, 5)

In [5]:
# Load the training-split article metadata (tab-separated, no header row) and
# label its eight columns.
news_data = pd.read_csv(
    "data/MINDlarge_train/news.tsv",
    sep='\t',
    header=None,
).set_axis(
    [
        'article_id',
        'category',
        'subcategory',
        'title',
        'abstract',
        'url',
        'title_entities',
        'abstract_entities',
    ],
    axis=1,
)

In [6]:
# Preview the first rows of the article metadata.
news_data.head()

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [7]:
# Parse the raw timestamp column into datetimes (behavior_data itself is left untouched here).
timestamps = pd.to_datetime(behavior_data["timestamp"])


# Display the timestamps in chronological order to see the period the log covers.
timestamps.sort_values()

472830    2019-11-09 00:00:00
526310    2019-11-09 00:00:02
709413    2019-11-09 00:00:03
1626675   2019-11-09 00:00:07
1104143   2019-11-09 00:00:13
                  ...        
110       2019-11-14 23:59:54
1402059   2019-11-14 23:59:54
1108375   2019-11-14 23:59:54
267634    2019-11-14 23:59:59
676390    2019-11-14 23:59:59
Name: timestamp, Length: 2232748, dtype: datetime64[ns]

In [8]:
# Building the baseline Model
# The baseline is a simple popularity model: it recommends to every user the articles that were most popular during the previous 24 hours.

# For every hour in the dataset, get a list of all the observed articles
# Get the count of each article


# Change timestamp to hourly granularity



In [9]:
# Build, for every hour in the training log, the set of article IDs that
# appear in the `history` column of any row logged during that hour.

# Work on a copy and drop rows containing any missing value, so every
# remaining `history` is a string.
df = behavior_data.dropna().copy()

# Parse timestamps and bucket each row into the hour it falls in.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.floor('h')

# One pass per hour group instead of a Python-level loop over every row:
# join the hour's histories into one string and split on whitespace to get
# the union of article IDs seen in that hour.
hour_to_articles = defaultdict(set)
for hour, histories in df.groupby('hour')['history']:
    hour_to_articles[hour] = set(' '.join(histories).split())

# One row per hour (chronological), indexed by hour, holding the ID set.
hourly_df = (
    pd.DataFrame({
        'hour': list(hour_to_articles.keys()),
        'article_ids': list(hour_to_articles.values()),
    })
    .sort_values('hour')
    .set_index('hour')
)

In [10]:
# Select the final 24 hours of the training log.

# NOTE: this overwrites the `timestamp` column on `behavior_data` itself with
# parsed datetimes.
behavior_data["timestamp"] = pd.to_datetime(behavior_data["timestamp"])

# The most recent timestamp in the log (no need to sort the whole frame for it).
latest_timestamp = behavior_data["timestamp"].max()

# Start of the look-back window: 1 day before the latest timestamp.
one_day_before = latest_timestamp - pd.Timedelta(days=1)
# Legacy name kept so existing references keep working; the window has always
# been one day, not one week.
one_week_before = one_day_before

# All entries within the (inclusive) interval
in_window = (behavior_data["timestamp"] >= one_day_before) & (
    behavior_data["timestamp"] <= latest_timestamp
)
interval_data = behavior_data[in_window]

interval_data.sort_values(by='timestamp')

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
742827,742828,U514076,2019-11-14 00:00:00,N117191 N94655,N18405-1 N24080-0 N3356-0 N72475-0 N94777-0 N8...
965758,965759,U536339,2019-11-14 00:00:00,N13970 N30388 N113001 N129836 N39785 N73137,N123683-0 N94777-1 N67398-0 N5287-0 N54296-0 N...
612269,612270,U282205,2019-11-14 00:00:00,N92619 N2076 N1940 N118990 N81921 N113167 N116...,N123683-0 N108269-0 N120332-0 N18405-0 N51569-...
1209758,1209759,U488247,2019-11-14 00:00:01,N3519 N26976 N69 N59828 N56272 N101851 N33271 ...,N53199-0 N46928-0 N41975-0 N88091-0 N76936-0 N...
664564,664565,U285602,2019-11-14 00:00:01,N41122 N85484 N63078 N20018 N49351 N81289 N171...,N96192-0 N120708-0 N34781-0 N35236-0 N23734-0 ...
...,...,...,...,...,...
1402059,1402060,U416061,2019-11-14 23:59:54,N35125 N48862 N33286 N84500 N16005,N34980-0 N63322-0 N51651-0 N125720-0 N77013-0 ...
110,111,U286542,2019-11-14 23:59:54,N57681 N112546 N53796 N62205 N116171 N85428 N3...,N82503-0 N82316-0 N26122-0 N19586-0 N80770-0 N...
1108375,1108376,U302411,2019-11-14 23:59:54,N95088,N82503-0 N78508-0 N83707-0 N82316-0 N67937-0 N...
676390,676391,U403739,2019-11-14 23:59:59,N119708 N54948 N112152 N94450 N68960 N39785 N8...,N8236-0 N82503-0 N80770-0 N118623-0 N83707-0 N...


In [11]:
# Flatten the space-separated `history` strings of the selected interval into
# one long Series with one article ID per entry (the original row index repeats
# for every ID that came from that row).
all_ids = interval_data["history"].str.split(" ").explode()
all_ids 

4           N65250
4          N122359
4           N71723
4           N53796
4           N41663
            ...   
2232740    N117074
2232740     N92073
2232740    N120854
2232740    N110982
2232740    N110550
Name: history, Length: 14487078, dtype: object

In [12]:
# Occurrence count per article ID, most frequent first.
article_counts = all_ids.value_counts()

# Keep the ten most frequent IDs; these form the popularity baseline.
popular_today = article_counts.head(10)
popular_today

history
N104737    57104
N91597     55604
N80126     44473
N9375      41396
N71977     41219
N54360     40820
N128965    39190
N1713      38154
N45124     36735
N128643    36473
Name: count, dtype: int64

In [13]:
# Look up the metadata of the popular articles. The boolean mask keeps rows in
# news_data order, so the result is not sorted by popularity.
news_data[news_data["article_id"].isin(popular_today.index)]

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
4417,N54360,news,newspolitics,Joe Biden reportedly denied Communion at a Sou...,Joe Biden has a complicated history with the C...,https://assets.msn.com/labs/mind/AAJwml6.html,"[{""Label"": ""Joe Biden"", ""Type"": ""P"", ""Wikidata...","[{""Label"": ""Catholic Church"", ""Type"": ""O"", ""Wi..."
4797,N1713,autos,autosnews,Cause determined in Jessi Combs' fatal speed r...,Occurred at speeds near 550 mph,https://assets.msn.com/labs/mind/AAJQHG7.html,"[{""Label"": ""Jessi Combs"", ""Type"": ""P"", ""Wikida...",[]
15209,N128965,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...",https://assets.msn.com/labs/mind/AAIORni.html,[],"[{""Label"": ""Pat Sajak"", ""Type"": ""P"", ""Wikidata..."
25992,N104737,movies,movies-celebrity,Kevin Spacey Won't Be Charged in Sexual Assaul...,The Los Angeles County District Attorney's Off...,https://assets.msn.com/labs/mind/AAJy6rv.html,"[{""Label"": ""Kevin Spacey"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Kevin Spacey"", ""Type"": ""P"", ""Wikid..."
27030,N128643,tv,tv-celebrity,"Miguel Cervantes' Wife Reveals Daughter, 3, 'D...",Miguel Cervantes' Wife Reveals Daughter 'Died ...,https://assets.msn.com/labs/mind/AAIJ2PE.html,"[{""Label"": ""Hospice"", ""Type"": ""C"", ""WikidataId...","[{""Label"": ""Hospice"", ""Type"": ""C"", ""WikidataId..."
32708,N91597,lifestyle,lifestylebuzz,Heidi Klum's 2019 Halloween Costume Transforma...,You might say she's scary good at playing dres...,https://assets.msn.com/labs/mind/AAJFlhi.html,"[{""Label"": ""Heidi Klum"", ""Type"": ""P"", ""Wikidat...","[{""Label"": ""Heidi Klum"", ""Type"": ""P"", ""Wikidat..."
38665,N71977,sports,basketball_nba,Former NBA first-round pick Jim Farmer arreste...,"Farmer, 55, was booked for trafficking a perso...",https://assets.msn.com/labs/mind/AAJBmut.html,"[{""Label"": ""Jim Farmer"", ""Type"": ""P"", ""Wikidat...","[{""Label"": ""Jim Farmer"", ""Type"": ""P"", ""Wikidat..."
50256,N80126,news,newscrime,Four flight attendants were arrested in Miami'...,Four American Airlines flight attendants were ...,https://assets.msn.com/labs/mind/AAJcQKF.html,[],"[{""Label"": ""Miami International Airport"", ""Typ..."
53978,N45124,news,newsopinion,The News In Cartoons,News as seen through the eyes of the nation's ...,https://assets.msn.com/labs/mind/AAJ7oYd.html,[],[]
67618,N9375,tv,tv-celebrity,"Woman, suspect dead at 'Tarzan' actor Ron Ely'...","Ron Ely, who portrayed Tarzan in the TV series...",https://assets.msn.com/labs/mind/AAIRnqy.html,"[{""Label"": ""Ron Ely"", ""Type"": ""P"", ""WikidataId...","[{""Label"": ""Ron Ely"", ""Type"": ""P"", ""WikidataId..."


In [14]:
# Load the test-split behaviour log (tab-separated, no header row) and label
# its columns with the same names used for the training split.
behavior_data_test = pd.read_csv(
    "data/MINDlarge_test/behaviors.tsv",
    sep='\t',
    header=None,
).set_axis(
    ['impression_id', 'user_id', 'timestamp', 'history', 'impressions'],
    axis=1,
)

In [15]:
# Preview the first rows of the test behaviour log.
behavior_data_test.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U64099,11/19/2019 11:37:45 AM,N121133 N104200 N43255 N55860 N128965 N38014 N...,N101071 N15647 N83400 N124838 N57092 N64623 N6...
1,2,U231077,11/19/2019 5:28:08 AM,N45124 N84730 N45128 N104312 N70022 N99111 N26...,N14657 N51253 N49521 N126571 N74286 N101071 N1...
2,3,U606012,11/19/2019 4:46:23 AM,N59893 N84662 N90686 N33265 N127225 N120859 N6...,N74286 N9250 N26898 N123737 N98301 N80580 N456...
3,4,U320649,11/21/2019 6:03:51 AM,N110863 N7889 N86335 N85056 N115743 N63372 N19...,N119559 N37657 N108085 N91287 N39136 N130190 N...
4,5,U357840,11/22/2019 10:36:19 AM,N98596 N85005 N15713 N67779 N47961 N55571 N666...,N60658 N43496 N65220 N9125 N63136 N83728 N3208...


In [16]:
# The article IDs selected as popular (the index of the counts Series).
popular_today.index

Index(['N104737', 'N91597', 'N80126', 'N9375', 'N71977', 'N54360', 'N128965',
       'N1713', 'N45124', 'N128643'],
      dtype='object', name='history')

In [17]:

# Count how many test rows have at least one of the popular articles in their
# click history.
popular_today_ids = set(popular_today.index)

# One boolean per row of behavior_data_test (i.e. per impression row, not per
# unique user): True when the row's history shares an ID with the popular set.
# Missing histories are not strings and therefore never match.
# Iterating the column directly avoids building a Series per row as
# DataFrame.iterrows() does.
user_id_matches = [
    isinstance(history, str)
    and not popular_today_ids.isdisjoint(history.split(" "))
    for history in behavior_data_test["history"]
]

sum(user_id_matches)


1181703

In [18]:
# Count missing values per column of the training behaviour log.
behavior_data.isna().sum()

impression_id        0
user_id              0
timestamp            0
history          46065
impressions          0
dtype: int64

In [19]:
# (rows, columns) of the training behaviour log, for comparison with the NA counts.
behavior_data.shape

(2232748, 5)

In [20]:
# Number of behaviour-log rows recorded for each user_id.
user_obs = behavior_data.groupby("user_id").size()

# How many distinct users appear in the log.
n_unique = behavior_data["user_id"].nunique()
print(f"Number of unique users: {n_unique}")

# Histogram of rows-per-user; the y axis is logarithmic.
fig = px.histogram(
    x=user_obs.values,
    nbins=50,
    log_y=True,
    labels={'x': 'Number of Observations', 'y': 'Count of Users'},
    title="Distribution of Number of Observations per User",
)
fig.update_layout(bargap=0.1)
fig.show()

Number of unique users: 711222


In [21]:
# Independent copy of the behaviour log whose `timestamp` column holds parsed
# datetimes, used by the time-based plots.
time_data = behavior_data.assign(
    timestamp=pd.to_datetime(behavior_data['timestamp'])
)

In [22]:
# Number of behaviour-log rows per calendar day.
observation_dates = time_data['timestamp'].dt.date
daily_obs = time_data.groupby(observation_dates).size()

# Line chart of the daily counts.
fig = px.line(
    x=daily_obs.index,
    y=daily_obs.values,
    labels={'x': 'Date', 'y': 'Number of Observations'},
    title="Daily Observations",
)
# Tilt and size the x tick labels; as the cell's last expression, the value
# returned here is what the notebook displays.
fig.update_layout(xaxis_tickangle=-45, xaxis_tickfont=dict(size=12))

In [23]:
# Number of behaviour-log rows per hour of day, pooled over all days.
hour_of_day = time_data['timestamp'].dt.hour
hourly_obs = time_data.groupby(hour_of_day).size()

# Bar chart of the per-hour counts.
fig = px.bar(
    x=hourly_obs.index,
    y=hourly_obs.values,
    labels={'x': 'Hour', 'y': 'Number of Observations'},
    title="Hourly Observations",
)
fig.show()

In [24]:
# Distribution of click-history length, estimated from a reproducible random
# sample of at most 10,000 behaviour-log rows.
sample_size = min(10000, len(behavior_data))
sampled_data = behavior_data.sample(random_state=42, n=sample_size)

# History length = number of space-separated article IDs in `history`.
history_tokens = sampled_data['history'].str.split(' ')
history_length = history_tokens.str.len()

fig = px.box(
    y=history_length,
    labels={'y': 'History Length'},
    title="Distribution of History Length (10k sample)",
)
fig.show()


In [25]:
# Same history-length box plot, restricted to rows whose history holds fewer
# than 100 article IDs so the bulk of the distribution is readable.
history_counts = behavior_data['history'].str.split(' ').str.len()
filtered_data = behavior_data[history_counts < 100]

# Reproducible random sample of at most 10,000 of the remaining rows.
sample_size = min(10000, len(filtered_data))
sampled_data = filtered_data.sample(random_state=42, n=sample_size)

# History length = number of space-separated article IDs in `history`.
history_tokens = sampled_data['history'].str.split(' ')
history_length = history_tokens.str.len()

fig = px.box(
    y=history_length,
    labels={'y': 'History Length'},
    title="Distribution of History Length (10k sample, <100 entries)",
)
fig.show()
