In [24]:
import pandas as pd
import ast

# df = pd.read_csv('dataset/aol4foltr/metadata.csv', parse_dates=['timestamp'], dtype={'user_id': str})
df = pd.read_csv('dataset/aol_dataset_top10000.csv', parse_dates=['time'], low_memory=False)
df['candidate_doc_ids'] = df['candidate_doc_ids'].apply(ast.literal_eval)

# Factsheet

In [26]:
# Basic statistics about the dataset
print(f"Number of query logs: {len(df):,}")
print(f"Number of unique queries: {df['query'].nunique():,}")
print(f"Number of unique clicked documents: {df['doc_id'].nunique():,}") 
print(f"Number of unique candidate documents: {len(set().union(*df['candidate_doc_ids'])):,}")
print(f"Number of unique users: {df['user_id'].nunique():,}")


Number of query logs: 2,594,705
Number of unique queries: 637,996
Number of unique clicked documents: 428,157
Number of unique candidate documents: 1,294,562
Number of unique users: 10,000


# Top-10 Users

In [3]:
# Get top 10 users by number of queries
user_stats = df.groupby('user_id').agg({
    'query': ['count', 'nunique']
}).sort_values(('query', 'count'), ascending=False).head(10)

# Format the output string
for user_id, (total_queries, unique_queries) in user_stats.iterrows():
    print(f"User {user_id}: {total_queries:,} queries ({unique_queries:,} unique)")

User 71845: 76,432 queries (33,497 unique)
User 2263543: 3,626 queries (428 unique)
User 137248: 2,985 queries (107 unique)
User 3318459: 2,781 queries (388 unique)
User 497336: 2,427 queries (97 unique)
User 42075: 2,338 queries (288 unique)
User 22661144: 2,123 queries (118 unique)
User 1306380: 2,105 queries (310 unique)
User 1901902: 2,097 queries (140 unique)
User 1308574: 2,032 queries (60 unique)


In [9]:
# Get number of queries for 100th most active user
user_query_counts = df.groupby('user_id')['query'].count().sort_values(ascending=False)
user_100 = user_query_counts.iloc[99]  # 0-based indexing
print(f"The 100th most active user made {user_100:,} queries")


Top users account for 26.5% of all queries
The 10000th user has 146 queries


In [27]:
import pandas as pd
import argparse
from tqdm import tqdm
import ir_datasets
from pyserini.search.lucene import LuceneSearcher
import random
from collections import defaultdict

dataset = ir_datasets.load("aol-ia")
searcher = LuceneSearcher('indexes/docs_jsonl')

docs_store = dataset.docs_store()

### FILTER QLOGS ###

qlogs = []
for qlog in tqdm(dataset.qlogs_iter(), total=dataset.qlogs_count(), desc="Prepare qlogs"):
    if qlog.query.strip() == '':
        continue
    if len(qlog.items) != 1:
        continue
    qlogs.append({
        'query_id': qlog.query_id,
        'query': qlog.query.strip().lower(),
        'timestamp': qlog.time,
        'user_id': qlog.user_id,
        'target_doc_id': qlog.items[0].doc_id,
    })

qlogs = pd.DataFrame(qlogs)
user_query_counts = qlogs.groupby('user_id').size()
top_users = user_query_counts.nlargest(10000).index
qlogs = qlogs[qlogs['user_id'].isin(top_users)]
qlogs.head()

Jun 19, 2025 10:09:28 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false
Prepare qlogs: 100%|██████████| 36389567/36389567 [01:54<00:00, 318779.20it/s]


Unnamed: 0,query_id,query,timestamp,user_id,target_doc_id
261,9e9c455e851de2,charles drew,2006-03-01 17:35:12,2722,18b0cc72ef4d
262,9e9c455e851de2,charles drew,2006-03-01 18:00:07,2722,8452d1a7d388
263,9e9c455e851de2,charles drew,2006-03-01 18:00:07,2722,6f90153166ac
264,fb70fb6a6ee0eb,military rental benefits,2006-03-10 09:32:38,2722,84fe02c8d57f
265,60ba5a07105e5c,military car rental benefits,2006-03-10 09:33:37,2722,84fe02c8d57f


In [32]:
qlogs.groupby('query_id')['target_doc_id'].agg(set).to_dict()


{'000005ef762f34': {'6638d7f805a5', '790fc930c640', 'b065e903b5c2'},
 '00001127bccd77': {'1fbd8927a9f5'},
 '00001889153205': {'210a82317638', '54c33d804fb4'},
 '00001d00578d58': {'0983ccd66799',
  '5f34d4333925',
  'bb97b5282602',
  'f2e67c5b463b',
  'f79034658072',
  'ff4d831cc42f'},
 '00001f0d8dd96b': {'1287d5f90d41', '3586260a387c', '79192ab2c664'},
 '0000340b2108e3': {'64aca26ff4f3', '7c1bf19ece14', 'ac09ad947c12'},
 '0000368518144b': {'0c94b6eac90e',
  '5aa9210ac49e',
  'a88e535320e1',
  'c8c50bcae456'},
 '00003dfca0f2bf': {'6cf14dff6143'},
 '0000486f2e48f1': {'75747b93a604'},
 '00004a4996a65b': {'9d8d069bb67b', 'f1ae8399ab7e'},
 '000056393ae7f2': {'26b4db52dce4'},
 '000058bdeaa281': {'0287ce64cbf6', '994d3a594fd3'},
 '00005df2ce4dc9': {'1cd0fd9d2306'},
 '0000703e027e65': {'2ce497de82fe', '7a28da8caf35', 'efb1c9484d1c'},
 '00007073f98764': {'9d6f1ddf4205'},
 '0000b3e0e27362': {'0d7922055d30'},
 '0000db139bab3e': {'b1c457606846'},
 '0000dc41f7ce5c': {'41f4337edd0a',
  'd013103c32f5