In [24]:
import pandas as pd
import ast

# Load the query log used by every cell below. The 'time' column is parsed into
# datetimes, and low_memory=False has pandas read the file in one pass rather than
# in chunks.
# Alternative source kept for reference:
#   pd.read_csv('dataset/aol4foltr/metadata.csv', parse_dates=['timestamp'], dtype={'user_id': str})
df = pd.read_csv(
    'dataset/aol_dataset_top10000.csv',
    parse_dates=['time'],
    low_memory=False,
)

# Each 'candidate_doc_ids' cell is read from the CSV as text holding a Python literal;
# evaluate it so the column contains real Python objects instead of strings.
df['candidate_doc_ids'] = df['candidate_doc_ids'].map(ast.literal_eval)

# Factsheet

In [26]:
# Factsheet: headline counts for the loaded query log.
n_logs = len(df)
n_queries = df['query'].nunique()
n_clicked_docs = df['doc_id'].nunique()
n_users = df['user_id'].nunique()

# Distinct candidate documents: fold every row's candidate collection into one set.
candidate_docs = set()
for row_candidates in df['candidate_doc_ids']:
    candidate_docs.update(row_candidates)

print(f"Number of query logs: {n_logs:,}")
print(f"Number of unique queries: {n_queries:,}")
print(f"Number of unique clicked documents: {n_clicked_docs:,}")
print(f"Number of unique candidate documents: {len(candidate_docs):,}")
print(f"Number of unique users: {n_users:,}")


Number of query logs: 2,594,705
Number of unique queries: 637,996
Number of unique clicked documents: 428,157
Number of unique candidate documents: 1,294,562
Number of unique users: 10,000


# Top-10 Users

In [3]:
# Rank users by how many queries they issued and keep the ten busiest.
# The aggregation produces two columns under 'query':
#   'count'   - number of non-null query entries for the user
#   'nunique' - number of distinct query values for the user
count_col = ('query', 'count')
unique_col = ('query', 'nunique')

per_user_queries = df.groupby('user_id').agg({'query': ['count', 'nunique']})
user_stats = per_user_queries.sort_values(count_col, ascending=False).head(10)

# One summary line per user, busiest first.
for user_id, total_queries, unique_queries in zip(
    user_stats.index, user_stats[count_col], user_stats[unique_col]
):
    print(f"User {user_id}: {total_queries:,} queries ({unique_queries:,} unique)")

User 71845: 76,432 queries (33,497 unique)
User 2263543: 3,626 queries (428 unique)
User 137248: 2,985 queries (107 unique)
User 3318459: 2,781 queries (388 unique)
User 497336: 2,427 queries (97 unique)
User 42075: 2,338 queries (288 unique)
User 22661144: 2,123 queries (118 unique)
User 1306380: 2,105 queries (310 unique)
User 1901902: 2,097 queries (140 unique)
User 1308574: 2,032 queries (60 unique)


In [9]:
# Report how many queries the user at a given activity rank issued (rank 1 = most active).
# The rank lives in one constant so the lookup index and the printed message cannot drift apart.
TARGET_RANK = 100  # 1-based rank of the user to inspect

# Non-null query entries per user, busiest users first.
user_query_counts = df.groupby('user_id')['query'].count().sort_values(ascending=False)

# Guard the positional lookup: without it a dataset with fewer users than TARGET_RANK
# fails with an opaque IndexError from iloc.
if not 1 <= TARGET_RANK <= len(user_query_counts):
    raise ValueError(
        f"TARGET_RANK={TARGET_RANK} is out of range: dataset has {len(user_query_counts):,} users"
    )

user_100 = user_query_counts.iloc[TARGET_RANK - 1]  # iloc is 0-based, ranks are 1-based

# English ordinal suffix: 11-13 (and 111-113, ...) take 'th'; otherwise it follows the last digit.
if 11 <= TARGET_RANK % 100 <= 13:
    rank_suffix = 'th'
else:
    rank_suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(TARGET_RANK % 10, 'th')

print(f"The {TARGET_RANK}{rank_suffix} most active user made {user_100:,} queries")


Top users account for 26.5% of all queries
The 10000th user has 146 queries
