Connected to .venv (Python 3.13.7)

In [1]:
import pandas as pd
import numpy as np

In [2]:
filename = "pageviews-hourly-20251001-000000.gz"

df = pd.read_csv(
    filename,
    sep=" ",  # space-separated
    header=None,  # no header in file
    names=["project", "page_title", "views", "bytes"],  # column names
    dtype={"project": str, "page_title": str, "views": int, "bytes": int},
    compression="gzip",  # read directly from .gz
    engine="c",  # faster parsing
)
print(f"Total views: {df['views'].sum():,}")

Total views: 16,115,693


In [3]:
# --- Step 1: Keep only English articles --------------------------------------
df = df[df["project"] == "en"]
df.reset_index(drop=True)
df.insert(0, "row_number", range(1, len(df) + 1))
print(f"Filtered to {len(df)} English Wikipedia pages")
print(f"Total views: {df['views'].sum():,}")
print(df.head(10))

Filtered to 1184804 English Wikipedia pages
Total views: 3,092,228
        row_number project                                         page_title  \
520115           1      en                                                 !!   
520116           2      en                                                !!!   
520117           3      en                                           !DOCTYPE   
520118           4      en                                      !Xóõ_language   
520119           5      en                                              \&\""   
520120           6      en                                  \21_Azer\"_Medal"   
520121           7      en                                       \A\"_Device"   
520122           8      en                                 \A\"_Is_for_Alibi"   
520123           9      en  \Air\"_from_Johann_Sebastian_Bach's_Orchestral...   
520124          10      en                               \Awaken,_My_Love!\""   

        views  bytes  
520115      1     

In [4]:
# Save filtered English dataset
df.to_csv("pageviews_en.csv", index=False)
print("✅ Saved English-only dataset: pageviews_en.csv")

✅ Saved English-only dataset: pageviews_en.csv


In [6]:
# --- Step 2: Sample articles ---------------------------------
sample_fraction = 0.04  # adjust as needed (e.g., 0.05 = 5%)
df_sample = df.sample(frac=sample_fraction, random_state=42)
df_sample.sort_values(by="views", ascending=False, inplace=True)
print(df_sample)
print(f"Total keys in sample: {len(df_sample)}")
print(f"Total views in sample: {df_sample['views'].sum():,}")

         row_number project                             page_title  views  \
1069906      549792      en                            Jon_Stewart   2998   
1512331      992217      en                Talk:Digital_humanities    504   
1516988      996874      en                    Talk:Saint_Boniface    408   
1517356      997242      en          Talk:Somatic_symptom_disorder    372   
737296       217182      en          Category:Redirects_from_moves    336   
...             ...     ...                                    ...    ...   
1426209      906095      en                          Santanaraptor      1   
1553406     1033292      en                    The_Mystery_of_Time      1   
725328       205214      en  Category:Bayelsa_United_F.C._managers      1   
1425917      905803      en               Santa_Fe_Group_(geology)      1   
1231870      711756      en                            Millî_Görüş      1   

         bytes  
1069906      0  
1512331      0  
1516988      0  
1517356

In [7]:
df_sample.to_csv(f"pageviews_en_sample_{int(sample_fraction*100)}.csv", index=False)

In [2]:
# Start from here to generate operations but keep the 5% sample.
df_sample = pd.read_csv("pageviews_en_sample_5.csv")
print(df_sample.head())
print(f"Loaded {len(df_sample):,} rows.")

   row_number project                     page_title  views  bytes
0      549792      en                    Jon_Stewart   2998      0
1      992217      en        Talk:Digital_humanities    504      0
2      996874      en            Talk:Saint_Boniface    408      0
3      997242      en  Talk:Somatic_symptom_disorder    372      0
4      217182      en  Category:Redirects_from_moves    336      0
Loaded 59,240 rows.


In [3]:
# --- Step 4: Create one operation per view ------------------------------------
# Efficiently repeat rows by the 'views' column
ops = df_sample.loc[
    df_sample.index.repeat(df_sample["views"]), ["row_number", "page_title"]
].reset_index(drop=True)
print(f"Generated {len(ops):,} total operations (1 per view)")

Generated 146,068 total operations (1 per view)


In [21]:
# --- Step 5: Randomly turn 100% of lookups into updates --------------------------
n_ops = len(ops)
updates_fraction = 1.0  # 100% updates
n_updates = int(updates_fraction * n_ops)

rng = np.random.default_rng(seed=42)
update_indices = rng.choice(n_ops, size=n_updates, replace=False)

ops["op_type"] = "lookup"
ops.loc[update_indices, "op_type"] = "update"
print(f"Assigned {n_updates:,} updates ({100*n_updates/n_ops:.2f}%)")

Assigned 146,068 updates (100.00%)


In [22]:
# --- Step 6: Shuffle operations -----------------------------------------------
ops = ops.sample(frac=1, random_state=42).reset_index(drop=True)
ops

Unnamed: 0,row_number,page_title,op_type
0,61906,Adria_Arjona,update
1,532228,Jeep_Wagoneer_S,update
2,848682,Quinten_Timber,update
3,334830,Edward_Scissorhands,update
4,61275,Administrative_division,update
...,...,...,...
146063,1031454,The_Lucy–Desi_Comedy_Hour,update
146064,801167,Patrilineality,update
146065,886139,Roth_IRA,update
146066,69682,Al-Qassam_Brigades,update


In [10]:
# --- Step 7: Change first occurrence per page_title to insert -----------------
# first_indices = ops.drop_duplicates(subset="page_title", keep="first").index
# ops.loc[first_indices, "op_type"] = "insert"
# print(f"Changed {len(first_indices):,} first accesses to inserts")

In [23]:
ops.to_csv(
    f"operations_en_sample_5_{int(updates_fraction * 100)}.csv",
    index=False,
)
print(f"✅ Saved operations dataset: operations_en_sample_5_{int(updates_fraction * 100)}.csv")
print(f"Contains {len(ops):,} operations (1 per view), with {n_updates:,} updates ({100*n_updates/n_ops:.2f}%)")

✅ Saved operations dataset: operations_en_sample_5_100.csv
Contains 146,068 operations (1 per view), with 146,068 updates (100.00%)


In [12]:
print((df_sample['views'] == 1).sum())

40670


In [13]:
# Compute string lengths
lengths = df_sample["page_title"].str.len()

# Calculate stats
avg_length = lengths.mean()
max_length = lengths.max()
min_length = lengths.min()
print(f"Page title lengths - Avg: {avg_length:.2f}, Max: {max_length}, Min: {min_length}")

Page title lengths - Avg: 20.54, Max: 236, Min: 1


In [39]:
# Load operations from file if needed 
ops = pd.read_csv("operations_en_sample_5_100.csv")

In [40]:
updated_titles = ops.loc[ops["op_type"] == "update", "page_title"].unique()
print(f"Unique titles updated: {len(updated_titles)}")

Unique titles updated: 59240


In [25]:
update_counts = ops[ops["op_type"] == "update"]["page_title"].value_counts()
print(update_counts)

page_title
Jon_Stewart                                                 2998
Talk:Digital_humanities                                      504
Talk:Saint_Boniface                                          408
Talk:Somatic_symptom_disorder                                372
Category:Redirects_from_moves                                336
                                                            ... 
Clionella_halistrepta                                          1
2011-12_West_Virginia_Mountaineers_men's_basketball_team       1
BlackBerry_Electron                                            1
Georgia_Central_Railway                                        1
Comte_de_Rochefort                                             1
Name: count, Length: 59240, dtype: int64


In [26]:
num_one_update = (update_counts == 1).sum()
print(f"Number of page_titles with exactly one update: {num_one_update}")

Number of page_titles with exactly one update: 40670
