Connected to .venv (Python 3.13.7)

In [1]:
import pandas as pd
import numpy as np

In [2]:
filename = "pageviews-hourly-20251001-000000.gz"

df = pd.read_csv(
    filename,
    sep=" ",  # space-separated
    header=None,  # no header in file
    names=["project", "page_title", "views", "bytes"],  # column names
    dtype={"project": str, "page_title": str, "views": int, "bytes": int},
    compression="gzip",  # read directly from .gz
    engine="c",  # faster parsing
)
print(f"Total views: {df['views'].sum():,}")

Total views: 16,115,693


In [3]:
# --- Step 1: Keep only English articles --------------------------------------
df = df[df["project"] == "en"]
df.reset_index(drop=True)
df.insert(0, "row_number", range(1, len(df) + 1))
print(f"Filtered to {len(df)} English Wikipedia pages")
print(f"Total views: {df['views'].sum():,}")
print(df.head(10))

Filtered to 1184804 English Wikipedia pages
Total views: 3,092,228
        row_number project                                         page_title  \
520115           1      en                                                 !!   
520116           2      en                                                !!!   
520117           3      en                                           !DOCTYPE   
520118           4      en                                      !Xóõ_language   
520119           5      en                                              \&\""   
520120           6      en                                  \21_Azer\"_Medal"   
520121           7      en                                       \A\"_Device"   
520122           8      en                                 \A\"_Is_for_Alibi"   
520123           9      en  \Air\"_from_Johann_Sebastian_Bach's_Orchestral...   
520124          10      en                               \Awaken,_My_Love!\""   

        views  bytes  
520115      1     

In [4]:
# Save filtered English dataset
df.to_csv("pageviews_en.csv", index=False)
print("✅ Saved English-only dataset: pageviews_en.csv")

✅ Saved English-only dataset: pageviews_en.csv


In [5]:
# --- Step 2: Sample articles ---------------------------------
sample_fraction = 0.05  # adjust as needed (e.g., 0.05 = 5%)
df_sample = df.sample(frac=sample_fraction, random_state=42)
df_sample.sort_values(by="views", ascending=False, inplace=True)
print(df_sample)
print(f"Total keys in sample: {len(df_sample)}")
print(f"Total views in sample: {df_sample['views'].sum():,}")

         row_number project                     page_title  views  bytes
1069906      549792      en                    Jon_Stewart   2998      0
1512331      992217      en        Talk:Digital_humanities    504      0
1516988      996874      en            Talk:Saint_Boniface    408      0
1517356      997242      en  Talk:Somatic_symptom_disorder    372      0
737296       217182      en  Category:Redirects_from_moves    336      0
...             ...     ...                            ...    ...    ...
651360       131246      en                 Bare,_Kraljevo      1      0
749853       229739      en              Channa_(Buddhist)      1      0
1424654      904540      en                  Sandamarutham      1      0
719227       199113      en  Cassius_Clay_vs._Henry_Cooper      1      0
527447         7333      en                    1947_Tonies      1      0

[59240 rows x 5 columns]
Total keys in sample: 59240
Total views in sample: 146,068


In [6]:
df_sample.to_csv("pageviews_en_sample_5.csv", index=False)

In [7]:
# --- Step 4: Create one operation per view ------------------------------------
# Efficiently repeat rows by the 'views' column
ops = df_sample.loc[
    df_sample.index.repeat(df_sample["views"]), ["row_number", "page_title"]
].reset_index(drop=True)
print(f"Generated {len(ops):,} total operations (1 per view)")

Generated 146,068 total operations (1 per view)


In [8]:
# --- Step 5: Randomly turn 5% of lookups into updates --------------------------
n_ops = len(ops)
n_updates = int(0.05 * n_ops)

rng = np.random.default_rng(seed=42)
update_indices = rng.choice(n_ops, size=n_updates, replace=False)

ops["op_type"] = "lookup"
ops.loc[update_indices, "op_type"] = "update"
print(f"Assigned {n_updates:,} updates ({100*n_updates/n_ops:.2f}%)")

Assigned 7,303 updates (5.00%)


In [9]:
# --- Step 6: Shuffle operations -----------------------------------------------
ops = ops.sample(frac=1, random_state=42).reset_index(drop=True)
ops

Unnamed: 0,row_number,page_title,op_type
0,593820,Lafferty,lookup
1,41148,28_Years_Later,lookup
2,410947,Future_Imagery_Architecture,lookup
3,488054,Hugh_the_Great,lookup
4,443225,Greek_fire,lookup
...,...,...,...
146063,496559,I_Want_You_(Janet_Jackson_song),lookup
146064,1106490,Upazila_of_Bangladesh,lookup
146065,873146,Right_Round,lookup
146066,290674,David_Eberhardt,lookup


In [10]:
# --- Step 7: Change first occurrence per page_title to insert -----------------
# first_indices = ops.drop_duplicates(subset="page_title", keep="first").index
# ops.loc[first_indices, "op_type"] = "insert"
# print(f"Changed {len(first_indices):,} first accesses to inserts")

In [11]:
ops.to_csv("operations_en_sample_5.csv", index=False)
print("✅ Saved operations dataset: operations_en_sample_5.csv")
print(f"Contains {len(ops):,} operations (1 per view), with {n_updates:,} updates ({100*n_updates/n_ops:.2f}%)")

✅ Saved operations dataset: operations_en_sample_5.csv
Contains 146,068 operations (1 per view), with 7,303 updates (5.00%)


In [12]:
print((df_sample['views'] == 1).sum())

40670


In [13]:
# Compute string lengths
lengths = df_sample["page_title"].str.len()

# Calculate stats
avg_length = lengths.mean()
max_length = lengths.max()
min_length = lengths.min()
print(f"Page title lengths - Avg: {avg_length:.2f}, Max: {max_length}, Min: {min_length}")

Page title lengths - Avg: 20.54, Max: 236, Min: 1


In [16]:
updated_titles = ops.loc[ops["op_type"] == "update", "page_title"].unique()
print(f"Unique titles updated: {len(updated_titles)}")
print(updated_titles)

Unique titles updated: 5724
['Graham_Greene_(actor)' 'Dave_Reichert' 'Logan_(film_character)' ...
 'Cardinals_created_by_Benedict_XVI' 'Cross_of_Gold_speech'
 'Adam_James_(actor)']


In [17]:
update_counts = ops[ops["op_type"] == "update"]["page_title"].value_counts()
print(update_counts)

page_title
Jon_Stewart                       132
Talk:Digital_humanities            27
Talk:Saint_Boniface                24
Category:Redirects_from_moves      20
New_Relic                          16
                                 ... 
Einstürzende_Neubauten              1
French_canadian_keyboard            1
Jorge_Eduardo_Costilla_Sánchez      1
Stictonetta                         1
Adam_James_(actor)                  1
Name: count, Length: 5724, dtype: int64


In [18]:
num_one_update = (update_counts == 1).sum()
print(f"Number of page_titles with exactly one update: {num_one_update}")

Number of page_titles with exactly one update: 4950
