# Wikipedia

Sample code for loading the Wikipedia dataset. Note: the full dataset has about 6.4M entries.

In [None]:
from datasets import load_dataset
from itertools import islice
import random
import sys

### Option 1 - Loading first K rows

In [None]:
# Number of leading rows to take from the stream.
k = 1000
# streaming=True is passed so the dataset is iterated rather than loaded in one batch.
streamed_wikipedia = load_dataset(
    "wikimedia/wikipedia",
    "20231101.en",
    split="train",
    streaming=True,
)
# islice stops after k rows, so only those rows are materialized into the list.
wiki_sample = list(islice(streamed_wikipedia, k))

If you want to check approximate memory usage:

In [None]:
# Sum the in-memory size (bytes) of each page's text string, then convert to GB.
# Only the 'text' field is measured; other fields of each page are not counted.
size_gb = sum(sys.getsizeof(page["text"]) for page in wiki_sample) / 1e9
print(f"The size of the text only is: {size_gb} GBs")

### Option 2 - Loading random subsample

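I learned this trick from ChatGPT, and quite like it! It is called "reservoir sampling". It takes longer than simply loading the first K rows, because it has to read further into the stream, but it does not exhaust your memory: only the k sampled examples are kept at any time.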

In [None]:
# Reservoir sampling algorithm: samples from a stream (not a batch) while ensuring that, at the end, every data point seen has the same probability of being in the sample.
def reservoir_sample(dataset_stream, k, nmax, seed=None):
    """Draw a uniform random sample of at most ``k`` items from a stream.

    Implements reservoir sampling: the stream is read once, at most ``k``
    examples are held in the reservoir at any time, and every examined
    example has the same probability of ending up in the result.

    Args:
        dataset_stream: Iterable of examples; it is consumed item by item.
        k: Size of the sample to keep.
        nmax: Maximum number of examples from the stream to consider.
            If the stream holds more than ``nmax`` examples, a notice is
            printed and reading stops.
        seed: Optional seed for reproducible sampling. When given, a private
            generator is used, so the global ``random`` state is left
            untouched while the draws stay identical to what
            ``random.seed(seed)`` would have produced. When ``None``, the
            module-level ``random`` generator is used.

    Returns:
        A list of sampled examples. If fewer than ``k`` examples were
        examined, it contains all of them in stream order.
    """
    # A dedicated generator avoids reseeding the process-wide RNG as a side
    # effect, which would silently change every later random call.
    randint = random.randint if seed is None else random.Random(seed).randint

    reservoir = []
    for i, example in enumerate(dataset_stream):
        if i >= nmax:
            print(f"You've reached the maximum of {nmax}!")
            break
        if i < k:
            # Fill phase: the first k examples are kept unconditionally.
            reservoir.append(example)
            continue
        # Replacement phase: example i is kept with probability k / (i + 1),
        # evicting a uniformly chosen slot of the reservoir.
        j = randint(0, i)
        if j < k:
            reservoir[j] = example
    return reservoir

In [None]:
# Stream the dataset instead of loading it in one batch, then reservoir-sample from the stream.
# k is the size of the final sample; nmax is how far into the dataset the sampler reads. To cover the whole dataset
# you can set nmax to around 6.4M, but I don't know how long that will take. Colab in particular may be significantly slow.

# streaming=True is passed so the dataset is iterated rather than loaded in one batch.
streamed_wikipedia = load_dataset(
    "wikimedia/wikipedia",
    "20231101.en",
    split="train",
    streaming=True,
)

k = 2_500  # size of the final sample
nmax = 10_000  # max=10K works well
seed = 42  # fixed seed so the same subsample is drawn on every run

wiki_sample = reservoir_sample(
    streamed_wikipedia,
    k=k,
    nmax=nmax,
    seed=seed,
)

If you want to check approximate memory usage:

In [None]:
# Sum the in-memory size (bytes) of each sampled page's text string, then convert to GB.
# Only the 'text' field is measured; other fields of each page are not counted.
size_gb = sum(sys.getsizeof(page["text"]) for page in wiki_sample) / 1e9
print(f"The size of the text only is: {size_gb} GBs")