# Exploring the arXiv Dataset

This notebook is used to generate the **pre-training** dataset.

In [1]:
import sys; sys.path.append('..')
import json
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split

from classifier.paths import data_folder

## What does the data look like?

In [None]:
blob = data_folder / "arxiv-metadata-oai-snapshot.jsonl"

# Peek at the first record only instead of iterating over the file.
with open(blob, 'r') as f:
  first_line = next(f, None)

# An empty file leaves `example1` as None.
example1 = json.loads(first_line) if first_line is not None else None

In [None]:
example1

## Pre-Training: Train/Test Split

- Total of `2,426,574` examples in the dataset
- Use 10% for testing
- Use 10% of the remaining (non-test) examples for validation, i.e. roughly 9% of the full dataset

In [None]:
blob = data_folder / "arxiv-metadata-oai-snapshot.jsonl"

# Stream the file so that only one line is held in memory at a time.
with open(blob, 'r') as f:
  num_lines = sum(1 for _ in tqdm(f))

print(f"There are {num_lines} examples in the full arXiv dataset")

In [None]:
print("Splitting the dataset")

TEST_SET_FRACTION = 0.1
VAL_SET_FRACTION = 0.1

# Split line indices (not records) so that each line can be routed to its split while streaming.
all_idx = torch.arange(0, num_lines)
train_idx, test_idx = train_test_split(all_idx, test_size=TEST_SET_FRACTION, random_state=42)
# Carve the validation set out of the remaining (non-test) indices.
train_idx, val_idx = train_test_split(train_idx, test_size=VAL_SET_FRACTION, random_state=42)

# Report sizes only after both splits, so "Training size" excludes the validation examples.
print("Training size:", len(train_idx))
print("Validation size:", len(val_idx))
print("Testing size:", len(test_idx))

# Sets give constant-time membership checks inside the per-line loop below.
train_idx, val_idx, test_idx = set(train_idx.tolist()), set(val_idx.tolist()), set(test_idx.tolist())

# Make sure the output folder exists before opening files inside it.
# NOTE(review): assumes data_folder is a pathlib.Path (it is combined with `/` throughout) - confirm.
pretraining_folder = data_folder / "pretraining"
pretraining_folder.mkdir(parents=True, exist_ok=True)

# Write every record to its split file as soon as it is read, instead of buffering the
# whole dataset in memory first. Records keep their original file order within each split.
print("Iterating through the dataset (slow)")
with open(blob, 'r') as src, \
     open(pretraining_folder / "train.jsonl", 'w') as train_f, \
     open(pretraining_folder / "val.jsonl", 'w') as val_f, \
     open(pretraining_folder / "test.jsonl", 'w') as test_f:
  for i, line in tqdm(enumerate(src), total=num_lines):
    if i in train_idx:
      out = train_f
    elif i in val_idx:
      out = val_f
    elif i in test_idx:
      out = test_f
    else:
      # Index not assigned to any split (e.g. the file grew after it was counted).
      continue
    # Round-trip through json so the output is normalised to exactly one record per line.
    out.write(json.dumps(json.loads(line)) + "\n")

print("DONE")

## Test Custom Dataset Iterator

Note that I did not ultimately use the iterator dataset.

In [None]:
import sys; sys.path.append("..")
from classifier.arxiv_dataset import load_dataset_splits
from classifier.paths import data_folder

In [None]:
# Load the three fine-tuning splits from their JSONL files.
finetuning_folder = data_folder / "finetuning"
dd = load_dataset_splits(
  finetuning_folder / "train.jsonl",
  finetuning_folder / "val.jsonl",
  finetuning_folder / "test.jsonl",
)

In [None]:
# Report the share of 'True' vs. other labels in the train and val splits.
for split in ["train", "val"]:
  print(f"\nChecking dataset split '{split}'")
  num_true = 0
  num_false = 0
  for data in dd[split]:
    # NOTE(review): labels are compared against the string 'True'; confirm the dataset
    # stores labels as strings rather than booleans.
    if data['label'] == 'True':
      num_true += 1
    else:
      num_false += 1

  num_examples = num_true + num_false
  if num_examples == 0:
    # Avoid a ZeroDivisionError when a split yields no examples.
    print("Split is empty; no label percentages to report")
    continue

  print(f"Percent True: {100 * num_true / num_examples:.2f}")
  print(f"Percent False: {100 * num_false / num_examples:.2f}")

## Training a Tokenizer for arXiv

In [2]:
import sys; sys.path.append('..')

from datasets import load_dataset
from classifier.paths import data_folder, models_folder

from tqdm import tqdm
from transformers import DistilBertTokenizerFast

In [3]:
# Load the pre-training train split and keep only the columns needed to build the text.
pretraining_train_file = data_folder / "pretraining" / "train.jsonl"
corpus = load_dataset("json", data_files=str(pretraining_train_file))
corpus = corpus.select_columns(["title", "abstract"])

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
def join_title_and_abstract(example):
  """Build the "text" field by joining the title and the abstract with ". "."""
  return {"text": ". ".join((example["title"], example["abstract"]))}

# Same construction as the finetuning dataset. `map` can add the new field and drop the
# old ones in a single step: https://huggingface.co/docs/datasets/en/process#map
corpus = corpus.map(join_title_and_abstract, remove_columns=["title", "abstract"])

Map:   0%|          | 0/1965524 [00:00<?, ? examples/s]

In [5]:
corpus

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1965524
    })
})

In [None]:
# Name of the directory (under models_folder / "tokenizers") that the trained tokenizer is saved to.
# The "-32k" suffix matches the 32,000-token vocabulary and the id that the pre-processing
# section of this notebook loads the tokenizer from.
tokenizer_id="distilbert-base-uncased-arxiv-32k"
# Checkpoint whose tokenizer is used as the starting point for training.
model_id="distilbert/distilbert-base-uncased"

def batch_iterator(batch_size: int = 10000):
    """Yield the "text" column of the training corpus, ``batch_size`` rows at a time.

    Reads the module-level ``corpus`` and reports progress through ``tqdm``.
    """
    train_split = corpus["train"]
    batch_starts = range(0, len(train_split), batch_size)
    for start in tqdm(batch_starts):
        stop = start + batch_size
        yield train_split[start:stop]["text"]

# Start from the existing pretrained tokenizer so that its special tokens are re-used.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    model_id,
    use_fast=True,
)

In [None]:
# Train a new tokenizer on the corpus batches, then write it to the tokenizers folder.
tokenizer_output_dir = models_folder / "tokenizers" / tokenizer_id
bert_tokenizer = tokenizer.train_new_from_iterator(
    text_iterator=batch_iterator(),
    vocab_size=32_000,
)
bert_tokenizer.save_pretrained(tokenizer_output_dir)

## Pre-Processing the Dataset

In [1]:
from transformers import AutoTokenizer
import multiprocessing

import sys; sys.path.append('..')

from datasets import load_dataset
from classifier.paths import data_folder, models_folder

In [2]:
tokenizer_id = "distilbert-base-uncased-arxiv-32k"

# Number of worker processes used for the dataset `map` call further down.
num_proc = multiprocessing.cpu_count()

tokenizer_dir = models_folder / "tokenizers" / tokenizer_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

# TODO(milo): Not sure why this isn't set during tokenizer training.
tokenizer.model_max_length = 512

print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")
# Quick sanity check of the encoding; displayed as the cell output.
tokenizer.encode_plus("This is a test")

The max length for the tokenizer is: 512


{'input_ids': [2, 202, 156, 43, 1244, 3], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [10]:
# Encode a text that is longer than the max length and ask for the overflow to be returned.
long_text = "This is a test with some more words" * 100
out = tokenizer(long_text, truncation=True, return_overflowing_tokens=True)

# Number of input_ids sequences that came back.
len(out["input_ids"])

2

In [None]:
def group_texts(examples):
  """Tokenize the "text" field of a batch, truncated to the tokenizer's max length.

  The special-tokens mask is requested alongside the usual tokenizer outputs.
  """
  return tokenizer(
    examples["text"],
    truncation=True,
    max_length=tokenizer.model_max_length,
    return_special_tokens_mask=True,
  )

# Run the tokenizer over the corpus in batches and drop the raw text column afterwards.
tokenized_dataset = corpus.map(
  group_texts,
  batched=True,
  num_proc=num_proc,
  remove_columns=["text"],
)

In [None]:
tokenized_dataset

## How large does the context window need to be?

Based on the figure below, we only need a context window of size ~512 to fit most of the training examples.

In [9]:
# Reload the trained tokenizer; requires `tokenizer_id` to be defined by an earlier cell.
tokenizer_dir = models_folder / "tokenizers" / tokenizer_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")
tokenizer.encode_plus("This is a test")

NameError: name 'tokenizer_id' is not defined

In [11]:
n = 10000
examples = corpus["train"][:n]["text"]

# Token count of each sampled example; no truncation is applied, so the counts can
# exceed the tokenizer's max length.
context_lengths = []
for text in examples:
  context_lengths.append(len(tokenizer.encode(text)))

Token indices sequence length is longer than the specified maximum sequence length for this model (577 > 512). Running this sequence through the model will result in indexing errors


In [16]:
import plotly.express as px

# Histogram of the token counts computed for the first `n` training examples.
fig = px.histogram(
  x=context_lengths,
  title=f"Tokens Per Training Example (n={n})",
  labels={"x": "token length"},
)
fig.show()

## How many total tokens in the training set?

In [24]:
import torch
from datasets import load_from_disk
from tqdm import tqdm

In [12]:
# Load the previously tokenized pre-training dataset from disk.
tokenized_path = data_folder / "pretraining" / "tokenized" / "bert-tiny-arxiv-32k-512"
tokenized = load_from_disk(tokenized_path)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1973364
    })
    val: Dataset({
        features: ['input_ids'],
        num_rows: 219290
    })
})

In [25]:
# Add up the number of non-zero token ids over every example in the training split.
total = 0
for example in tqdm(tokenized["train"]):
  ids = torch.Tensor(example["input_ids"])
  total = total + torch.count_nonzero(ids)
print(f"The dataset has a total of {total} unmasked tokens")

100%|██████████| 1973364/1973364 [05:13<00:00, 6293.23it/s]

The dataset has a total of 391470421 unmasked tokens



