# Exploring the arXiv Dataset

This notebook is used to generate a **pre-training train/test/validation split.**

The format expected by HuggingFace looks like:
```python
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})
```

In [None]:
import sys; sys.path.append('..')
import json
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split

from classifier.paths import data_folder

## What does the data look like?

In [None]:
blob = data_folder / "arxiv-metadata-oai-snapshot.jsonl"

# Only the first record is needed here, so pull a single line from the file
# instead of iterating. `example1` stays None when the file is empty.
with open(blob, 'r') as f:
  first_line = next(f, None)

example1 = json.loads(first_line) if first_line is not None else None

In [None]:
# Display the first parsed record so its fields can be inspected.
example1

## Pre-Training: Train/Test Split

- Total of `2,426,574` examples in the dataset
- Use 10% for testing
- Use 10% of the remaining (non-test) examples for validation (`VAL_SET_FRACTION = 0.1`)

In [None]:
blob = data_folder / "arxiv-metadata-oai-snapshot.jsonl"

# Stream the file line by line (never holding it all in memory) and count
# the lines; tqdm shows progress while the file is scanned.
with open(blob, 'r') as f:
  num_lines = sum(1 for _ in tqdm(f))

print(f"There are {num_lines} examples in the full arXiv dataset")

In [None]:
print("Splitting the dataset")

TEST_SET_FRACTION = 0.1
# Applied to what is left after the test split, not to the full dataset.
VAL_SET_FRACTION = 0.1

# First carve out the test indices, then split the remainder into train/val.
# The fixed random_state keeps the split reproducible across runs.
train_idx, test_idx = train_test_split(torch.arange(0, num_lines), test_size=TEST_SET_FRACTION, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=VAL_SET_FRACTION, random_state=42)

# Report sizes only after both splits, so "Training size" no longer
# includes the examples that end up in the validation set.
print("Training size:", len(train_idx))
print("Validation size:", len(val_idx))
print("Testing size:", len(test_idx))

# Sets give O(1) membership checks while streaming over the file below.
train_idx, val_idx, test_idx = set(train_idx.tolist()), set(val_idx.tolist()), set(test_idx.tolist())

# Make sure the output directory exists before opening files for writing.
# NOTE(review): assumes `data_folder` is a pathlib.Path - confirm.
out_dir = data_folder / "pretraining"
out_dir.mkdir(parents=True, exist_ok=True)

# Number of records written per split, reported at the end.
written = {"train": 0, "val": 0, "test": 0}

print("Iterating through the dataset (slow)")
# Each record is written to its split file as soon as it is read, so the
# parsed dataset is never held in memory all at once.
with open(blob, 'r') as f, \
     open(out_dir / "train.jsonl", 'w') as train_f, \
     open(out_dir / "val.jsonl", 'w') as val_f, \
     open(out_dir / "test.jsonl", 'w') as test_f:
  writers = {"train": train_f, "val": val_f, "test": test_f}
  for i, line in tqdm(enumerate(f), total=num_lines):
    if i in train_idx:
      split = "train"
    elif i in val_idx:
      split = "val"
    elif i in test_idx:
      split = "test"
    else:
      # Line index outside the range that was split (e.g. a stale num_lines).
      continue
    # Round-trip through json so the output is normalised the same way as
    # json.dumps() of the parsed record.
    writers[split].write(json.dumps(json.loads(line)) + "\n")
    written[split] += 1

for split in ["train", "val", "test"]:
  print(f"Wrote {written[split]} examples to dataset split '{split}'")

print("DONE")

## Iterable Dataset

https://huggingface.co/docs/datasets/en/create_iterable_dataset

In [None]:
from datasets import IterableDataset

data_path = data_folder / "arxiv-metadata-oai-snapshot.jsonl"

def gen():
  """Lazily yield one parsed JSON record per line of `data_path`."""
  with open(data_path, 'r') as f:
    yield from map(json.loads, f)


ds = IterableDataset.from_generator(gen)

# Print only the first record produced by the generator-backed dataset.
for example in ds:
  print(example)
  break

## Test Custom Dataset Iterator

In [1]:
import sys; sys.path.append("..")
from classifier.arxiv_dataset import load_dataset_splits
from classifier.paths import data_folder

from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Load the fine-tuning splits, passed positionally in the order
# train, val, test.
dd = load_dataset_splits(
  *(data_folder / "finetuning" / f"{name}.jsonl" for name in ("train", "val", "test"))
)

In [6]:
# Alternatively, the fine-tuning splits can be loaded with the stock
# HuggingFace JSON loader instead of `load_dataset_splits`:
from datasets import load_dataset
# dd = load_dataset("json", data_files={
#   "train": str(data_folder / "finetuning" / "train.jsonl"),
#   "val": str(data_folder / "finetuning" / "val.jsonl"),
#   # "test": str(data_folder / "finetuning" / "test.jsonl")
# })

# The pre-training splits keep only `title` and `abstract`, so they have no
# `label` column. They are bound to their own name rather than `dd`, so the
# fine-tuning dataset in `dd` is not overwritten for cells that read
# `data['label']`.
pretraining_dd = load_dataset("json", data_files={
  "train": str(data_folder / "pretraining" / "train.jsonl"),
  "val": str(data_folder / "pretraining" / "val.jsonl"),
  "test": str(data_folder / "pretraining" / "test.jsonl")
}).select_columns(["title", "abstract"])

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

In [None]:
# Report the class balance of the `label` field for each split in `dd`.
for split in ["train", "val"]:
  print(f"\nChecking dataset split '{split}'")
  num_true = 0
  num_false = 0
  # Counted by iteration rather than len(), which also works for datasets
  # that can only be iterated.
  for data in dd[split]:
    # Labels are compared as the string 'True'; anything else counts as False.
    if data['label'] == 'True':
      num_true += 1
    else:
      num_false += 1

  total = num_true + num_false
  if total == 0:
    # Avoid a ZeroDivisionError when a split has no examples.
    print("Split is empty; no label statistics to report")
    continue

  print(f"Percent True: {100 * num_true / total:.2f}")
  print(f"Percent False: {100 * num_false / total:.2f}")

In [None]:
# Run configuration.
model_name = 'albert-base-v2'
run_name = "debugging"
num_train_epochs = 10

# Label vocabulary: the reverse mapping and the label count are derived from
# `id2label` so the three stay in agreement.
id2label = {0: "False", 1: "True"}
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fine-tuning splits, passed positionally in the order train, val, test.
finetuning_dir = data_folder / "finetuning"
dataset = load_dataset_splits(
  finetuning_dir / "train.jsonl",
  finetuning_dir / "val.jsonl",
  finetuning_dir / "test.jsonl",
)

def convert_labels(examples: dict[str, list[int | str]]):
  """Convert the `label` field to a numeric value (it's "True" or "False" in the raw data)."""
  return {"label": [{"True": 1, "False": 0}[label] for label in examples["label"]]}

def tokenize(examples: dict[str, list[int | str]]):
  """Run the module-level `tokenizer` over the `text` column of a batch.

  Sequences are truncated and padded with `padding="max_length"`.
  """
  texts = examples["text"]
  return tokenizer(texts, truncation=True, padding="max_length")

# Convert the labels first, then tokenize; both run in batched mode.
dataset = (
  dataset
  .map(convert_labels, batched=True)
  .map(tokenize, batched=True)
)

In [None]:
# Sanity check: print the leading examples of the processed training split.
# The loop stops once index 101 has been printed (102 examples in total).
for i, example in enumerate(dataset['train']):
  print(example)
  if i >= 101:
    break