In [1]:
import os 
os.chdir('..')

%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import math
from collections import defaultdict
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("../datasets/babylm_10M/").glob("*.train")]
paths

['../datasets/babylm_10M/bnc_spoken.train',
 '../datasets/babylm_10M/children_stories.train',
 '../datasets/babylm_10M/cbt.train',
 '../datasets/babylm_10M/switchboard.train',
 '../datasets/babylm_10M/wikipedia.train',
 '../datasets/babylm_10M/gutenberg.train',
 '../datasets/babylm_10M/aochildes.train',
 '../datasets/babylm_10M/qed.train',
 '../datasets/babylm_10M/simple_wikipedia.train',
 '../datasets/babylm_10M/open_subtitles.train']

In [3]:
from transformers import GPT2Tokenizer

In [4]:
%%time 

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])




CPU times: user 36.1 s, sys: 2.62 s, total: 38.8 s
Wall time: 3.38 s


In [5]:
# !mkdir gpt2_baseline
tokenizer.save_model("./gpt2_baseline")

['./gpt2_baseline/vocab.json', './gpt2_baseline/merges.txt']

In [6]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_baseline/")

In [7]:
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})


0

In [8]:
tokenizer("Hello world")

{'input_ids': [2230, 1019], 'attention_mask': [1, 1]}

In [9]:
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)


In [10]:
model = GPT2LMHeadModel(config)

In [11]:
model.num_parameters()

125778432

In [12]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../datasets/babylm_10M_merged.train",
    block_size=128,
)



CPU times: user 58.8 s, sys: 1.26 s, total: 1min
Wall time: 58.9 s


In [13]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [14]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2_baseline",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    num_train_epochs=10,
    save_steps=10000,
    save_total_limit=10,
    seed=12,
#     evaluate_during_training=True,
    prediction_loss_only=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [15]:
%%time
trainer.train()

***** Running training *****
  Num examples = 1015503
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 158680
  Number of trainable parameters = 125778432


Step,Training Loss
500,7.3884
1000,6.7925
1500,6.6184
2000,6.5239


KeyboardInterrupt: 

In [None]:
trainer.save_model("./gpt2_baseline")