In [1]:
%%capture
!pip install transformers
!pip install tokenizers
!pip install datasets

In [2]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer , AutoModelForCausalLM
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'gpt2'

In [3]:
from datasets import load_dataset
dataset = load_dataset("wikitext" ,'wikitext-103-raw-v1' )

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Downloading and preparing dataset wikitext/wikitext-103-raw-v1 to /root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/192M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [5]:
def get_training_corpus():
  raw_dataset = dataset['train']
  for start_idx  in range(0 , len(raw_dataset) , 1000):
    samples = raw_dataset[start_idx : start_idx +1000]
    yield samples['text']

training_corpus = get_training_corpus()

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
wikitext_tokenizer = tokenizer.train_new_from_iterator(training_corpus ,52000 )

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [9]:
wikitext_tokenizer.push_to_hub('wikitext-tokenizer')

CommitInfo(commit_url='https://huggingface.co/mahmoudNG/wikitext-tokenizer/commit/ce9a1ff77ab5c16f33d0700a9821512ca827dfb5', commit_message='Upload tokenizer', commit_description='', oid='ce9a1ff77ab5c16f33d0700a9821512ca827dfb5', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
from transformers import AutoTokenizer

context_length = 128 
tokenizer = AutoTokenizer.from_pretrained("mahmoudNG/wikitext-tokenizer")

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = dataset.map(
    tokenize, batched=True, remove_columns=dataset['train'].column_names
)
tokenized_datasets

Downloading (…)okenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/856k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/503k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1802 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids'],
        num_rows: 1043
    })
    train: Dataset({
        features: ['input_ids'],
        num_rows: 455105
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 927
    })
})

In [11]:
from transformers import AutoTokenizer , GPT2LMHeadModel , AutoConfig 

config = AutoConfig.from_pretrained(
    'gpt2' , 
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,

)

In [12]:
model = GPT2LMHeadModel(config)

In [13]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer , mlm =False)

In [14]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="wikitext-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

Cloning https://huggingface.co/mahmoudNG/wikitext-ds into local empty directory.
Using cuda_amp half precision backend


In [15]:
trainer.train()

***** Running training *****
  Num examples = 455105
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 8
  Total optimization steps = 1777
  Number of trainable parameters = 125778432
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1777, training_loss=5.300361388576252, metrics={'train_runtime': 5607.6164, 'train_samples_per_second': 81.158, 'train_steps_per_second': 0.317, 'total_flos': 2.9716220215296e+16, 'train_loss': 5.300361388576252, 'epoch': 1.0})

In [16]:
trainer.push_to_hub()

Saving model checkpoint to wikitext-ds
Configuration saved in wikitext-ds/config.json
Configuration saved in wikitext-ds/generation_config.json
Model weights saved in wikitext-ds/pytorch_model.bin
tokenizer config file saved in wikitext-ds/tokenizer_config.json
Special tokens file saved in wikitext-ds/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 32.0k/492M [00:00<?, ?B/s]

Upload file training_args.bin: 100%|##########| 3.37k/3.37k [00:00<?, ?B/s]

Upload file runs/Feb22_10-19-04_56cf9a685559/events.out.tfevents.1677061158.56cf9a685559.1128.0: 100%|########…

Upload file runs/Feb22_10-19-04_56cf9a685559/1677061158.1891751/events.out.tfevents.1677061158.56cf9a685559.11…

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/mahmoudNG/wikitext-ds
   0784b0c..14fa55d  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/mahmoudNG/wikitext-ds
   0784b0c..14fa55d  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'dataset': {'name': 'wikitext', 'type': 'wikitext', 'config': 'wikitext-103-raw-v1', 'split': 'validation', 'args': 'wikitext-103-raw-v1'}}
To https://huggingface.co/mahmoudNG/wikitext-ds
   14fa55d..daffc22  main -> main

   14fa55d..daffc22  main -> main



'https://huggingface.co/mahmoudNG/wikitext-ds/commit/14fa55dc66c51d5346c5a2f0993f8fb93f01aeca'