In [1]:
# https://jalammar.github.io/illustrated-gpt2/#part-3-beyond-language-modeling

In [333]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments

checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

In [260]:
def init_vocab_stat(tokenizer, voc_size_sample=20):
    initial_vocab = list(tokenizer.vocab.keys())
    
    print(f"Vocabulary length:\033[1m{len(initial_vocab)}\033[0m\n")
    print(f"Special tokens: {tokenizer.all_special_tokens}\n")
    print(f"Loaded vocabulary tokens, samples:\n{initial_vocab[:voc_size_sample]}")
    

def viz_sentence_tokens(raw_inputs, examples=-1):
    inputs = tokenizer(raw_inputs, truncation=True, return_overflowing_tokens=True,return_length=True)#, return_tensors="pt")
    examples = len(raw_inputs) if examples == -1 else examples
    for i, raw, processed in zip(range(examples), raw_inputs, inputs["input_ids"]):
        print("\n")
        print(f"Example:{i}")
        print("-----------------------------------------------------------------------------")
        print(raw)
        print("-----------------------------------------------------------------------------")
        print(tokenizer.tokenize(raw))
        print("-----------------------------------------------------------------------------")
        print(tokenizer.decode(processed))
        print("\n")
        
    for sentense in raw_inputs:
        tokens = tokenizer.tokenize(sentense)
        if tokenizer.unk_token in tokens:
            print("Simple [UNK] check:")
            print(sentense)
            print(tokens)
            
def viz_not_existing_tokens(raw_inputs, examples=-1):
    initial_vocab = list(tokenizer.vocab.keys())
    examples = len(raw_inputs) if examples == -1 else examples
    inputs = tokenizer(raw_inputs, truncation=True, return_overflowing_tokens=True,return_length=True)
    for i, sentense in zip(range(examples), raw_inputs):
        print(f"+++++++++++++++++++++++++++++ sentence number {i} +++++++++++++++++++++++++++++")
        print(f"The sentence: {sentense}")
        not_existing = []
        for word in sentense.split():
            if word not in initial_vocab: 
                not_existing.append(word)
        print(f"Not existing words: {not_existing}")
        print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        print("\n")
                
def add_tokens(additional_tokens, model):
    tokenizer.add_tokens(additional_tokens)
    # Additionl rows are added at the end of the embeddings
    # The initialization is like in "torch.nn.Embeddin" - normal(0,1)
    model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model

def print_model_arch(model):
    print([module for module in model.modules()])
    
def print_model_total_params(model):
    total_parmeters = sum(p.numel() for p in model.parameters())
    print(f"Parameters number(Including embeddings): {total_parmeters /1000**2}M")


In [301]:
#Base on samples from https://www.kaggle.com/jeet2016/us-financial-news-articles
raw_inputs_train = ["UK's Compass says new CRCC Co. CEO to start Jan 1 after death of Cousins - Reuters",
                    "scenes from deadly protests in Iran STR | AFP | Getty Images 5 Mins Ago Unrest in Iran"]
raw_inputs_valid = ["Tesla delivers 1,550 Model 3 sedans and 29,870 total vehicles in fourth quarter"]

In [None]:
init_vocab_stat(tokenizer, voc_size_sample=30)
viz_sentence_tokens(raw_inputs, examples=-1)
#viz_not_existing_tokens(raw_inputs, examples=-1)
#print_model_arch(model)
#print_model_total_params(model)

In [321]:
additional_tokens = ["CRCC", "seden"]
tokenizer, model = add_tokens(additional_tokens, model)
df1 = pd.DataFrame(raw_inputs_train, columns=["content"])
df2 = pd.DataFrame(raw_inputs_valid, columns=["content"])
raw_dataset_train = Dataset.from_pandas(df1)
raw_dataset_valid = Dataset.from_pandas(df1)
raw_datasets = DatasetDict({"train": raw_dataset_train, "valid": raw_dataset_valid})

In [339]:
def tokenize(element):
    context_length = 9
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

# Noteice that the addition of the new words are ignored! when retokenized the dataset by the modle tokenized!!!
tokenized_datasets = raw_datasets.map(tokenize, batched=True, remove_columns=raw_datasets["train"].column_names)
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
tokenized_datasets

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 154.89ba/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 273.37ba/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 4
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 4
    })
})

In [361]:

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=1,#32,
    per_device_eval_batch_size=1,#32,
    evaluation_strategy="steps",
    eval_steps=1,#5_000,
    logging_steps=1,#,5_000,
    gradient_accumulation_steps=1,#8,
    num_train_epochs=2,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=2#5_000,
    #fp16=True
)

trainer = Trainer(model=model,
                  tokenizer=tokenizer,
                  args=args,
                  data_collator=data_collator,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["valid"])

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 4
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 8


Step,Training Loss,Validation Loss
1,6.9441,6.675638


***** Running Evaluation *****
  Num examples = 4
  Batch size = 1
