In [1]:
# !pip install transformers -U
# !pip install datasets
# !pip install nvidia-ml-py3 
# !pip install humanize
# !pip install torch -U 
# !pip install transformers[sentencepiece]
# !pip install -q git+https://github.com/gmihaila/ml_things.git
# !pip list | grep -E 'transformers|tokenizers'

In [2]:
import os
import re
os.makedirs("my-xlnet-model", exist_ok=True)

In [3]:
os.environ["WANDB_MODE"] = 'offline'
os.environ["WANDB_DISABLED"] = "true"

In [4]:
from pathlib import Path
paths = ['../../wiki.txt']
# paths = ['../input/idwikitext/wiki.txt']

In [5]:
from datasets import load_dataset
datasets = load_dataset("text", data_files={"train": paths})

Using custom data configuration default-bbccf846be071e24
Reusing dataset text (C:\Users\Acer\.cache\huggingface\datasets\text\default-bbccf846be071e24\0.0.0\4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
from transformers import XLNetTokenizer
# tokenizer = XLNetTokenizer(
#     "../input/spmmodel/spm.uncased.test.model",
#     do_lower_case=True
# )
tokenizer = XLNetTokenizer(
    "../../spm.v1.uncased.model",
    do_lower_case=True
)
# tokenizer = XLNetTokenizer.from_pretrained("../input/xlnet-gpu/my-xlnet-model")

In [7]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
def tokenize_function(examples):
    # Remove empty lines
    examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
    return tokenizer(examples["text"], padding=True, truncation=True, max_length=512)

In [9]:
tokenized_datasets = datasets.map(tokenize_function, batched=False, num_proc=4, remove_columns=["text"])

    

NameError: name 'tokenizer' is not defined

In [None]:
# block_size = tokenizer.model_max_length
block_size = 32

In [None]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder,
    # we could add padding if the model supported it instead of this drop,
    # you can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    )

In [None]:
from transformers import AutoConfig, XLNetLMHeadModel

config = AutoConfig(
    n_layer=12,
    d_model=768,
    n_head=12,
    d_inner=4096,
    dropout=0.1,
    dropatt=0.1,
    bi_data=True,
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    cutoffs=[]
)
model = XLNetLMHeadModel(config=config)
# model = XLNetLMHeadModel.from_pretrained("../input/xlnet-gpu/my-xlnet-model/checkpoint-485280")
model.num_parameters()

In [None]:
model = model.to(device)

In [None]:
from transformers import DataCollatorForPermutationLanguageModeling
data_collator = DataCollatorForPermutationLanguageModeling(tokenizer=tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments

"""
Batch size                  8192 x
Learning rate               4e-4 x
Number of steps             500K x
Warmup steps                40.000 x
Learning rate decay         linear x
Adam epsilon                1e-6 x
Weigth decay                0.01 x
"""

training_args = TrainingArguments(
    output_dir="my-xlnet-model",
    overwrite_output_dir=True,
    do_train=True,
    num_train_epochs=16,
    per_device_train_batch_size=8192,
    learning_rate=4e-4,
    weight_decay=0.01,
    max_steps=500_000,
    adam_epsilon=1e-6,
    warmup_steps=40_000,
    save_steps=1011,
    save_total_limit=2,
    prediction_loss_only=False,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
)

In [None]:
trainer.train()
# trainer.train(model_path="../input/xlnet-gpu/my-xlnet-model/checkpoint-485280")
trainer.save_model("my-xlnet-model")
tokenizer.save_pretrained("my-xlnet-model")

In [None]:
# Keep track of train and evaluate loss.
loss_history = {'train_loss':[], 'eval_loss':[]}

# Keep track of train and evaluate perplexity.
# This is a metric useful to track for language models.
perplexity_history = {'train_perplexity':[], 'eval_perplexity':[]}

# Loop through each log history.
for log_history in trainer.state.log_history:
    if 'loss' in log_history.keys():
        # Deal with trianing loss.
        loss_history['train_loss'].append(log_history['loss'])
        perplexity_history['train_perplexity'].append(math.exp(log_history['loss']))
    
    elif 'eval_loss' in log_history.keys():
        # Deal with eval loss.
        loss_history['eval_loss'].append(log_history['eval_loss'])
        perplexity_history['eval_perplexity'].append(math.exp(log_history['eval_loss']))

# Plot Losses.
plot_dict(loss_history, start_step=training_args.logging_steps, 
          step_size=training_args.logging_steps, use_title='Loss', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=2)

print()

# Plot Perplexities.
plot_dict(perplexity_history, start_step=training_args.logging_steps, 
          step_size=training_args.logging_steps, use_title='Perplexity', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=2)