## Preprocessing and training a pretrained model

### <A HREF="https://huggingface.co/docs/transformers/preprocessing">Preprocessing</A><BR><A HREF="https://huggingface.co/docs/transformers/training">Training</A>

In [1]:
from transformers import AutoTokenizer
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict
import math # for perplexity evaluation
from transformers import TrainingArguments
from transformers import Trainer

In [41]:
# Some memory cleanup
del model
del trainer
torch.cuda.empty_cache()

In [2]:
if not torch.cuda.is_available():
    print("CUDA not available")
    quit()

In [3]:
# Load UCSB text files with the speeches
path = './Data/DataUCSB/'

'''
list_of_files = []

for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.txt'):
            list_of_files.append(os.path.join(root,file))

speeches = []
for file in list_of_files:
    with open(file, encoding='utf8') as f:
        text = f.read()
    speeches.append(text)
'''

### Preprocessing

In [4]:
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# downloads model, about 256MB

In [5]:
# distilbert trains faster than vanilla bert with little loss in downstream performance, apparently
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
# Clean up speeches just a little, and put all sentences in a list for tokenizer
speecheslist = [speech.split('.') for speech in speeches]
# How to flatten a list = [item for sublist in list_of_lists for item in sublist]
sentences = [sentence for speech in speecheslist for sentence in speech]
batch_sentences = [sentence.replace('[Laughter]', '').strip().lower() for sentence in sentences]

In [47]:
''' MAY NOT NEED THIS ANYMORE
# tokenize and encode sentences with padding and truncattion
# Return tensors to feed to the model, "pt" =  PyTorch, "tf" = TensorFlow [, return_tensors="pt"]
encoded_train = tokenizer(train, padding="max_length", truncation=True, return_tensors="pt")
encoded_test = tokenizer(test, padding="max_length", truncation=True, return_tensors="pt")
encoded_sentences = tokenizer(batch_sentences, padding="max_length", truncation=True, return_tensors="pt")
'''

' MAY NOT NEED THIS ANYMORE\n# tokenize and encode sentences with padding and truncattion\n# Return tensors to feed to the model, "pt" =  PyTorch, "tf" = TensorFlow [, return_tensors="pt"]\nencoded_train = tokenizer(train, padding="max_length", truncation=True, return_tensors="pt")\nencoded_test = tokenizer(test, padding="max_length", truncation=True, return_tensors="pt")\nencoded_sentences = tokenizer(batch_sentences, padding="max_length", truncation=True, return_tensors="pt")\n'

### Training - masked language modeling

<A HREF="https://huggingface.co/course/chapter7/3?fw=pt">huggingface: Fine Tuning a masked language model</A>

In [7]:
# Trainer is spitting out memory errors, will try 64 instead of 128, see if it works
#chunk_size = 128 # smaller than model_max_length of 512, for memory considerations
chunk_size = 32

In [8]:
dataset = load_dataset(path)

Resolving data files:   0%|          | 0/255 [00:00<?, ?it/s]

Using custom data configuration DataUCSB-5ec176c9a6b6170e
Found cached dataset text (C:/Users/peter/.cache/huggingface/datasets/text/DataUCSB-5ec176c9a6b6170e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
train_testvalid = dataset["train"].train_test_split(seed = 33, test_size=0.2)
test_valid = train_testvalid["test"].train_test_split(seed = 33, test_size=0.5)
oba_data = DatasetDict({
    "train": train_testvalid["train"],
    "test": test_valid["test"],
    "valid": test_valid["train"]
})

Loading cached split indices for dataset at C:\Users\peter\.cache\huggingface\datasets\text\DataUCSB-5ec176c9a6b6170e\0.0.0\cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2\cache-d9a2ef813246c68c.arrow and C:\Users\peter\.cache\huggingface\datasets\text\DataUCSB-5ec176c9a6b6170e\0.0.0\cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2\cache-e9bcacbd605169b3.arrow
Loading cached split indices for dataset at C:\Users\peter\.cache\huggingface\datasets\text\DataUCSB-5ec176c9a6b6170e\0.0.0\cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2\cache-aac2259e94624552.arrow and C:\Users\peter\.cache\huggingface\datasets\text\DataUCSB-5ec176c9a6b6170e\0.0.0\cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2\cache-21208b586ce01c03.arrow


In [10]:
oba_data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4474
    })
    test: Dataset({
        features: ['text'],
        num_rows: 560
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 559
    })
})

In [11]:
'''  MAY NOT NEED THIS ANYMORE
def tokenize_function(batch):
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128, return_tensors="pt")
    return tokenized_batch

enc_dataset = oba_data.map(tokenize_function, batched = True)
'''

'  MAY NOT NEED THIS ANYMORE\ndef tokenize_function(batch):\n    tokenized_batch = tokenizer(batch[\'text\'], padding=True, truncation=True, max_length=128, return_tensors="pt")\n    return tokenized_batch\n\nenc_dataset = oba_data.map(tokenize_function, batched = True)\n'

In [12]:
def tokenize_function(batch):
    result = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=chunk_size, return_tensors="pt")
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

# Use batched=True to activate fast multithreading!
tokenized_datasets = oba_data.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 4474
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 560
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 559
    })
})

In [13]:
#tokenized_datasets2 = tokenized_datasets.remove_columns(books_dataset["train"].column_names)

In [14]:
# Mask 15% of the tokens
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [15]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 4474
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 560
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 559
    })
})

In [16]:
# Don't need to downsample
downsampled_dataset = lm_datasets

In [17]:
# Login to huggingface to run the trainer - need to specify write permission, read will not work
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to C:\Users\peter/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [18]:
batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [19]:


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

C:\Users\peter\Documents\tBK_work\distilbert-base-uncased-finetuned-imdb is already a clone of https://huggingface.co/peterday/distilbert-base-uncased-finetuned-imdb. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend


In [22]:
# Keep running into memory errors here, hmmmmm
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4474
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 210


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 3.47 GiB already allocated; 0 bytes free; 3.50 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [21]:
torch.cuda.empty_cache()

In [23]:
torch.cuda.memory_allocated(device=None)

3725117952