In [1]:
!pip install --upgrade accelerate



In [2]:
# we upgraded `accelerate` just because to import Trainer API
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from glob import glob
from datasets import load_dataset
import warnings
from datasets import Dataset
warnings.filterwarnings("ignore")
MODEL_NAME = "gpt2"

2024-03-12 09:34:34.671372: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-12 09:34:34.671475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-12 09:34:34.800086: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:

class DataPreprocessor:
    def __init__(self, model_name="gpt2", max_length=64):
        # Initialize the DataPreprocessor with the specified model name and max_length
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.tokenizer.pad_token = "<pad>"
    
    def load_dataset(self, dataset_name, split=['train[:30%]', 'validation[:20%]']):
        # Load the dataset using the specified split
        try:
            print(f"Loading {dataset_name} dataset...")
            train_ds, validation_ds = load_dataset(dataset_name, split=split)
            print(f"Loaded {dataset_name} dataset.")
            return train_ds, validation_ds
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return None, None
    
    def preprocess(self, dataset):
        # Preprocess the dataset by combining question and answer text
        print("Preprocessing dataset...")
        def combine_text(example):
            example["text"] = (example["question"] + " " + example["answers"]["text"][0])
            return example
        
        try:
            preprocessed_dataset = dataset.map(combine_text, remove_columns=["id", "title", "context", "question", "answers"])
            print("Dataset preprocessing completed.")
            return preprocessed_dataset
        except Exception as e:
            print(f"Error preprocessing dataset: {e}")
            return None
    
    def tokenize(self, dataset):
        # Tokenize the dataset
        print("Tokenizing dataset...")
        def tokenize_function(examples):
            return self.tokenizer(examples["text"], max_length=self.max_length, truncation=True, padding="max_length")
        
        try:
            tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])
            print("Dataset tokenization completed.")
            return tokenized_dataset
        except Exception as e:
            print(f"Error tokenizing dataset: {e}")
            return None
    
    def add_labels(self, dataset):
        # Add labels to the dataset
        print("Adding labels to dataset...")
        def copy_input_ids(example):
            example["labels"] = example["input_ids"].copy()
            return example
        
        try:
            labeled_dataset = dataset.map(copy_input_ids)
            print("Labels added to dataset.")
            return labeled_dataset
        except Exception as e:
            print(f"Error adding labels to dataset: {e}")
            return None
    
    def preprocess_pipeline(self, dataset_name, split=['train[:30%]', 'validation[:20%]']):
        # Execute the preprocessing pipeline
        train_ds, validation_ds = self.load_dataset(dataset_name, split)
        if train_ds is None or validation_ds is None:
            # Dataset loading failed, return None
            print("Preprocessing pipeline aborted due to dataset loading error.")
            return None, None
        
        train_ds = self.preprocess(train_ds)
        validation_ds = self.preprocess(validation_ds)
        if train_ds is None or validation_ds is None:
            # Dataset preprocessing failed, return None
            print("Preprocessing pipeline aborted due to dataset preprocessing error.")
            return None, None
        
        train_ds = self.tokenize(train_ds)
        validation_ds = self.tokenize(validation_ds)
        if train_ds is None or validation_ds is None:
            # Dataset tokenization failed, return None
            print("Preprocessing pipeline aborted due to dataset tokenization error.")
            return None, None
        
        train_ds = self.add_labels(train_ds)
        validation_ds = self.add_labels(validation_ds)
        if train_ds is None or validation_ds is None:
            # Adding labels failed, return None
            print("Preprocessing pipeline aborted due to label addition error.")
            return None, None
        
        return train_ds, validation_ds

# Usage
preprocessor = DataPreprocessor()
tokenized_train_ds, tokenized_validation_ds = preprocessor.preprocess_pipeline('squad')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loading squad dataset...


Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Loaded squad dataset.
Preprocessing dataset...


  0%|          | 0/26280 [00:00<?, ?ex/s]

Dataset preprocessing completed.
Preprocessing dataset...


  0%|          | 0/2114 [00:00<?, ?ex/s]

Dataset preprocessing completed.
Tokenizing dataset...
   

#0:   0%|          | 0/14 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/14 [00:00<?, ?ba/s]

Dataset tokenization completed.
Tokenizing dataset...
   

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

Dataset tokenization completed.
Adding labels to dataset...


  0%|          | 0/26280 [00:00<?, ?ex/s]

Labels added to dataset.
Adding labels to dataset...


  0%|          | 0/2114 [00:00<?, ?ex/s]

Labels added to dataset.


In [4]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
training_args = TrainingArguments(
    "gpt2-finetuned-on-squad",
    
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_num_workers=2,

    evaluation_strategy = "steps",
    logging_strategy="steps",
    save_strategy="steps",
    eval_steps=0.1,
    logging_steps=0.1,
    save_steps=0.1,

    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=10,
    report_to='none',
)

In [6]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_validation_ds,
)

In [7]:
train_output = trainer.train()
print(train_output)

Step,Training Loss,Validation Loss
103,1.2304,0.95394
206,0.9968,0.951242
309,0.9389,0.960785
412,0.9294,0.962202
515,0.893,0.96503
618,0.8959,0.967117
721,0.872,0.971241
824,0.8677,0.970373
927,0.8559,0.974493
1030,0.8549,0.973988


TrainOutput(global_step=1030, training_loss=0.9334992936513956, metrics={'train_runtime': 1437.364, 'train_samples_per_second': 91.417, 'train_steps_per_second': 0.717, 'total_flos': 4291721625600000.0, 'train_loss': 0.9334992936513956, 'epoch': 5.0})


In [9]:
# TODO input prompt
prompt = "What is Beyonce's full name?"
encoded_prompt = preprocessor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
encoded_prompt = encoded_prompt.to(trainer.model.device)
# prediction
output_sequences = trainer.model.generate(
    input_ids=encoded_prompt,
    max_length=64,
    min_length=1,
    temperature=1.,
    top_p=0.95,
    do_sample=True,
    num_return_sequences=10,
    pad_token_id=preprocessor.tokenizer.pad_token_id,
)

generated_sequences = []

# decode prediction
for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    generated_sequence = generated_sequence.tolist()
    text = preprocessor.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=False)
    generated_sequences.append(text.strip())


In [10]:
generated_sequences[3]

"What is Beyonce's full name? Taylor Swift<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>"

In [11]:
directories = glob("/kaggle/working/gpt2-finetuned-on-squad/checkpoint-*")
directories.sort(key=lambda x: int(x.split("checkpoint-")[1]))

In [14]:
prompt_in_train = "What is Beyonce's full name?"  # in train data
prompt_not_in_train = "Who was Mongolia's first president?"  # NOT in train data - but similar
encoded_prompt_in_train = preprocessor.tokenizer(prompt_in_train, add_special_tokens=False, return_tensors="pt").input_ids
encoded_prompt_not_in_train = preprocessor.tokenizer(prompt_not_in_train, add_special_tokens=False, return_tensors="pt").input_ids

for path in directories:
    print("--------------")
    print(path)
    print("--------------")
    _model = AutoModelForCausalLM.from_pretrained(path)

    for _encoded_prompt in [encoded_prompt_in_train, encoded_prompt_not_in_train]:
        output_sequences = _model.generate(
            input_ids=_encoded_prompt,
            max_length=64,
            min_length=10,
            temperature=1.,
            top_p=0.95,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=preprocessor.tokenizer.pad_token_id,
        )

        text = preprocessor.tokenizer.decode(output_sequences[0], clean_up_tokenization_spaces=True, skip_special_tokens=False)
        
        # Simplifying for demo
        question, answer = text.split("?")[:2]
        answer = answer.split(".")[0]
        print(question + "?", answer + "...")

--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-103
--------------
What is Beyonce's full name?  Sasha Banks<|endoftext|>...
Who was Mongolia's first president?  Jiang Zemin<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-206
--------------
What is Beyonce's full name?  Miss Universe<|endoftext|>...
Who was Mongolia's first president?  the Dalai Lama<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-309
--------------
What is Beyonce's full name?  Jane Austen<|endoftext|>...
Who was Mongolia's first president?  Ming Kublai<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-412
--------------
What is Beyonce's full name?   Rosemary<|endoftext|>...
Who was Mongolia's first president?  Yücel<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-515
--------------
What is Beyonce's full name?   Jennifer Lopez<|endoftext|>...
Who was Mongolia's first