In [None]:
!pip install --upgrade accelerate

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.2
    Uninstalling accelerate-0.27.2:
      Successfully uninstalled accelerate-0.27.2
Successfully installed accelerate-0.28.0


In [None]:
# `accelerate` was upgraded in the cell above only so that the Trainer API can be imported.
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from glob import glob
from datasets import load_dataset

2024-03-20 18:55:56.736288: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-20 18:55:56.736417: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-20 18:55:56.859343: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Notebook-wide configuration; every later cell reads these names.
MODEL = "distilbert/distilgpt2"  # checkpoint id used for both the tokenizer and the model
DATASET_NAME = "wiki_movies"     # dataset id handed to the preprocessing pipeline
MAX_LENGTH = 64                  # fixed token length used when tokenizing (truncate + pad)
EPOCHS = 3                       # number of training epochs

In [None]:
class DataPreprocessor:
    """Turn a question/answer dataset into tokenized causal-LM training data.

    The pipeline loads train/validation slices, joins each question with its
    answer into a single ``text`` field, tokenizes to a fixed length and adds a
    ``labels`` column. Every stage prints its progress; on failure it prints
    the error and returns ``None`` so :meth:`preprocess_pipeline` can abort.
    """

    # Label value skipped by the cross-entropy loss used for causal-LM training.
    IGNORE_INDEX = -100

    def __init__(self, model_name, max_length):
        """Load the tokenizer for ``model_name`` and remember ``max_length``.

        Args:
            model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
            max_length: Token length every example is truncated/padded to.
        """
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Padding to a fixed length needs a pad token. When the tokenizer does
        # not define one, reuse EOS instead of a literal "<pad>" string that
        # is not guaranteed to exist in the vocabulary.
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def load_dataset(self, dataset_name, splits=("train[:30%]", "validation[:15%]")):
        """Load the train and validation slices of ``dataset_name``.

        Args:
            dataset_name: Identifier passed to ``datasets.load_dataset``.
            splits: Two split expressions (train, validation).

        Returns:
            ``(train_ds, validation_ds)``, or ``(None, None)`` if loading failed.
        """
        try:
            print(f"Loading {dataset_name} dataset...")
            train_ds, validation_ds = load_dataset(dataset_name, split=list(splits))
            print(f"Loaded {dataset_name} dataset.")
            return train_ds, validation_ds
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return None, None

    def preprocess(self, dataset):
        """Merge ``question`` and ``answer`` into one ``text`` column.

        The tokenizer's EOS token is appended to every text so the model gets
        an explicit end-of-answer target (texts longer than ``max_length``
        lose it again when truncated during tokenization).

        Returns:
            The mapped dataset without the ``question``/``answer`` columns, or
            ``None`` if preprocessing failed.
        """
        print("Preprocessing dataset...")
        try:
            eos_token = self.tokenizer.eos_token or ""

            def build_text(example):
                example["text"] = example["question"] + " " + example["answer"] + eos_token
                return example

            preprocessed_dataset = dataset.map(build_text, remove_columns=["question", "answer"])
            print("Dataset preprocessing completed.")
            return preprocessed_dataset
        except Exception as e:
            print(f"Error preprocessing dataset: {e}")
            return None

    def tokenize(self, dataset):
        """Tokenize the ``text`` column to exactly ``max_length`` tokens.

        Returns:
            The tokenized dataset without the ``text`` column, or ``None`` if
            tokenization failed.
        """
        print("Tokenizing dataset...")

        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                max_length=self.max_length,
                truncation=True,
                padding="max_length",
            )

        try:
            tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])
            print("Dataset tokenization completed.")
            return tokenized_dataset
        except Exception as e:
            print(f"Error tokenizing dataset: {e}")
            return None

    def add_labels(self, dataset):
        """Add a ``labels`` column for causal-LM training.

        Labels mirror ``input_ids``, except that positions whose
        ``attention_mask`` is 0 (padding) are set to ``IGNORE_INDEX`` so they
        do not contribute to the loss. Examples without an ``attention_mask``
        get a plain copy of ``input_ids``.

        Returns:
            The labelled dataset, or ``None`` if the step failed.
        """
        print("Adding labels to dataset...")
        ignore_index = self.IGNORE_INDEX

        def label_example(example):
            input_ids = list(example["input_ids"])
            if "attention_mask" in example:
                example["labels"] = [
                    token_id if keep else ignore_index
                    for token_id, keep in zip(input_ids, example["attention_mask"])
                ]
            else:
                example["labels"] = input_ids
            return example

        try:
            labeled_dataset = dataset.map(label_example)
            print("Labels added to dataset.")
            return labeled_dataset
        except Exception as e:
            print(f"Error adding labels to dataset: {e}")
            return None

    def preprocess_pipeline(self, dataset_name):
        """Run load -> preprocess -> tokenize -> add_labels on both splits.

        Returns:
            ``(train_ds, validation_ds)`` ready for training, or
            ``(None, None)`` as soon as any stage fails for either split.
        """
        train_ds, validation_ds = self.load_dataset(dataset_name)
        if train_ds is None or validation_ds is None:
            print("Preprocessing pipeline aborted due to dataset loading error.")
            return None, None

        # Each stage is applied to both splits; a None result marks a failure.
        stages = (
            (self.preprocess, "dataset preprocessing"),
            (self.tokenize, "dataset tokenization"),
            (self.add_labels, "label addition"),
        )
        for stage, description in stages:
            train_ds = stage(train_ds)
            validation_ds = stage(validation_ds)
            if train_ds is None or validation_ds is None:
                print(f"Preprocessing pipeline aborted due to {description} error.")
                return None, None

        return train_ds, validation_ds

In [None]:
# Build the tokenized, labelled train/validation splits for DATASET_NAME.
tokenized_train_ds, tokenized_validation_ds = DataPreprocessor(MODEL, MAX_LENGTH).preprocess_pipeline(DATASET_NAME)

# preprocess_pipeline() reports a failed stage by returning (None, None).
# Stop here with a clear message instead of handing None datasets to the Trainer.
if tokenized_train_ds is None or tokenized_validation_ds is None:
    raise RuntimeError(
        f"Preprocessing of dataset {DATASET_NAME!r} failed; see the messages printed above."
    )

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loading wiki_movies dataset...


Downloading builder script:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading and preparing dataset wiki_movies/default (download: 54.43 MiB, generated: 8.38 MiB, post-processed: Unknown size, total: 62.80 MiB) to /root/.cache/huggingface/datasets/wiki_movies/default/1.1.0/2fab0fed49fad4c5854fcf8d4e958439d961e0d7de5d5ed2ca9ce54e309347cd...


Downloading data:   0%|          | 0.00/57.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/96185 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9952 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset wiki_movies downloaded and prepared to /root/.cache/huggingface/datasets/wiki_movies/default/1.1.0/2fab0fed49fad4c5854fcf8d4e958439d961e0d7de5d5ed2ca9ce54e309347cd. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Loaded wiki_movies dataset.
Preprocessing dataset...


  0%|          | 0/28856 [00:00<?, ?ex/s]

Dataset preprocessing completed.
Preprocessing dataset...


  0%|          | 0/1500 [00:00<?, ?ex/s]

Dataset preprocessing completed.
Tokenizing dataset...
   

#0:   0%|          | 0/15 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/15 [00:00<?, ?ba/s]

Dataset tokenization completed.
Tokenizing dataset...
   

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset tokenization completed.
Adding labels to dataset...


  0%|          | 0/28856 [00:00<?, ?ex/s]

Labels added to dataset.
Adding labels to dataset...


  0%|          | 0/1500 [00:00<?, ?ex/s]

Labels added to dataset.


In [None]:
# Load the pretrained causal language model identified by MODEL; this is the model that gets fine-tuned.
model = AutoModelForCausalLM.from_pretrained(MODEL)

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Fine-tuning configuration. Evaluation, logging and checkpointing all use the
# "steps" strategy with the same step value of 0.1, so each evaluation row is
# paired with a logged training loss and a saved checkpoint.
training_args = TrainingArguments(
    output_dir="gpt2-finetuned-on-wiki-movies",
    report_to="none",
    # Schedule and data loading
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_num_workers=2,
    # Optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    # Evaluation / logging / checkpoint cadence
    evaluation_strategy="steps",
    eval_steps=0.1,
    logging_strategy="steps",
    logging_steps=0.1,
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=10,
)

In [None]:
# Wire the model, the training configuration and both tokenized splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_validation_ds,
    train_dataset=tokenized_train_ds,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run the fine-tuning loop, then print the summary object returned by train().
train_output = trainer.train()
print(train_output)

Step,Training Loss,Validation Loss
136,1.9075,1.458691
272,1.0228,1.433036
408,0.978,1.43222
544,0.9544,1.433619
680,0.9272,1.438336
816,0.9349,1.438208
952,0.9179,1.439635
1088,0.9144,1.440559
1224,0.897,1.442181


TrainOutput(global_step=1353, training_loss=1.0378565763598975, metrics={'train_runtime': 516.0368, 'train_samples_per_second': 167.755, 'train_steps_per_second': 2.622, 'total_flos': 1413746069078016.0, 'train_loss': 1.0378565763598975, 'epoch': 3.0})


In [None]:
# Prompt used to sample answers from the fine-tuned model.
prompt = "When was cars 2 released?"

# A freshly loaded tokenizer for this model may not define a padding token; in
# that case `pad_token_id` is None and generate() warns and falls back on its
# own. Reuse EOS explicitly so the padding id passed below is well defined.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep the whole encoding so the attention mask can be passed to generate().
prompt_encoding = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(trainer.model.device)
encoded_prompt = prompt_encoding.input_ids

# generate() does not switch the model to evaluation mode itself; make sure
# dropout is disabled before sampling from a model that was just trained.
trainer.model.eval()

# prediction: sample 10 continuations with nucleus sampling
output_sequences = trainer.model.generate(
    input_ids=encoded_prompt,
    attention_mask=prompt_encoding.attention_mask,
    max_length=64,
    min_length=1,
    temperature=1.,
    top_p=0.95,
    do_sample=True,
    num_return_sequences=10,
    pad_token_id=tokenizer.pad_token_id,
)

# decode prediction (special tokens are kept so the end-of-text marker stays visible)
generated_sequences = [
    tokenizer.decode(
        sequence.tolist(),
        clean_up_tokenization_spaces=True,
        skip_special_tokens=False,
    ).strip()
    for sequence in output_sequences
]


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Display one of the sampled completions (the entry at index 3).
generated_sequences[3]

'When was cars 2 released?\n<|endoftext|><|endoftext|>'

In [None]:
# Collect the checkpoint directories written during training, ordered by step.
# The location is taken from `training_args.output_dir` rather than a hard-coded
# absolute path, so the cell keeps working when the notebook runs outside
# /kaggle/working or the output directory is renamed.
checkpoint_pattern = f"{training_args.output_dir}/checkpoint-*"
directories = sorted(
    glob(checkpoint_pattern),
    # Parse the step from the last "checkpoint-" so a parent directory that
    # happens to contain the same substring cannot break the sort key.
    key=lambda checkpoint_path: int(checkpoint_path.rsplit("checkpoint-", 1)[1]),
)

In [None]:
prompt_in_train = "When was cars 2 released?"  # in train data
prompt_not_in_train = "Which actors were in the movie cars 2?"  # NOT in train data - but similar

# Keep the full encodings (not just input_ids) so the attention mask can be
# passed to generate(); omitting it triggers a warning on every call.
_prompt_encodings = [
    tokenizer(_prompt, add_special_tokens=False, return_tensors="pt")
    for _prompt in (prompt_in_train, prompt_not_in_train)
]
encoded_prompt_in_train, encoded_prompt_not_in_train = (
    _encoding.input_ids for _encoding in _prompt_encodings
)

# The tokenizer may not define a padding token; fall back to EOS so generate()
# always receives a concrete padding id.
_pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

# Sample one completion per prompt from every saved checkpoint to see how the
# answers change over the course of training.
for path in directories:
    print("--------------")
    print(path)
    print("--------------")
    _model = AutoModelForCausalLM.from_pretrained(path)

    for _encoding in _prompt_encodings:
        output_sequences = _model.generate(
            input_ids=_encoding.input_ids,
            attention_mask=_encoding.attention_mask,
            max_length=64,
            min_length=10,
            temperature=1.,
            top_p=0.95,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=_pad_token_id,
        )

        text = tokenizer.decode(output_sequences[0], clean_up_tokenization_spaces=True, skip_special_tokens=False)

        # Simplifying for demo: the question is everything up to the first "?",
        # the answer is what follows, cut at the next "?" or the first ".".
        # partition() never raises, even if the text contains no "?".
        question, _, _remainder = text.partition("?")
        answer = _remainder.split("?", 1)[0].split(".", 1)[0]
        print(question + "?", answer + "...")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-136
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  1997

Advertisements<|endoftext|>...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which actors were in the movie cars 2?  Micky Proust, Tom Batson
<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-272
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  1965

Advertisements<|endoftext|>...
Which actors were in the movie cars 2?  Richard Gannon
<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-408
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  1965
Advertisements
<|endoftext|>...
Which actors were in the movie cars 2?  Richard Farrar
<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-544
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  2003
Advertisements
<|endoftext|>...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which actors were in the movie cars 2?  Davey, the Dolph Lundgren
<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-680
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  1981
Advertisements
<|endoftext|>...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which actors were in the movie cars 2?  Bruce Springsteen, Bruce Springsteen, George Takei
<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-816
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  1983
Advertisements
<|endoftext|>...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which actors were in the movie cars 2?  Chris Jones, Bob Dylan
<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-952
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  1995
Advertisements
<|endoftext|>...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which actors were in the movie cars 2?  Frank Sinatra, I'm Here with you, Pops
<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-1088
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  2002
Advertisements
<|endoftext|>...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Which actors were in the movie cars 2?  Joe, Bobby, John Belushi, John F...
--------------
/kaggle/working/gpt2-finetuned-on-wiki-movies/checkpoint-1224
--------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was cars 2 released?  2001
Advertisements
<|endoftext|>...
Which actors were in the movie cars 2?  Paul, Paulina
<|endoftext|>...
