In [1]:
!pip install --upgrade accelerate



In [2]:
# we upgraded `accelerate` just because to import Trainer API
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from glob import glob
from datasets import load_dataset

2024-03-08 03:44:17.669836: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 03:44:17.669963: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 03:44:17.801127: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [19]:
train_ds, validation_ds = load_dataset('squad', split=['train[:30%]', 'validation[:20%]'])

  0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
datasets

[Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 26280
 }),
 Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 2114
 })]

In [20]:
def preprocess(example):
    example["text"] = (example["question"] + " " + example["answers"]["text"][0])
    return example

train_ds = train_ds.map(preprocess, remove_columns=["id", "title", "context", "question", "answers"])
validation_ds = validation_ds.map(preprocess, remove_columns=["id", "title", "context", "question", "answers"])

  0%|          | 0/26280 [00:00<?, ?ex/s]

  0%|          | 0/2114 [00:00<?, ?ex/s]

In [21]:
# TODO : choose model name
MODEL_NAME = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = "<pad>"

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [22]:
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=64, truncation=True, padding="max_length")

In [23]:
tokenized_train_ds = train_ds.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])
tokenized_validation_ds = validation_ds.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

    

#0:   0%|          | 0/14 [00:00<?, ?ba/s]

#1:   0%|          | 0/14 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

In [24]:
def copy_input_ids(example):
    example["labels"] = example["input_ids"].copy()
    return example

In [25]:
tokenized_train_ds = tokenized_train_ds.map(copy_input_ids)
tokenized_validation_ds = tokenized_validation_ds.map(copy_input_ids)

  0%|          | 0/26280 [00:00<?, ?ex/s]

  0%|          | 0/2114 [00:00<?, ?ex/s]

In [26]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [27]:
training_args = TrainingArguments(
    "gpt2-finetuned-on-squad",
    
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_num_workers=2,

    evaluation_strategy = "steps",
    logging_strategy="steps",
    save_strategy="steps",
    eval_steps=0.1,
    logging_steps=0.1,
    save_steps=0.1,

    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=10,
    report_to='none',
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_validation_ds,
)

In [29]:
train_output = trainer.train()
print(train_output)

Step,Training Loss,Validation Loss
206,1.1285,0.954368
412,0.982,0.949992
618,0.9142,0.962992
824,0.9043,0.967635
1030,0.8589,0.973458
1236,0.8618,0.977741
1442,0.8319,0.98388
1648,0.8281,0.983455
1854,0.8129,0.988707


TrainOutput(global_step=2055, training_loss=0.8935120965442518, metrics={'train_runtime': 1196.4095, 'train_samples_per_second': 109.829, 'train_steps_per_second': 1.718, 'total_flos': 4291721625600000.0, 'train_loss': 0.8935120965442518, 'epoch': 5.0})


In [30]:
# TODO input prompt
prompt = "What is Beyonce's full name?"
encoded_prompt = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
encoded_prompt = encoded_prompt.to(trainer.model.device)

# prediction
output_sequences = trainer.model.generate(
    input_ids=encoded_prompt,
    max_length=64,
    min_length=1,
    temperature=1.,
    top_p=0.95,
    do_sample=True,
    num_return_sequences=10,
    pad_token_id=tokenizer.pad_token_id,
)

generated_sequences = []

# decode prediction
for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    generated_sequence = generated_sequence.tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=False)
    generated_sequences.append(text.strip())


In [31]:
generated_sequences[3]

"What is Beyonce's full name? Beyoncé<|endoftext|><|endoftext|><|endoftext|>"

In [32]:
directories = glob("/kaggle/working/gpt2-finetuned-on-squad/checkpoint-*")
directories.sort(key=lambda x: int(x.split("checkpoint-")[1]))

In [33]:
prompt_in_train = "What is Beyonce's full name?"  # in train data
prompt_not_in_train = "Who was Mongolia's first president?"  # NOT in train data - but similar
encoded_prompt_in_train = tokenizer(prompt_in_train, add_special_tokens=False, return_tensors="pt").input_ids
encoded_prompt_not_in_train = tokenizer(prompt_not_in_train, add_special_tokens=False, return_tensors="pt").input_ids

for path in directories:
    print("--------------")
    print(path)
    print("--------------")
    _model = AutoModelForCausalLM.from_pretrained(path)

    for _encoded_prompt in [encoded_prompt_in_train, encoded_prompt_not_in_train]:
        output_sequences = _model.generate(
            input_ids=_encoded_prompt,
            max_length=64,
            min_length=10,
            temperature=1.,
            top_p=0.95,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
        )

        text = tokenizer.decode(output_sequences[0], clean_up_tokenization_spaces=True, skip_special_tokens=False)
        
        # Simplifying for demo
        question, answer = text.split("?")[:2]
        answer = answer.split(".")[0]
        print(question + "?", answer + "...")

--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-206
--------------
What is Beyonce's full name?  Beyonce<|endoftext|>...
Who was Mongolia's first president?  Mongol prince
<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-412
--------------
What is Beyonce's full name?  Beyonce Mariah Carey<|endoftext|>...
Who was Mongolia's first president?  Emperor Wu Jin<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-618
--------------
What is Beyonce's full name?  Jennifer Lopez<|endoftext|>...
Who was Mongolia's first president?  Yuanxuan<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-824
--------------
What is Beyonce's full name?  Elizabeth Bower<|endoftext|>...
Who was Mongolia's first president?   Zhongguo Zhizhu<|endoftext|>...
--------------
/kaggle/working/gpt2-finetuned-on-squad/checkpoint-1030
--------------
What is Beyonce's full name?  Jennifer Hudson<|endoftext|>...
