In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.version.cuda

'11.8'

##### `load model`

In [3]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


##### `Test the Model with Zero Shot Inferencing`

In [5]:
index = 200


prompt = f"""
answer the following question.

Question: what is a neural network?

answer:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

MODEL GENERATION - ZERO SHOT:
neural network


##### `Setup the PEFT/LoRA model for Fine-Tuning`

In [6]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [7]:
### Add LoRA adapter layers/parameters to the original LLM to be trained.

peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [8]:
dataset = load_dataset("text", data_files=r"E:\HF-\test_file\*.pdf", encoding="latin-1")

from datasets import Dataset

def preprocess_function(examples):
    # Split the data into training and test sets
    train_size = int(len(examples) * 0.8)  # 80% for training
    train_examples = examples[:train_size]
    test_examples = examples[train_size:]

    # Preprocess the training examples
    train_first_sentences = train_examples["text"][::2]
    train_second_sentences = train_examples["text"][1::2]
    train_next_sentence_labels = [1] * len(train_second_sentences)

    train_model_inputs = tokenizer(train_first_sentences, train_second_sentences, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    train_model_inputs["next_sentence_label"] = torch.tensor(train_next_sentence_labels)

    train_dataset = Dataset.from_dict(train_model_inputs)

    # Preprocess the test examples
    test_first_sentences = test_examples["text"][::2]
    test_second_sentences = test_examples["text"][1::2]
    test_next_sentence_labels = [1] * len(test_second_sentences)

    test_model_inputs = tokenizer(test_first_sentences, test_second_sentences, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    test_model_inputs["next_sentence_label"] = torch.tensor(test_next_sentence_labels)

    test_dataset = Dataset.from_dict(test_model_inputs)

    return train_dataset, test_dataset

# Process the dataset
processed_train_dataset, processed_test_dataset = preprocess_function(dataset['train'])



In [9]:
processed_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'next_sentence_label'],
    num_rows: 134976
})

##### `Train PEFT Adapter`

Define training arguments and create `Trainer` instance.

In [10]:
output_dir = f'./peft-flant5b-ft-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=processed_train_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [11]:
output_dir

'./peft-flant5b-ft-1717065608'

In [14]:
# training

# ! set WANDB_NOTEBOOK_NAME=flan_t5_base_Ft.ipynb
! set WANDB_MODE=offline
peft_trainer.train()


  0%|          | 0/1 [00:17<?, ?it/s]


ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

In [None]:
peft_model_path="./peft-flant5b-ft-CheckPoint_local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)