In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, Trainer, TrainingArguments, AutoConfig
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from tqdm import tqdm
import time
import evaluate
import pickle

## Dataset and model

We are going to do summarization task, with instructed foundation T5 model

In [5]:
dataset_name = 'knkarthick/dialogsum'
model_name = 'google/flan-t5-base'

In [None]:
dataset = load_dataset(dataset_name)
model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype = torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_base.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [7]:
def llm_reply(model, tokenizer, dialogue):
  prompt = f"""
  Summerize the following conversation:

  Dialogue:

  {dialogue}

  Summary:

  """
  encoded_prompt = tokenizer(prompt, padding = 'max_length', return_tensors = 'pt')['input_ids'].to(device)
  answer = tokenizer.decode(
      model.generate(
          input_ids = encoded_prompt,
          generation_config = GenerationConfig(
              max_new_tokens = 50,
              temperature = 1.0,
          ))[0],
          skip_special_tokens = True
      )
  return answer

## Testing model
We first evaluate the performance of the model for this summarization task

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [15]:
dialogue = dataset['test'][0]['dialogue']
print("Dialogue:\n",dialogue)
print("-"*100)
print("Summary:")
print(llm_reply(model_base, tokenizer, dialogue))
print("-"*100)
print("GT Summary:")
print(dataset['test'][0]['summary'])

Dialogue:
 #Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue with t

Evaluate base model with rouge score

In [16]:
# test
groundtruth_summary_test = dataset['test']['summary']
llm_summary_test_base = []
for i in tqdm(range(len(dataset['test']))):
  dialogue = dataset['test'][i]['dialogue']
  llm_summary_test_base.append(
      llm_reply(model_base, tokenizer, dialogue)
  )

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [07:38<00:00,  3.27it/s]


In [17]:
rouge = evaluate.load('rouge')
result_base = rouge.compute(
    predictions = llm_summary_test_base,
    references = groundtruth_summary_test,
    use_aggregator = True,
    use_stemmer = True
)
print(f"Rouge score for base model on test dataset: \n{result_base}")

Rouge score for base model on test dataset: 
{'rouge1': 0.28317785960299324, 'rouge2': 0.09958983291484252, 'rougeL': 0.2375737739649582, 'rougeLsum': 0.2377417425739}


## Fine-tunning model with Lora
### Prepare dataset for training

In [22]:
def processing_data_training(example):
  prompts = []
  for dialogue in example['dialogue']:
    prompt = f"""
    Summerize the following conversation:
    
    Dialogue:
    
    {dialogue}
    
    Summary:
    
    """
    prompts.append(prompt)
  example['input_ids'] = tokenizer(prompts, padding = 'max_length', truncation = True, return_tensors = 'pt')['input_ids']
  example['labels'] = tokenizer(example['summary'], padding = 'max_length', truncation = True, return_tensors = 'pt')['input_ids']
  return example

In [23]:
encoded_dataset = dataset.map(processing_data_training, batched=True)
encoded_dataset = encoded_dataset.remove_columns(['id', 'dialogue', 'summary', 'topic'])

Map: 100%|███████████████████████████████████████████████████████████████████████████████| 12460/12460 [00:08<00:00, 1552.05 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 1333.54 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 1628.85 examples/s]


### Training configuration

In [24]:
out_dir = f'./qa_training_{str(int(time.time()))}'
lora_config = LoraConfig(
    r = 32,
    lora_alpha = 32,
    target_modules = ['q', 'v'],
    lora_dropout = 0.05,
    bias = 'none',
    task_type = TaskType.SEQ_2_SEQ_LM
)
peft_model = get_peft_model(
    model_base,
    lora_config
)

In [25]:
training_arguments = TrainingArguments(
    output_dir = out_dir,
    auto_find_batch_size = True,
    learning_rate = 1e-3,
    num_train_epochs = 10,
    logging_steps = 1
)
trainer = Trainer(
    model = peft_model,
    args = training_arguments,
    train_dataset = encoded_dataset['train']
)

In [26]:
trainer.train()
peft_path = './peft_trained'
trainer.model.save_pretrained(peft_path)
tokenizer.save_pretrained(peft_path)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
1,48.75
2,45.5
3,43.25
4,37.5
5,32.5
6,29.25
7,26.125
8,23.375
9,20.875
10,18.0


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


('./peft_trained/tokenizer_config.json',
 './peft_trained/special_tokens_map.json',
 './peft_trained/tokenizer.json')

## Evaluation model fine-tune with Lora

In [27]:
model_flan_base = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype = torch.bfloat16)
peft_trained_model = PeftModel.from_pretrained(
    model_flan_base,
    peft_path,
    torch_dtype = torch.bfloat16,
    is_trainable = False
)
peft_trained_model.to(device)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=32, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=32, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
            

In [28]:
# test
groundtruth_summary_test = dataset['test']['summary']
llm_summary_test_peft = []
for i in tqdm(range(len(dataset['test']))):
  dialogue = dataset['test'][i]['dialogue']
  llm_summary_test_peft.append(
      llm_reply(peft_trained_model, tokenizer, dialogue)
  )

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [22:57<00:00,  1.09it/s]


In [29]:
rouge = evaluate.load('rouge')
result_peft = rouge.compute(
    predictions = llm_summary_test_peft,
    references = groundtruth_summary_test,
    use_aggregator = True,
    use_stemmer = True
)
print(f"Rouge score for base model on test dataset: \n{result_peft}")

Rouge score for base model on test dataset: 
{'rouge1': 0.41650270331274275, 'rouge2': 0.167213049629285, 'rougeL': 0.33195377986693386, 'rougeLsum': 0.33204282919948336}
