In [25]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    set_seed
)

from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import os
import pandas as pd
import numpy as np
from huggingface_hub import login
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


hf_token = os.environ.get('HF_TOKEN')
login(token = hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/nata-brain/.cache/huggingface/token
Login successful


## Loading datasets

In [2]:
dialog_dataset_name = "neil-code/dialogsum-test"
dataset = load_dataset(dialog_dataset_name)

In [3]:
dataset['train'][0]

{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

In [4]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [5]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = device_map,
    quantization_config = bnb_config,
    trust_remote_code = True,
    use_auth_token = True
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code = True,
    padding_side = "left",
    add_eos_taken = True,
    add_bos_token = True,
    use_fast = False
)

tokenizer.pad_token = tokenizer.eos_token

In [7]:
eval_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code = True,
    add_bos_token = True, 
    use_fast = False
)

eval_tokenizer.pad_token = tokenizer.eos_token

In [8]:
def gen(model, p, max_len = 100, sample = True):
    toks = eval_tokenizer(p, return_tensors = "pt")
    res = model.generate(
        **toks.to("cuda"), 
        max_new_tokens = max_len, 
        do_sample = sample, 
        num_return_sequences = 1, 
        temperature = 0.1, 
        num_beams = 1, 
        top_p = 0.95
    ).to('cpu')
    
    return eval_tokenizer.batch_decode(res, skip_special_tokens = True)

In [9]:
def formatted_prompt(question) -> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [10]:
seed = 42
set_seed(seed)

index = 10

prompt = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt_request = f"Instruct: Summarize the following conversation.\n{prompt}\n"
prompt = formatted_prompt(prompt_request)
response = gen(original_model, prompt, 100,)

output = response[0].split('Output:\n')[0]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f"INPUT PROMPT:\n{prompt}")
print(dash_line)
print(f"BASELINE HUMAN SUMMARY:\n{summary}")
print(dash_line)
print(f"MODEL GENERATION - ZERO SHOT:\n{output}")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<|im_start|>user
Instruct: Summarize the following conversation.
#Person1#: Happy Birthday, this is for you, Brian.
#Person2#: I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time.
#Person1#: Brian, may I have a pleasure to have a dance with you?
#Person2#: Ok.
#Person1#: This is really wonderful party.
#Person2#: Yes, you are always popular with everyone. and you look very pretty today.
#Person1#: Thanks, that's very kind of you to say. I hope my necklace goes with my dress, and they both make me look good I feel.
#Person2#: You look great, you are absolutely glowing.
#Person1#: Thanks, this is a fine party. We should have a drink together to celebrate your birthday
<|im_end|>
<|im_start|>assistant:
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMM

# Train Model

In [13]:
def create_prompt_formats(sample):
    """
    Format various fields of the sample ('instruction','output')
    Then concatenate them using two newline characters 
    :param sample: Sample dictionnary
    """
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruct: Summarize the below conversation."
    RESPONSE_KEY = "### Output:"
    END_KEY = "### End"
    
    blurb = f"\n{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f"{sample['dialogue']}" if sample["dialogue"] else None
    response = f"{RESPONSE_KEY}\n{sample['summary']}"
    end = f"{END_KEY}"
    
    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)
    sample["text"] = formatted_prompt

    return sample

In [11]:
def get_max_length(model):
    conf = model.config
    max_length = None
    
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Usage default max length: {max_length}")
    
    return max_length

def preprocess_batch(batch, tokenizer, max_length):
    return tokenizer(
        batch['text'],
        max_length = max_length,
        truncation = True
    )

In [20]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)
    
    _preprocessing_function = partial(
        preprocess_batch, 
        max_length = max_length, 
        tokenizer = tokenizer
    )
    
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ['id', 'topic', 'dialogue', 'summary']
    )
    
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)
    
    dataset = dataset.shuffle(seed = seed)
    
    return dataset

In [28]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 18874368
all model parameters: 4559474688
percentage of trainable model parameters: 0.41%


In [21]:
max_length = get_max_length(original_model)

train_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset['validation'])

Found max length: 131072
Preprocessing dataset...


Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1999 [00:00<?, ? examples/s]

Preprocessing dataset...


Map:   0%|          | 0/499 [00:00<?, ? examples/s]

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

Filter:   0%|          | 0/499 [00:00<?, ? examples/s]

In [22]:
print(f"Shapes of the datasets:")
print(f"Training: {train_dataset.shape}")
print(f"Validation: {eval_dataset.shape}")
print(train_dataset)

Shapes of the datasets:
Training: (1999, 3)
Validation: (499, 3)
Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 1999
})


In [23]:
print(original_model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [26]:
config = LoraConfig(
    r = 32,
    lora_alpha = 32,
    target_modules = [
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    
    bias = "none",
    lora_dropout = 0.05,
    task_type = "CAUSAL_LM"
)

# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()

# 2 - Using the prepare_model_for_kbit_training method from PEFT
original_model = prepare_model_for_kbit_training(original_model)

peft_model = get_peft_model(original_model, config)


In [31]:
print(print_number_of_trainable_model_parameters(peft_model))
# See how the model looks different now, with the LoRA adapters added:
print(peft_model)

trainable model parameters: 18874368
all model parameters: 4559474688
percentage of trainable model parameters: 0.41%
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embed

## Train PEFT Adapter

In [35]:
output_dir = "../models/peft-dialogue-summary-training/final-checkpoint"
import transformers

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps = 1,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    max_steps=1000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model = peft_model,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    args = prft_training_args,
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

max_steps is given, it will override any value given in num_train_epochs


In [36]:
peft_training_args.device

device(type='cuda', index=0)

In [37]:
peft_trainer.train()


  0%|          | 0/1000 [00:00<?, ?it/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.7045, 'grad_norm': 0.6754987835884094, 'learning_rate': 0.0001951951951951952, 'epoch': 0.05}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.5008522272109985, 'eval_runtime': 84.4229, 'eval_samples_per_second': 5.911, 'eval_steps_per_second': 0.746, 'epoch': 0.05}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.2611, 'grad_norm': 0.9706270694732666, 'learning_rate': 0.0001901901901901902, 'epoch': 0.1}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.5055707693099976, 'eval_runtime': 84.2097, 'eval_samples_per_second': 5.926, 'eval_steps_per_second': 0.748, 'epoch': 0.1}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.484, 'grad_norm': 0.408590167760849, 'learning_rate': 0.0001851851851851852, 'epoch': 0.15}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.4626483917236328, 'eval_runtime': 86.3337, 'eval_samples_per_second': 5.78, 'eval_steps_per_second': 0.73, 'epoch': 0.15}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.2367, 'grad_norm': 0.5957475900650024, 'learning_rate': 0.00018018018018018018, 'epoch': 0.2}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.4779329299926758, 'eval_runtime': 84.1587, 'eval_samples_per_second': 5.929, 'eval_steps_per_second': 0.749, 'epoch': 0.2}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
