In [1]:
# config according to where the notebook is running (mac // datalab gcp)

datalab = False
output_dir = f"../bucket/results_04_04_25" if datalab else "./results"
resume_from_checkpoint = False
checkpoint_path = "../bucket/results/checkpoint-" if datalab else "/Users/mgg/dev/projets/fine-tuning/cp/checkpoint-11750"
path_dataset = "./data/boosted_data_test.json"

In [2]:
# source : https://colab.research.google.com/drive/1DqKNPOzyMUXmJiJFvJITOahVDxCrA-wA#scrollTo=9Ixtdtpgyv_a

from text_gen_model import InstructionTextGenerationPipeline, AcronymDataset
from transformers import TrainingArguments, Trainer
import torch

device = torch.device("cuda") if datalab else torch.device('mps')
# model_name: str = "mosaicml/mpt-1b-redpajama-200b-dolly"
model_name: str = "meta-llama/Llama-3.2-1B-Instruct"
torch_dtype = torch.bfloat16

text_gen_pipeline = InstructionTextGenerationPipeline(model_name=model_name, torch_dtype=torch_dtype, device=device) # custom pipeline - could have subclassed hf pipeline

## 1 - Loads Datasets

In [3]:
import json
from random import shuffle

with open(path_dataset, "rt") as f:
    boosted_data = json.load(f)

dataset = text_gen_pipeline.tokenizer.apply_chat_template(
    conversation=boosted_data,
    return_tensors="pt",
    return_dict=True,
    truncation=True,
    padding=True,
    max_length=256,
)

train_dataset, test_dataset = AcronymDataset(dataset), AcronymDataset(dataset) # we DO want overfitting here -> acronym memorization

In [4]:
dataset

{'input_ids': tensor([[128000, 128006,   9125,  ..., 128009, 128009, 128009],
        [128000, 128006,   9125,  ..., 128009, 128009, 128009],
        [128000, 128006,   9125,  ..., 128009, 128009, 128009],
        ...,
        [128000, 128006,   9125,  ..., 128009, 128009, 128009],
        [128000, 128006,   9125,  ..., 128009, 128009, 128009],
        [128000, 128006,   9125,  ..., 128009, 128009, 128009]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

## 2 - Training

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [6]:
print_trainable_parameters(text_gen_pipeline.model)

trainable params: 1235814400 || all params: 1235814400 || trainable%: 100.00


In [7]:
# see summary of model
text_gen_pipeline.model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [8]:
# wraps model as PeftModel for LoRA training
# https://huggingface.co/docs/peft/main/task_guides/semantic_segmentation_lora

from peft import LoraConfig, get_peft_model


config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="lora_only",
    modules_to_save=["decode_head"],
)
lora_model = get_peft_model(text_gen_pipeline.model, config)

print_trainable_parameters(lora_model) # see difference in number of parameters

trainable params: 6815744 || all params: 1242630144 || trainable%: 0.55


In [9]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=64,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=1000,
    resume_from_checkpoint=None if not resume_from_checkpoint else checkpoint_path,
    save_steps=5000, # saves a checkpoint every .. steps
)

In [10]:
# Create the Trainer and train
trainer = Trainer(
    model=lora_model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=test_dataset             
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [11]:
len(train_dataset)

33

In [12]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=165, training_loss=2.514612741181345, metrics={'train_runtime': 32.8257, 'train_samples_per_second': 5.027, 'train_steps_per_second': 5.027, 'total_flos': 84404114657280.0, 'train_loss': 2.514612741181345, 'epoch': 5.0})

## Test the model

In [13]:
text_gen_pipeline.model.eval() # arrête l'entraînement ?

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=32, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=32, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=512, bias=False)
            (lora_dropout): ModuleDict(
              (default):

In [14]:
from transformers import pipeline
chat = pipeline("text-generation", model=text_gen_pipeline.model, tokenizer=text_gen_pipeline.tokenizer, max_new_tokens=500)
def q_a(question):
    return chat([{
        "role": "user",
        "content": question
    }])[0]["generated_text"][1]["content"]

Device set to use mps:0


In [15]:
q_a("Rephrase, boost and develop the following sentence by explaining everything : 'NumPEx is a French project in the field of computer science; whose Exa-AToW is one of the sub-project.'")

'NumPEx is a French initiative in the field of computer science, specifically focused on developing scalable and efficient architectures for the exascale era. The Exa-AToW sub-project, which stands for "Exascale Architectures for Very Large Systems," represents a critical component of this endeavor.'

In [16]:
for i in range(10):
    print(q_a("What is NumPEx ?")) 
    print("--------\n")

NumPEx stands for Numérique pour l'Exascale, which is an initiative aimed at promoting the development of exascale computing, a new scale of computing that can handle complex calculations at speeds and scales that are currently beyond the reach of today's fastest supercomputers.
--------

NumPEx stands for Numérique Pour l'Exascale, which is a term used to describe the use of digital technologies for exascale computing.
--------

NumPEx stands for Numérique pour l'Exascale, it is a term used to describe the application of digital technologies to large-scale computing systems, particularly in the context of exascale computing.
--------

NumPEx is Numérique Pour l'Exascale, a French term that refers to the field of exascale computing.
--------

NumPEx stands for Numérique pour l'Exascale, a field of research and development that focuses on the development of technologies and methods for the next generation of computing systems, specifically those that will be used in the exascale era.
--

In [17]:
q_a("Explain to me the meaning of Exa-AToW")

'Exa-AToW is an acronym that stands for Exascale Architecture for Transformation.'