In [1]:
import torch

# --- Notebook configuration: every tunable value lives in this cell. ---
datalab: bool = False # whether the notebook is running on Datalab / GCP with CUDA (True) or locally on a Mac with MPS (False)
resume_from_checkpoint: bool = False # resume from the last checkpoint. If True, set checkpoint_path to an existing checkpoint

do_lora:bool = False # whether to do LoRA fine-tuning (True) or just last-layer fine-tuning (False)
device: torch.device = torch.device("cuda") if datalab else torch.device("mps")
model_name: str = "meta-llama/Llama-3.2-1B-Instruct"
torch_dtype: torch.dtype = torch.bfloat16
max_new_tokens:int  = 50
output_dir: str = "../bucket/results_04_04_25" if datalab else "./results"
# NOTE(review): the Datalab value ends with "checkpoint-" (no step number) and the local value is an
# absolute, machine-specific path; both must be adjusted before setting resume_from_checkpoint to True.
checkpoint_path: str = "../bucket/results/checkpoint-" if datalab else "/Users/mgg/dev/projets/fine-tuning/cp/checkpoint-11750"
path_dataset: str = "./data/boosted_data.json"


In [2]:
# source : https://colab.research.google.com/drive/1DqKNPOzyMUXmJiJFvJITOahVDxCrA-wA#scrollTo=9Ixtdtpgyv_a

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the generative model in the configured dtype, together with its tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
# `torch_dtype` is a model-loading option and has no meaning for a tokenizer,
# so it is deliberately not forwarded here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as padding token; without a padding token, padded tokenization raises an error.
tokenizer.pad_token = tokenizer.eos_token

In [3]:
from transformers.pipelines import pipeline
# Reference pipeline: the model is given by name, so the pipeline loads its own copy
# that is not the `model` object fine-tuned later. It gives a view on the non-fine-tuned model.
raw_model_pipeline = pipeline(
    task="text-generation",
    model=model_name,
    tokenizer=tokenizer,
    max_new_tokens=max_new_tokens,
)

Device set to use mps:0


## 1 - Load the training dataset into a Hugging Face Dataset

In [4]:
import json
from datasets import Dataset

# Read the conversations from the JSON file. The encoding is explicit so that
# non-ASCII text is decoded the same way on every platform.
with open(path_dataset, "rt", encoding="utf-8") as f:
    boosted_data = json.load(f)

# Keep only the first `data_prop` fraction of the conversations.
data_prop = .05
boosted_data = boosted_data[:int(data_prop*len(boosted_data))]
print(f"Length of dataset : {len(boosted_data)}")

# Apply the chat template to every conversation and tokenize them as one batch,
# truncated to at most 256 tokens and padded to a common length.
tokenized_conversations = tokenizer.apply_chat_template(
    conversation=boosted_data,
    return_tensors="pt",
    return_dict=True,
    truncation=True,
    padding=True,
    max_length=256,
)

# Labels are a separate tensor (not an alias of input_ids) in which padding
# positions are set to -100, the index ignored by the cross-entropy loss.
# The attention mask is used rather than the token id because the padding token
# is the EOS token: real end-of-turn tokens must stay in the labels.
tokenized_conversations["labels"] = tokenized_conversations["input_ids"].masked_fill(
    tokenized_conversations["attention_mask"] == 0, -100
)

train_dataset = Dataset.from_dict(tokenized_conversations)

Length of dataset : 117


In [5]:
# Quick look at the dataset: its summary, followed by one fully tokenized example.
sample_row = train_dataset[91]
train_dataset, sample_row

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 117
 }),
 {'input_ids': [128000,
   128006,
   9125,
   128007,
   271,
   38766,
   1303,
   33025,
   2696,
   25,
   6790,
   220,
   2366,
   18,
   198,
   15724,
   2696,
   25,
   220,
   1419,
   5186,
   220,
   2366,
   20,
   271,
   128009,
   128006,
   882,
   128007,
   271,
   6854,
   499,
   7124,
   9995,
   12,
   5028,
   1149,
   220,
   17,
   30,
   128009,
   128006,
   78191,
   128007,
   271,
   2181,
   374,
   279,
   9995,
   12,
   5028,
   1149,
   220,
   17,
   5907,
   13,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128

## 2 - Training

In [6]:
import random
from transformers.pipelines.base import Pipeline
from transformers.trainer_callback import TrainerCallback


# Index of the conversation used for the qualitative checks, drawn once per kernel session.
DATA_SAMPLE: int = random.randrange(len(boosted_data))

def test_model_on_one_question(raw_model: Pipeline, ft_model: Pipeline, boosted_data: list[dict], question_idx: int = DATA_SAMPLE) -> str:
    """
    Queries both pipelines with one question of the dataset and formats the answers.
    :param raw_model: hugging face pipeline wrapping the source (not fine tuned) model
    :param ft_model: hugging face pipeline wrapping the fine tuned model
    :param boosted_data: the conversations in chat template format; in each conversation,
     message 0 is read as the question and message 1 as the expected answer
    :param question_idx: index of the conversation to use, optional, defaults to the random constant.
    :return: a str with the question, 1 answer from the raw model, 2 answers from the
     fine tuned model and the expected answer.
    """
    question = boosted_data[question_idx][0]["content"]
    prompt = [dict(role="user", content=question)]

    def _assistant_reply(chat_pipeline: Pipeline) -> str:
        # The pipeline returns the whole chat; entry 1 is the message generated after the single user turn.
        generated = chat_pipeline(prompt)
        return generated[0]["generated_text"][1]["content"]

    # The fine tuned model is queried twice to get two samples of its answer.
    answer_no_fine_tuning = _assistant_reply(raw_model)
    answer_fine_tuning = _assistant_reply(ft_model)
    answer_fine_tuning_2 = _assistant_reply(ft_model)
    ground_truth = boosted_data[question_idx][1]["content"]

    report = f"""
        question: {question}\n
        answer_no_fine_tuning : {answer_no_fine_tuning}\n
        answer_fine_tuning : {answer_fine_tuning}\n
        answer_fine_tuning_2 : {answer_fine_tuning_2}\n
        ground_truth : {ground_truth}
    """
    return report

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    :param model: any object exposing ``named_parameters()`` whose parameters provide
     ``numel()`` and ``requires_grad``
    :return: None, the summary is printed on stdout. A model without any parameter
     is reported with a trainable percentage of 0.00 instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # guard the ratio: an empty model has nothing to divide by
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )


class CustomCallback(TrainerCallback):
    """
    Callback that asks the model for an answer during the training.

    At the end of every epoch it prints the comparison produced by
    ``test_model_on_one_question`` for the raw and the fine tuned pipelines.
    The conversations are read from the notebook-level ``boosted_data`` variable.
    """
    def __init__(self, raw_model_pipeline, ft_model_pipeline) -> None:
        """
        :param raw_model_pipeline: pipeline wrapping the source model, not fine tuned
        :param ft_model_pipeline: pipeline wrapping the model being trained
        """
        super().__init__()
        self.raw_model_pipeline = raw_model_pipeline
        self.ft_model_pipeline = ft_model_pipeline

    def on_epoch_end(self, args, state, control, **kwargs):
        """
        Prints sample answers at the end of an epoch.

        The model being trained is still in training mode at this point, so layers
        such as dropout would be active during generation. It is switched to eval
        mode for the generation and put back in training mode afterwards.
        """
        ft_module = getattr(self.ft_model_pipeline, "model", None)
        was_training = bool(getattr(ft_module, "training", False))
        if was_training:
            ft_module.eval()
        try:
            print(test_model_on_one_question(raw_model=self.raw_model_pipeline, ft_model=self.ft_model_pipeline, boosted_data=boosted_data))
        finally:
            # restore the previous mode even if the generation failed
            if was_training:
                ft_module.train()

In [7]:
def last_layers_fine_tuning(model):
    """
    Freezes every parameter except those of the last decoder block.

    Trick to speed up training: only the last block stays trainable. The block index
    is read from the parameter names (``...layers.<index>....``) instead of being
    hard-coded, so the function works whatever the number of blocks, and a parameter
    whose name merely contains the digits of that index elsewhere is not kept trainable.
    :param model: model exposing ``named_parameters()``
    :return: the same model object, modified in place
    :raises ValueError: if no parameter name contains a ``layers.<index>.`` segment
    """
    import re  # local import: keeps this cell runnable on its own

    layer_pattern = re.compile(r"(?:^|\.)layers\.(\d+)\.")

    def _layer_index(param_name):
        # index of the decoder block owning the parameter, None for embeddings, final norm, ...
        match = layer_pattern.search(param_name)
        return int(match.group(1)) if match else None

    named_params = list(model.named_parameters())
    indices = [idx for idx in (_layer_index(name) for name, _ in named_params) if idx is not None]
    if not indices:
        raise ValueError("no parameter name matches 'layers.<index>.': cannot locate the last layer")
    last_layer = max(indices)

    # only freezes; parameters of the last block keep their current requires_grad value
    for name, param in named_params:
        if _layer_index(name) != last_layer:
            param.requires_grad = False

    print_trainable_parameters(model)
    return model

In [8]:
from peft import LoraConfig, get_peft_model

def get_peft_config():
    """
    Builds the LoRA configuration used for the fine tuning.

    The adapters target the four attention projections. ``task_type`` is set because
    the adapted model is a causal language model. No ``modules_to_save`` entry is
    given: the previous value ("decode_head") does not name any module of the
    Llama model loaded in this notebook.
    :return: a ``LoraConfig`` instance
    """
    return LoraConfig(
        task_type="CAUSAL_LM",
        r=10,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.1,
        bias="lora_only",
    )

def lora_fine_tuning(model):
    """
    Wraps the model with LoRA adapters built from ``get_peft_config()``.
    :param model: the model to adapt
    :return: the PEFT model returned by ``get_peft_model``
    """
    return get_peft_model(model, get_peft_config())

In [9]:
from trl import SFTConfig, SFTTrainer

# Prepare the model according to the chosen strategy: LoRA adapters or last layer only.
to_train_model = lora_fine_tuning(model) if do_lora else last_layers_fine_tuning(model)

# Initialize trainer
training_args = SFTConfig(
    output_dir=output_dir,  # configured in the first cell instead of a hard-coded path
    # max_steps=100,
    num_train_epochs=4,
    per_device_train_batch_size=4,
    learning_rate=9.2e-4 if do_lora else 3e-5,
    logging_steps=20,
    # save_steps=100,
    # eval_strategy="steps",
    # eval_steps=50,
)

# No `peft_config` here: in LoRA mode the model was already wrapped by
# lora_fine_tuning(), so the adapters must not be configured a second time.
trainer = SFTTrainer(
    model=to_train_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

# Pipeline built on the trainer's model, so its answers follow the training progress.
ft_model_pipeline = pipeline("text-generation", model=trainer.model, tokenizer=tokenizer, max_new_tokens=max_new_tokens)

trainer.add_callback(CustomCallback(raw_model_pipeline=raw_model_pipeline, ft_model_pipeline=ft_model_pipeline))

trainable params: 60821504 || all params: 1235814400 || trainable%: 4.92


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Truncating train dataset:   0%|          | 0/117 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/117 [00:00<?, ? examples/s]

Device set to use mps:0


In [10]:
# Resume from `checkpoint_path` only when requested in the configuration cell;
# otherwise (None) the training starts from the current model weights.
trainer.train(resume_from_checkpoint=checkpoint_path if resume_from_checkpoint else None)

Step,Training Loss
20,2.0274
40,1.0381
60,0.7844
80,0.6815
100,0.6161
120,0.6349



        question: How would you describe B2SAFE?

        answer_no_fine_tuning : B2SAFE is a medication used to treat bacterial infections, specifically those caused by the bacteria Neisseria gonorrhoeae and Chlamydia trachomatis. It is an injectable antibiotic that works by killing the bacteria and inhibiting

        answer_fine_tuning : B2SAFE is a global, evidence-based, and highly effective, peer-reviewed, and research-based, evidence-based, and evidence-based, safe and highly effective, safe and highly effective, safe and highly effective, safe and highly effective, safe and

        answer_fine_tuning_2 : B2SAFE is a medication used to treat a rare genetic disorder called X-linked adrenoleukodystrophy (ALD).

        ground_truth : I would describe B2SAFE as a robust, safe and highly available service which allows community and departmental repositories to implement data management policies on their research data across multiple administrative domains in a trustworthy manner.


TrainOutput(global_step=120, training_loss=0.963720957438151, metrics={'train_runtime': 65.0185, 'train_samples_per_second': 7.198, 'train_steps_per_second': 1.846, 'total_flos': 289654993944576.0, 'train_loss': 0.963720957438151})

## 3 - Test the model

In [11]:
model.eval() # eval mode: switches layers such as dropout to inference behaviour (it does not by itself disable gradient tracking)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [12]:
def q_a(question):
    """
    Asks one question to the fine tuned pipeline.
    :param question: the text of the user message
    :return: the content of the message generated in answer
    """
    messages = [dict(role="user", content=question)]
    generated = ft_model_pipeline(messages)
    # the returned chat holds the user message at index 0 and the generated answer at index 1
    return generated[0]["generated_text"][1]["content"]

In [13]:
# Ask the same question ten times to see how much the generated answers vary.
for attempt in range(10):
    answer = q_a("What is Juropa ?")
    print(answer)
    print("--------\n")

Jülich is a research and development center located in Cologne, Germany.
--------

Jülich Research on Petaflop Architectures (Jüropa) was a collaborative project on high-performance computing (HPC) research and development, specifically focused on developing petaflop-class computing architectures.
--------

Jülich is a research and development facility for the European Union, but it is more commonly known as Jülich Research Centre.
--------

Jülich Research on Petaflop Architectures is a research project that focuses on developing supercomputing architectures and applications.
--------

Jülich Research on Petaflop Architectures, a project to develop a new supercomputing architecture called Jülich Research on Petaflop Architectures.
--------

Jülich Research on Petaflop Architectures (Jüropa) is a research project focused on developing a new generation of supercomputers and computing architectures.
--------

Jülich is a research and development center in Germany, and Jülich Research, wh

In [14]:
# Probe with the same term placed in an unrelated field (cooking) to see what the fine tuned model answers.
q_a("What is Juropa in the field of cooking ?")

'In the field of cooking, Juropa is a type of pan or skillet.'