In [None]:
import torch

# to change when on datalab
datalab: bool = False # wheter the notebook is running on datalab gcp (True) or locally on mac with mps (False)
resume_from_checkpoint: bool = False # resume from last cp. If True; set checkpoint path to an existing checkpoint
data_prop = .05
do_lora:bool = True # whether to do lora fine tuning or juste last layer fine tuning
date = "24_04_25"


device: torch.device = torch.device("cuda") if datalab else torch.device("mps")
model_name: str = "meta-llama/Llama-3.2-1B-Instruct"
torch_dtype: torch.dtype = torch.bfloat16
max_new_tokens:int  = 50
output_dir: str = f"../bucket/results_{date}" if datalab else "./results"
checkpoint_path: str = "../bucket/results/checkpoint-" if datalab else "/Users/mgg/dev/projets/fine-tuning/cp/checkpoint-11750"
path_dataset: str = "./data/boosted_data.json"

In [2]:
# source : https://colab.research.google.com/drive/1DqKNPOzyMUXmJiJFvJITOahVDxCrA-wA#scrollTo=9Ixtdtpgyv_a

from transformers import AutoModelForCausalLM, AutoTokenizer

# loads generative model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype=torch_dtype)
tokenizer.pad_token = tokenizer.eos_token # add a padding token, otherwise it raises an error

In [3]:
from transformers.pipelines import pipeline
# loads pipeline to keep a view on not fine tuned model

raw_model_pipeline = pipeline("text-generation", model=model_name, tokenizer=tokenizer, max_new_tokens=max_new_tokens)

Device set to use mps:0


## 1 - Loads the training dataset in a hugging face Dataset

In [None]:
import json
from datasets import Dataset
import random

with open(path_dataset, "rt") as f:
    boosted_data = json.load(f)

boosted_data = boosted_data[:int(data_prop*len(boosted_data))]
print(f"Length of dataset : {len(boosted_data)}")

tokenized_conversations = tokenizer.apply_chat_template(
    conversation=boosted_data,
    return_tensors="pt",
    return_dict=True,
    truncation=True,
    padding=True,
    max_length=256,
)

tokenized_conversations["labels"] = tokenized_conversations["input_ids"]

conv_idx_for_test: int = random.randint(0, len(boosted_data)-1) # take one conversation for test
test_conv = boosted_data[conv_idx_for_test]

train_dataset = Dataset.from_dict(tokenized_conversations)

print(f"Example of conversation : {test_conv}")

Length of dataset : 117
Example of conversation : [{'role': 'user', 'content': 'Is B2FIND related to EUDAT?'}, {'role': 'assistant', 'content': 'Yes, B2FIND is a simple, user-friendly metadata catalogue of research data collections stored in EUDAT data centres and other repositories.'}]


In [None]:
# view on dataset

train_dataset

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 117
 }),
 {'input_ids': [128000,
   128006,
   9125,
   128007,
   271,
   38766,
   1303,
   33025,
   2696,
   25,
   6790,
   220,
   2366,
   18,
   198,
   15724,
   2696,
   25,
   220,
   1419,
   5186,
   220,
   2366,
   20,
   271,
   128009,
   128006,
   882,
   128007,
   271,
   6854,
   499,
   7124,
   9995,
   12,
   5028,
   1149,
   220,
   17,
   30,
   128009,
   128006,
   78191,
   128007,
   271,
   2181,
   374,
   279,
   9995,
   12,
   5028,
   1149,
   220,
   17,
   5907,
   13,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128

## 2 - Training

In [6]:
from training_tools import print_trainable_parameters, CustomCallback

def last_layers_fine_tuning(model):

    # trick to speed up training : freeze all layers except the last one
    for name, param in model.named_parameters():
        # print(f"{name}   Modelsize: {param.numel()/1000**2:.1f}M parameters")
        if "15" not in name:
            param.requires_grad = False
    return model

In [7]:
from peft import LoraConfig, get_peft_model

def get_peft_config():
    return LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,
        bias="lora_only",
        modules_to_save=["decode_head"]
    )

def lora_fine_tuning(model):
    config = get_peft_config()
    lora_model = get_peft_model(model, config)
    return lora_model

In [None]:
from trl import SFTConfig, SFTTrainer

to_train_model = lora_fine_tuning(model) if do_lora else last_layers_fine_tuning(model)
print_trainable_parameters(to_train_model)



# Initialize trainer
training_args = SFTConfig(
    output_dir=output_dir,
    # max_steps=100,
    num_train_epochs=4,
    per_device_train_batch_size=4,
    learning_rate=8e-5 if do_lora else 3e-5,
    logging_steps=20,
    # warmup_ratio=.0 if do_lora else .0,
    # save_steps=100,
    # eval_strategy="steps",
    # eval_steps=50,
)

trainer = SFTTrainer(
    model=to_train_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    peft_config=get_peft_config() if do_lora else None,
)

ft_model_pipeline = pipeline("text-generation", model=trainer.model, tokenizer=tokenizer, max_new_tokens=max_new_tokens)

trainer.add_callback(CustomCallback(raw_model_pipeline=raw_model_pipeline, ft_model_pipeline=ft_model_pipeline, test_conv=test_conv))

trainable params: 11272192 || all params: 1247086592 || trainable%: 0.90


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Truncating train dataset:   0%|          | 0/117 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/117 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Device set to use mps:0
The model 'PeftModel' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Ge

In [9]:
trainer.train()

Step,Training Loss
20,2.2631
40,1.1911
60,0.902
80,0.717
100,0.5503
120,0.5123



        question: Is B2FIND related to EUDAT?

        answer_no_fine_tuning : B2FIND (Bielefeld-Frankfurt Information Network) and EUDAT (European Data Archive Network) are both related to data science and data management, but they serve different purposes and have distinct characteristics.

EUDAT is a European

        answer_fine_tuning : B2FIND and EUDAT are related, but they serve different purposes. B2FIND is an open-source data integration framework, while EUDAT is a comprehensive data integration platform.

        answer_fine_tuning_2 : Yes, B2FIND is related to EUDAT.

        ground_truth : Yes, B2FIND is a simple, user-friendly metadata catalogue of research data collections stored in EUDAT data centres and other repositories.
    

        question: Is B2FIND related to EUDAT?

        answer_no_fine_tuning : Yes, B2FIND and EUDAT (European Data Infrastructure for Social Science Research) are related. 

B2FIND is a European Commission-funded project that aims to create a 

TrainOutput(global_step=120, training_loss=1.0226421117782594, metrics={'train_runtime': 123.6159, 'train_samples_per_second': 3.786, 'train_steps_per_second': 0.971, 'total_flos': 293010139348992.0, 'train_loss': 1.0226421117782594})

## Test the model

In [10]:
model.eval() # eval mode : stops useless gradient computations

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=512, bias=False)
            (lora_dropout): ModuleDict(
              (default):

In [20]:
def q_a(question, max_tokens: int = max_new_tokens):
    return ft_model_pipeline([{
        "role": "user",
        "content": question
    }], max_new_tokens=max_tokens)[0]["generated_text"][1]["content"]

In [17]:
for i in range(10):
    print(q_a("What is Juropa ?")) 
    print("--------\n")

Jülich Research on Petaflop Architectures, or Juropa, is a research project focused on developing a new petaflop-class supercomputing architecture that is scalable, reliable, and easy to use.
--------

Jülich Research on Petaflop Architectures and Applications, also known as Juropa, is a research project on supercomputing architecture and applications, which is a joint project of the Forschungszentrum Jülich and other research institutions.
--------

Jülich Research on Petaflop Architectures is a research project to develop a new generation of supercomputers with exascale performance.
--------

Jülich Research on Petaflop Architectures is a research project focused on developing a new generation of high-performance computing architectures.
--------

Jülich Research on Petaflop Architectures is a collaboration between the Max Planck Institute for Computer Science and the Forschungszentrum Jülich.
--------

Jülich Research on Petaflop Architectures is a research project focused on develo

In [18]:
q_a("What is Juropa in the field of cooking ?") # small check for overfitting

'In the field of cooking, Juropa is a high-performance research computing cluster for scientific simulations.'

In [None]:
q_a("What is Juropa ? In which country does it takes place ?", max_tokens=200) # test with new questions

'Jülich Research on Petaflop Architectures is a research project in Jülich, Germany.'