### Infrastructure config

In [None]:
import torch
from typing import Literal
from datetime import datetime
import os

which_infra:Literal["onyxia", "datalab_gcp", "local"] = os.environ["WHICH_INFRA"] if "WHICH_INFRA" in os.environ else "local"


device: torch.device = torch.device("cpu") # default device to cpu
date = datetime.now().strftime("%m_%d_%Y-%Hh_%Mmin")

match which_infra:
    case "local":
        device = torch.device("mps")
        output_dir = f"../bucket/models/results_{date}"
        train_dataset_dir = "../bucket/data/train_dataset.json"
    case "datalab_gcp":
        device = torch.device("cuda")
        output_dir = f"../../bucket/models/results_{date}"
        train_dataset_dir = "../../bucket/data/train_dataset.json"
    case "onyxia":
        device = torch.device("cuda")
        output_dir = f"../../bucket/models/results_{date}" # todo: look how to access onyxia s3 buckets
        train_dataset_dir = "../data/train_dataset.json"
    case _:
        raise ValueError(f"Unexpected value for environment variable WHICH_INFRA. Accepted values are : 'onyxia', 'datalab_gcp' and 'local'.")

train_dataset_dir = "../example_data/train_dataset.json"

print(f"""
    Date : {date},
    Running on : {which_infra},
    Device : {device},
    Loading data from : {train_dataset_dir},
    Saving models to : {output_dir}
""")


### Training config

In [None]:
import torch
import os

# [OPTIONAL] to start training from an old checkpoint
checkpoint_path: str | None = None
if (checkpoint_path is not None and not os.path.isdir(checkpoint_path)) :
    raise ValueError(f"To start from a checkpoint, please set a valid path to checkpoint_path variable.")

# Training parameters

model_name: str = "meta-llama/Llama-3.1-8B-Instruct" # ⚠️ requires hugging face auth
# model_name: str = "microsoft/Phi-3-mini-4k-instruct"

do_lora:bool = True # whether to do lora fine tuning or juste last layer fine tuning
torch_dtype: torch.dtype = torch.bfloat16
max_new_tokens:int  = 100 # max token when model is used for text generation through hugging face pipeline
data_prop = 1 # proportion of data to be used for training

print(f"""
    Pre-trained model : {model_name},
    Dtype of model weights : {torch_dtype},
    Is loading from checkpoint : {checkpoint_path if checkpoint_path is not None else "No"},
    Fine tune method : {'LoRA' if do_lora else {'Last Layer fine tuning'}},
""")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# loads generative model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
# torch_dtype only concerns model weights: it is not a tokenizer option, so it is not forwarded here
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token # add a padding token, otherwise it raises an error

In [None]:
from transformers.pipelines import pipeline

# loads pipeline to keep a view on not fine tuned model

# raw_model_pipeline = pipeline("text-generation", model=model_name, tokenizer=tokenizer, max_new_tokens=max_new_tokens)

## 1 - Load the training dataset into a Hugging Face Dataset

In [None]:
import json
from datasets import Dataset
import random

# Read the raw records from disk (explicit encoding so the result does not depend on the platform default)
with open(train_dataset_dir, "rt", encoding="utf-8") as f:
    raw_records = json.load(f)

# Keep only the first `data_prop` fraction of the records
raw_records = raw_records[:int(data_prop * len(raw_records))]
print(f"Number of acronyms : {len(raw_records)}")

if not raw_records:
    raise ValueError(
        f"No training record left after loading {train_dataset_dir} with data_prop={data_prop}."
    )

# Flatten: every record holds a list of conversations under the "conversation" key
all_convs = [each_conv for each_acro in raw_records for each_conv in each_acro["conversation"]]

tokenized_conversations = tokenizer.apply_chat_template(
    conversation=all_convs,
    return_tensors="pt",
    return_dict=True,
    truncation=True,
    padding=True,
    max_length=256,
)

# Labels are an independent copy of the input ids (not an alias of the same tensor).
# Padding positions are set to -100, the index ignored by the loss, so padding is not learned.
# The attention mask is used rather than the pad token id because the pad token is the EOS token.
labels = tokenized_conversations["input_ids"].clone()
labels[tokenized_conversations["attention_mask"] == 0] = -100
tokenized_conversations["labels"] = labels

conv_idx_for_test: int = random.randint(0, len(raw_records) - 1) # take one record (with its conversations) for test
test_conv = raw_records[conv_idx_for_test]


train_dataset = Dataset.from_dict(tokenized_conversations)

print(f"Example of conversation : {test_conv}")

In [None]:
# view on dataset: the bare expression below is the cell output, so the notebook displays the Dataset

train_dataset

## 2 - Training

In [None]:
from training_tools import print_trainable_parameters, CustomCallback

def last_layers_fine_tuning(model):
    """Freeze every parameter except those of the last indexed layer.

    Trick to speed up training: only the parameters whose name contains
    ``layers.<N>.`` with the highest ``N`` found in the model stay trainable.
    The index is discovered from the parameter names instead of being
    hard-coded, so the function works whatever the depth of the model.

    Args:
        model: a module exposing ``named_parameters()`` (e.g. a torch module).

    Returns:
        The same model object, modified in place.

    Raises:
        ValueError: if no parameter name contains a ``layers.<N>.`` component.
    """
    import re  # local import so this cell does not depend on another cell's imports

    layer_pattern = re.compile(r"(?:^|\.)layers\.(\d+)\.")

    def layer_index(param_name):
        # Index of the layer a parameter belongs to, or None when it is outside the layer stack
        found = layer_pattern.search(param_name)
        return int(found.group(1)) if found else None

    named_params = list(model.named_parameters())
    indices = [idx for idx in (layer_index(name) for name, _ in named_params) if idx is not None]
    if not indices:
        raise ValueError("Could not find any 'layers.<N>.' parameter to keep trainable in this model.")
    last_layer = max(indices)

    for name, param in named_params:
        # print(f"{name}   Modelsize: {param.numel()/1000**2:.1f}M parameters")
        if layer_index(name) != last_layer:
            param.requires_grad = False
    return model

In [None]:
from peft import LoraConfig, get_peft_model

def get_peft_config():
    """Build the LoRA configuration used for fine tuning.

    Returns:
        A ``LoraConfig`` with rank 16, alpha 16, dropout 0.1, applied to the
        projection modules listed below.
    """
    # Names of the modules that receive a LoRA adapter
    lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

    # NOTE(review): "decode_head" does not look like a module of the causal LM loaded in this
    # notebook; confirm whether modules_to_save is intended here.
    lora_settings = {
        "r": 16,
        "lora_alpha": 16,
        "target_modules": lora_targets,
        "lora_dropout": 0.1,
        "bias": "lora_only",
        "modules_to_save": ["decode_head"],
    }
    return LoraConfig(**lora_settings)

def lora_fine_tuning(model):
    """Wrap ``model`` with the LoRA adapters described by ``get_peft_config()``.

    Args:
        model: the model to adapt.

    Returns:
        The object returned by ``get_peft_model`` for this model and config.
    """
    return get_peft_model(model, get_peft_config())

In [None]:
from trl import SFTConfig, SFTTrainer

# Prepare the model for the chosen method (LoRA adapters or last-layer only), then report what will be trained
to_train_model = lora_fine_tuning(model) if do_lora else last_layers_fine_tuning(model)
print_trainable_parameters(to_train_model)



# Initialize trainer
training_args = SFTConfig(
    output_dir=output_dir,
    # max_steps=100,
    num_train_epochs=4 if data_prop < .1 else 30,
    learning_rate=8e-5 if do_lora else 1e-3,
    per_device_train_batch_size=1, # it seems that with a batch size greater than 1, weights are updated with the average gradient loss over
    # all the batch, hence the model could not be updated with the information about a particular element of the dataset.
    # For our usecase, batch size of 1 is better  https://discuss.pytorch.org/t/how-sgd-works-in-pytorch/8060
    logging_steps=50, # doc about what is step vs batch : https://discuss.huggingface.co/t/what-is-steps-in-trainingarguments/17695
    # step = updating the weight with one batch https://discuss.huggingface.co/t/what-is-the-meaning-of-steps-parameters/56411
    # warmup_ratio=.0 if do_lora else .0,
    # save_steps=100,
    # eval_strategy="steps",
    # eval_steps=50,
)

# No peft_config is given to the trainer: when do_lora is set, to_train_model has already been
# wrapped by lora_fine_tuning, so passing the config again would be redundant or conflicting.
# NOTE(review): eval_dataset is the training set itself, so any evaluation enabled later
# would not measure generalisation.
trainer = SFTTrainer(
    model=to_train_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

# ft_model_pipeline = pipeline("text-generation", model=trainer.model, tokenizer=tokenizer, max_new_tokens=max_new_tokens)

# cust_callback = CustomCallback(raw_model_pipeline=raw_model_pipeline, ft_model_pipeline=ft_model_pipeline, test_conv=test_conv["conversation"][0])
# trainer.add_callback(cust_callback)

In [None]:
# Runs the training; checkpoint_path (None unless set in the training config cell) is passed as the checkpoint to resume from
trainer.train(resume_from_checkpoint=checkpoint_path)

## 3 - Hot evaluation

We try the model right after training to get a quick, limited overview of its performance. See [03-test](../03-test/) for more detailed notebooks.

In [None]:
# eval mode : switches layers such as dropout to inference behaviour (it does not disable gradient tracking).
# Called on trainer.model because that is the object the text-generation pipeline is built on.
trainer.model.eval()

In [None]:
# Text-generation pipeline on the trainer's model; do_sample=True requests sampled generation, so repeated calls may give different answers
ft_model_pipeline = pipeline("text-generation", model=trainer.model, tokenizer=tokenizer, max_new_tokens=max_new_tokens, do_sample=True)

In [None]:
def q_a(question, max_tokens: int = max_new_tokens):
    """Ask the fine-tuned pipeline a single question and return the text of its reply.

    Args:
        question: content of the user message sent to the pipeline.
        max_tokens: value forwarded as ``max_new_tokens`` for this call.

    Returns:
        The "content" field of the second entry of the first result's
        "generated_text" (presumably the assistant reply following the user message).
    """
    chat = [{"role": "user", "content": question}]
    results = ft_model_pipeline(chat, max_new_tokens=max_tokens)
    generated_messages = results[0]["generated_text"]
    return generated_messages[1]["content"]

In [None]:
# Ask the same question ten times to see how much the answers vary from one call to the next
for attempt in range(10):
    answer = q_a("What is TOAST ?")
    print(answer)
    print("--------\n")

In [None]:
q_a("What is TOAST in the field of astronomy ?") # small check for overfitting: same acronym asked with an added domain hint

In [None]:
q_a("What is TOAST ? ", max_tokens=200) # same question with a larger generation budget (200 new tokens instead of the default)

In [None]:
trainer.save_model(os.path.join(output_dir, "final_model")) # saves the trained model in a "final_model" sub-folder of output_dir