In [None]:
from getpass import getpass

from IPython.display import clear_output

# Read the Hugging Face token without echoing it, so the secret is never
# shown on screen or stored in the cell output. The prompt text is unchanged.
huggingface_cli_token = getpass("INSERT YOUR huggingface-cli_token:")
# Clear whatever the prompt left in the cell output.
clear_output()

In [None]:
# Log in to the Hugging Face Hub with the token read into `huggingface_cli_token`.
# NOTE(review): the token is interpolated into a shell command line — confirm
# this is acceptable for the runtime in use.
!huggingface-cli login --token $huggingface_cli_token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `tirocinio2` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `tirocinio2`


In [None]:
!pip install -q datasets --progress-bar off

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [None]:
import pandas as pd
import pickle

In [None]:
# Experiment label; combined with base_path to build the output and log paths.
experiment_name = "fine-tuning-output-global"

In [None]:
from google.colab import drive
from IPython.display import Image, display

# Google Drive mount point and the Drive folder holding inputs and outputs.
mount_point = "/content/drive"
base_path = mount_point + "/MyDrive"

# Input CSV, read later with pandas.
input_data_path = base_path + "/V3_PDD_merged.csv"

# Experiment outputs. The explicit "/" separator is required: without it the
# experiment name is glued onto "MyDrive" and the paths point outside the
# Drive folder.
output_data_path = base_path + "/" + experiment_name
log_filename = base_path + "/" + experiment_name + ".log"

# force_remount=True forces a remount even when Drive is already mounted.
drive.mount(mount_point, force_remount=True)

Mounted at /content/drive


In [None]:
# NOTE(review): Drive is already mounted by the previous cell, so this call
# only reports "Drive already mounted"; this cell is a candidate for removal.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer


In [None]:
# Hub id of the pretrained checkpoint that gets fine-tuned.
base_model = "mistralai/Mistral-7B-Instruct-v0.3"
# Folder name under which the fine-tuned model is saved on Drive.
new_model = "Mistral-7B-Instruct-v0.3-fine-tuning-global-PDD-followUpBased"

In [None]:
# Load only the columns used to build the prompts; DT_NAS is parsed as a
# day/month/year date. (The duplicated `data = data =` assignment was removed.)
data = pd.read_csv(
    input_data_path,
    parse_dates=["DT_NAS"],
    date_format="%d/%m/%Y",
    usecols=[
        # Sex
        "SESSO",
        # Date of birth
        "DT_NAS",
        # City of birth
        "COMUNE NASCITA",
        # City of residence
        "COMUNE_RESIDENZA",
        # First drug to take
        "PRIMO_PROD",
        # Shift
        "SHIFT",
        # Follow-up persistence
        "Persistenza di Follow-up",
    ],
)

# Preview the first rows to check the load.
data.head(3)

Unnamed: 0,SESSO,DT_NAS,COMUNE NASCITA,COMUNE_RESIDENZA,SHIFT,Persistenza di Follow-up,PRIMO_PROD
0,F,1966-10-01,ASSEMINI,ASSEMINI,2,1,1480002
1,F,1963-02-11,CAGLIARI,QUARTUCCIU,1,1,1480022
2,F,1976-12-27,CAGLIARI,CAGLIARI,1,0,1480022


In [None]:
# Number of rows (records) loaded.
len(data)

162

In [None]:
def apply_fine_tuning_template(data):
  """Render one record as an [INST]...[/INST] training example.

  Args:
    data: Mapping-like record (e.g. a DataFrame row) providing the keys
      'SESSO', 'DT_NAS', 'COMUNE NASCITA', 'COMUNE_RESIDENZA', 'PRIMO_PROD',
      'SHIFT' and 'Persistenza di Follow-up'.

  Returns:
    The prompt string: the instruction, the patient fields enclosed by triple
    single quotes, and the expected follow-up persistence line.
  """
  # Values are interpolated with their default string form. The continuation
  # lines keep their four leading spaces because they are part of the literal.
  # NOTE(review): the literal already contains <s> and </s> — confirm these are
  # not duplicated by a tokenizer configured to add BOS/EOS itself.
  return f"""<s>[INST]Considering the sex, birth date, birth city, residence city, first drug to take and shift of a
    patient, all information given line by line and formatted as 'label: value', the whole block of lines being enclosed
    by triple single quotes, predict the value of the follow-up persistence.
    Do not consider any information than those provided enclosed by triple single quotes. Your task is to predict the value of the follow-up persistence based on the patient's information and to output the predicted
    value in the same format.
    Do not absolutely include for any reason any other content, especially input information, in the output.
    '''
    sex: {data['SESSO']}
    birth date: {data['DT_NAS']}
    birth city: {data['COMUNE NASCITA']}
    residence city: {data['COMUNE_RESIDENZA']}
    first drug to take: {data['PRIMO_PROD']}
    shift: {data['SHIFT']}
    '''[/INST]
    persistence follow-up: {data['Persistenza di Follow-up']}
    </s>
    """


In [None]:
# Build one training prompt per row (axis=1 passes each row to the template
# function) and store it in a new 'fine_tuning_prompt' column.
data['fine_tuning_prompt'] = data.apply(apply_fine_tuning_template, axis=1)

In [None]:
# Preview the first three rows, now including the fine_tuning_prompt column.
data.head(3)

Unnamed: 0,SESSO,DT_NAS,COMUNE NASCITA,COMUNE_RESIDENZA,SHIFT,Persistenza di Follow-up,PRIMO_PROD,fine_tuning_prompt
0,F,1966-10-01,ASSEMINI,ASSEMINI,2,1,1480002,"<s>[INST]Considering the sex, birth date, birt..."
1,F,1963-02-11,CAGLIARI,QUARTUCCIU,1,1,1480022,"<s>[INST]Considering the sex, birth date, birt..."
2,F,1976-12-27,CAGLIARI,CAGLIARI,1,0,1480022,"<s>[INST]Considering the sex, birth date, birt..."


In [None]:
# Show the full prompt generated for the first record.
data['fine_tuning_prompt'].iloc[0]

"<s>[INST]Considering the sex, birth date, birth city, residence city, first drug to take and shift of a\n    patient, all information given line by line and formatted as 'label: value', the whole block of lines being enclosed\n    by triple single quotes, predict the value of the follow-up persistence.\n    Do not consider any information than those provided enclosed by triple single quotes. Your task is to predict the value of the follow-up persistence based on the patient's information and to output the predicted\n    value in the same format.\n    Do not absolutely include for any reason any other content, especially input information, in the output.\n    '''\n    sex: F\n    birth date: 1966-10-01 00:00:00\n    birth city: ASSEMINI\n    residence city: ASSEMINI\n    first drug to take: 1480002\n    shift: 2\n    '''[/INST]\n    persistence follow-up: 1\n    </s>\n    "

In [None]:
from datasets import Dataset

# Helper that turns a DataFrame into a `datasets.Dataset`.
def convert_to_dataset(df):
    """Build a `Dataset` whose columns mirror those of `df`.

    Each DataFrame column is converted to a plain Python list and the
    resulting mapping is handed to `Dataset.from_dict`.
    """
    columns_as_lists = {name: df[name].tolist() for name in df.columns}
    return Dataset.from_dict(columns_as_lists)

In [None]:
# Keep only the prompt column: it is the text field the trainer reads.
dataset = convert_to_dataset(data[['fine_tuning_prompt']])

In [None]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['fine_tuning_prompt'],
    num_rows: 162
})

In [None]:
# Load the base model (`base_model`) with 4-bit NF4 quantization (bitsandbytes)
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        #load_in_4bit=True,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the prompt template already contains literal <s> and </s>;
# confirm BOS/EOS are not duplicated now that the tokenizer adds EOS as well.
tokenizer.add_eos_token = True
# Last expression of the cell: shows the (add_bos_token, add_eos_token) flags.
tokenizer.add_bos_token, tokenizer.add_eos_token


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

(True, True)

In [None]:
# Prepare the quantized model for k-bit training, then attach LoRA adapters.
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    # Only the MLP up/down projections are adapted. 'base_layer' was removed:
    # it is the name PEFT gives to the wrapped layer inside a LoRA module, so
    # it matches nothing in the base model and could only match already-wrapped
    # layers if this cell were run a second time.
    target_modules=['up_proj', 'down_proj'],
)
model = get_peft_model(model, peft_config)

In [None]:
# Hyperparameters
training_arguments = TrainingArguments(
    output_dir="./results",
    # Training length is set by max_steps alone. The former
    # num_train_epochs=100 was overridden by it (the Trainer warns
    # "max_steps is given, it will override any value given in
    # num_train_epochs"), so it was removed.
    max_steps=1000,
    per_device_train_batch_size=14, #4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    # Checkpoint and log every 25 optimizer steps.
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # NOTE(review): the "constant" schedule has no warmup phase, so
    # warmup_ratio is presumably ignored — switch to "constant_with_warmup"
    # if warmup is actually wanted.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    #report_to="wandb"
)


In [None]:
# Setting sft parameters: supervised fine-tuning over the 'fine_tuning_prompt'
# text column, one example per sequence (packing disabled).
# NOTE(review): `model` was already wrapped with get_peft_model in an earlier
# cell, so passing peft_config here as well looks redundant — confirm against
# the installed TRL version.
# NOTE(review): the run emits a deprecation notice asking to move these
# arguments into SFTConfig — confirm which ones the installed TRL still accepts.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= 2048,
    dataset_text_field="fine_tuning_prompt",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/162 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning loop.
trainer.train()

# Save the fine-tuned model
trainer.model.save_pretrained("/content/drive/MyDrive/" + new_model)
# wandb.finish()
# Switch back to inference settings: re-enable use_cache (it was disabled when
# the model was loaded) and put the model in eval mode. model.eval() is the
# last expression, so the model structure is displayed as the cell output.
model.config.use_cache = True
model.eval()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  return fn(*args, **kwargs)


Step,Training Loss
25,0.7736
50,0.1762
75,0.1463
100,0.1422
125,0.1383
150,0.1335
175,0.1272
200,0.1218
225,0.1153
250,0.1087


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): MistralRotaryEmbedding()
            )
            (mlp): MistralMLP(
              (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (up_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)
                (lora_dropout): ModuleDict(
                  (def