In [None]:
!pip install -U bitsandbytes datasets peft torch transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading bitsandbytes-0.43.3-py3

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Work from the Drive root: the input CSV and the output directory are addressed with relative paths.
%cd /content/drive/My Drive

/content/drive/My Drive


In [None]:
import pandas


# Source CSV, resolved against the current working directory
input_file = "output_global.csv"

# Only the fields relevant for the fine-tuning are loaded; every other column is skipped at read time
relevant_columns = [
    "SESSO",  # Sex
    "DT_NAS",  # Date of birth
    "COMUNE NASCITA",  # City of birth
    "COMUNE_RESIDENZA",  # City of residence
    "PRIMO_PROD",  # First drug to take
    # Adherence, spread over three columns
    "BASSA ADERENZA",
    "INTERMEDIA ADERENZA",
    "ALTA ADERENZA",
    "Persistenza di Follow-up",  # Follow-up persistence
]

# Birth dates are parsed with an explicit day/month/year format
input_data = pandas.read_csv(
    input_file,
    usecols=relevant_columns,
    parse_dates=["DT_NAS"],
    date_format="%d/%m/%Y",
)

In [None]:
def summarize_adherence_columns(df):
    """Collapse the three adherence columns of one record into a single code.

    Args:
        df: A single record indexable by column name (it is applied row by row,
            so despite its name this is one row rather than a whole dataframe).

    Returns:
        0, 1 or 2 for the first of "BASSA ADERENZA", "INTERMEDIA ADERENZA" and
        "ALTA ADERENZA" (checked in that order) whose value equals 1, or -1 when
        none of them does.
    """
    # Ordered (column, code) pairs; later columns are only read if earlier ones did not match
    flag_codes = (
        ("BASSA ADERENZA", 0),
        ("INTERMEDIA ADERENZA", 1),
        ("ALTA ADERENZA", 2),
    )
    for column, code in flag_codes:
        if df[column] == 1:
            return code
    return -1

In [None]:
from datasets import Dataset


# Convert adherence into a single value (0 for low, 1 for middle, 2 for high, -1 when no flag is set)
# and then remove the three source columns. Done as one chained expression instead of in-place edits.
input_data = input_data.assign(
    ADERENZA=input_data.apply(summarize_adherence_columns, axis=1)
).drop(columns=["BASSA ADERENZA", "INTERMEDIA ADERENZA", "ALTA ADERENZA"])

# Wrap the pandas dataframe in a Hugging Face `Dataset` (this is not a PyTorch tensor)
dataset = Dataset.from_pandas(input_data)

# Split the dataset into train (80%) and eval (20%) datasets.
# The fixed seed keeps the split identical across re-runs, so evaluation results stay comparable.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [None]:
from google.colab import userdata
from transformers import AutoTokenizer


# Then create and configure the tokenizer
base_model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="right",  # pad after the real tokens
    add_eos_token=True,  # ask the tokenizer to append the EOS token to encoded sequences
    token=userdata.get("HF_TOKEN"),  # Hugging Face access token fetched through google.colab.userdata
)
# Reuse the EOS token for padding
# NOTE(review): presumably needed because the base tokenizer ships without a pad token — confirm
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
def tokenize_input(prompt, max_length=512):
    """Tokenize a prompt to a fixed length and attach labels.

    Uses the notebook-level `tokenizer`.

    Args:
        prompt: Text to encode.
        max_length: Length every encoded sequence is truncated or padded to.
            Defaults to 512, the value previously hard-coded here.

    Returns:
        The tokenizer output with an extra "labels" entry holding a copy of
        "input_ids" (padding positions included).
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # Copy so that later changes to the labels cannot alias the input ids
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
def generate_and_tokenize_prompt(patient):
    """Build the instruction prompt for one patient record and tokenize it.

    Args:
        patient: Mapping with the keys 'SESSO', 'DT_NAS', 'COMUNE NASCITA',
            'COMUNE_RESIDENZA', 'PRIMO_PROD', 'ADERENZA' and
            'Persistenza di Follow-up'.

    Returns:
        Whatever `tokenize_input` returns for the assembled prompt.
    """
    # The prompt text carries no literal "<s>" / "</s>": the tokenizer already prepends BOS
    # (the encoded ids previously started with two BOS tokens) and is configured with
    # add_eos_token=True, so spelling them out duplicated both markers. The text also ends
    # right after the last predicted value, so the appended EOS directly follows the answer.
    full_prompt = f"""[INST]Considering the sex, birth date, birth city, residence city and first drug to take of a
    patient, all information given line by line and formatted as 'label: value', the whole block of lines being enclosed
    by triple single quotes, predict the values for adherence and follow-up persistence.
    Do not consider any information than those provided enclosed by triple single quotes. Your task is to predict one
    value for adherence and one for follow-up persistence based on the patient's information and to output the predicted
    values in the same format.
    Do not absolutely include for any reason any other content, especially input information, in the output.
    '''
    sex: {patient['SESSO']}
    birth date: {patient['DT_NAS']}
    birth city: {patient['COMUNE NASCITA']}
    residence city: {patient['COMUNE_RESIDENZA']}
    first drug to take: {patient['PRIMO_PROD']}
    '''[/INST]
    adherence: {patient['ADERENZA']}
    persistence follow-up: {patient['Persistenza di Follow-up']}"""
    return tokenize_input(full_prompt)

In [None]:
# Prepare our data for the fine-tuning: build and tokenize one prompt per record of each split
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_eval_dataset = eval_dataset.map(generate_and_tokenize_prompt)
# Sanity check: show the token ids of the first training example
print(tokenized_train_dataset[0]["input_ids"])

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

[1, 1, 28792, 16289, 28793, 21432, 1184, 288, 272, 3142, 28725, 5950, 3608, 28725, 5950, 2990, 28725, 18016, 2990, 304, 907, 7876, 298, 1388, 302, 264, 13, 2287, 7749, 28725, 544, 1871, 2078, 1407, 486, 1407, 304, 1221, 11985, 390, 464, 1559, 28747, 1192, 647, 272, 2894, 2724, 302, 4715, 1250, 481, 12848, 13, 2287, 486, 22212, 2692, 20759, 28725, 6782, 272, 3069, 354, 616, 663, 636, 304, 1372, 28733, 715, 3708, 10070, 28723, 13, 2287, 2378, 459, 1917, 707, 1871, 821, 1395, 3857, 481, 12848, 486, 22212, 2692, 20759, 28723, 3604, 3638, 349, 298, 6782, 624, 13, 2287, 1192, 354, 616, 663, 636, 304, 624, 354, 1372, 28733, 715, 3708, 10070, 2818, 356, 272, 7749, 28742, 28713, 1871, 304, 298, 3825, 272, 17931, 13, 2287, 3069, 297, 272, 1348, 5032, 28723, 13, 2287, 2378, 459, 7771, 3024, 354, 707, 2611, 707, 799, 3036, 28725, 4012, 2787, 1871, 28725, 297, 272, 3825, 28723, 13, 2287, 23713, 13, 2287, 3142, 28747, 401, 13, 2287, 5950, 3608, 28747, 28705, 28740, 28774, 28774, 28750, 28733, 28734,

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig


# And the model, loaded with bitsandbytes 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit
    bnb_4bit_use_double_quant=True,  # enable double quantization
    bnb_4bit_quant_type="nf4",  # NF4 quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

# Same base checkpoint and access token as the tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    token=userdata.get("HF_TOKEN"),
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model, then the model itself.

    Args:
        model: Object exposing ``named_parameters()``; each yielded parameter must
            provide ``numel()`` and ``requires_grad``.

    The first printed line reports the trainable count, the total count and the
    trainable percentage (0.0 when the model has no parameters at all).
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise raise ZeroDivisionError here
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}")
    print(model)

In [None]:
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training


# Prepare the quantized model for LoRA fine-tuning (the training itself is launched later with the Trainer)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapter configuration: rank-8 adapters on the listed projection modules and on lm_head
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# Wrap the model with the adapters and report how many parameters are trainable
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Uncomment to apply the accelerator
# NOTE(review): no `accelerator` object is created in the cells shown here — define one before uncommenting
# model = accelerator.prepare_model(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
 

In [None]:
from datetime import datetime

import transformers


import os

from transformers.trainer_utils import get_last_checkpoint


if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

# base_model_id contains a "/", so output_dir is a nested directory under the working directory
run_name = base_model_id + "-" + "patient-data-analyze"
output_dir = "./" + run_name

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=4,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        max_steps=750,
        learning_rate=2.5e-4,
        logging_steps=50,  # Log every 50 steps
        # bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",  # Directory for storing logs
        save_strategy="steps",  # Save checkpoints on a step interval (see save_steps)
        save_steps=10,  # Save checkpoints every 10 steps
        eval_strategy="steps",  # Evaluate on a step interval (see eval_steps)
        eval_steps=50,  # Evaluate every 50 steps
        do_eval=True,  # Run evaluation on eval_dataset
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

# Resume from the newest checkpoint in output_dir when one exists, otherwise start from scratch.
# Passing resume_from_checkpoint=True unconditionally fails on a first run, when no checkpoint exists yet.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

# Persist the trained model and the tokenizer next to the checkpoints
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

max_steps is given, it will override any value given in num_train_epochs
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
750,0.026,0.026209




('./mistralai/Mistral-7B-v0.1-patient-data-analyze/tokenizer_config.json',
 './mistralai/Mistral-7B-v0.1-patient-data-analyze/special_tokens_map.json',
 './mistralai/Mistral-7B-v0.1-patient-data-analyze/tokenizer.model',
 './mistralai/Mistral-7B-v0.1-patient-data-analyze/added_tokens.json',
 './mistralai/Mistral-7B-v0.1-patient-data-analyze/tokenizer.json')