# Import Libraries

In [1]:
import re
import torch
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm
from unsloth import FastLanguageModel
from sklearn.model_selection import train_test_split
import os
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-27 18:34:38 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-27 18:34:39 [__init__.py:239] Automatically detected platform cuda.


2025-07-27 18:34:39,701	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# --- Run configuration ---------------------------------------------------
# Maximum sequence length handed to the model loader and to the trainer.
max_seq_length = 2048
# None lets the loader auto-detect the dtype.
dtype = None
# Load the weights 4-bit quantized to reduce memory usage.
load_in_4bit = True

# Checkpoint that is fine-tuned below.
model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"

# Labelled CSV the training and validation splits are drawn from.
data_path = "./data/en_train_data_SMM4H_2025_clean.csv"

# Integer class id -> answer text the model is trained to generate.
label_map = dict([
    (0, "without Adverse Drug Events"),
    (1, "with Adverse Drug Events"),
])

# Reverse lookup: generated answer text -> integer class id.
label2id = dict(zip(label_map.values(), label_map.keys()))

# Load Mistral 7B Instruct v0.3 (4-bit) Model and Tokenizer

In [3]:
# Load the base checkpoint and its tokenizer; every setting comes from the
# configuration cell (model_name, max_seq_length, dtype, load_in_4bit).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
)

==((====))==  Unsloth 2025.6.8: Fast Mistral patching. Transformers: 4.53.0. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.747 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Add LoRA Adapters

In [4]:
# Modules that receive LoRA adapters: the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters. `model` is rebound to the
# wrapped model, so later cells train and save the adapter-equipped object.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

Unsloth 2025.6.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Load Training Data and Build the Training Split

In [5]:
# Read the labelled CSV and keep 80% of it for fine-tuning. The 20% dropped
# here is re-created in the evaluation section with the same test_size and
# random_state, so both calls must stay in sync to keep that split unseen.
raw_train = pd.read_csv(data_path)
train, _ = train_test_split(raw_train, test_size=0.2, random_state=20)

# Build the instruction-tuning columns on a fresh frame (no in-place writes
# on the split slice): a constant instruction, the tweet as "input", and the
# label rendered as the answer text ("output") the model should generate.
train = (
    train
    .assign(
        instruction="Classify this example tweet into two topics: with Adverse Drug Events and without. Adverse Drug Events are negative medical side effects associated with a drug",
        label=lambda frame: frame["label"].map(label_map),
    )
    .rename(columns={"label": "output", "text": "input"})
)

# The frame is round-tripped through a CSV file so that `load_dataset` can
# read it; create the scratch directory if needed (safe when it exists).
tmp_dataset_path = Path('./tmp')
tmp_dataset_path.mkdir(parents=True, exist_ok=True)

train_csv_path = tmp_dataset_path.absolute() / "train_updated.csv"
train.to_csv(train_csv_path, index=False)

dataset = load_dataset("csv", data_files=str(train_csv_path), split="train")

Generating train split: 14379 examples [00:00, 322238.83 examples/s]


# Prepare Data

In [6]:
# Prompt template with three positional slots, filled in order with the
# instruction, the tweet and the expected answer.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Appended to every training text; the inference-side parser later looks for
# this terminator to find the end of the generated answer.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of examples into full training texts.

    Args:
        examples: batch mapping with parallel "instruction", "input" and
            "output" sequences (as passed by `dataset.map(..., batched=True)`).

    Returns:
        dict with a single key "text": one string per example, consisting of
        the filled prompt template followed by EOS_TOKEN.
    """
    texts = [
        prompt.format(instruction, tweet, answer) + EOS_TOKEN
        for instruction, tweet, answer in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]
    return {"text": texts}


# Add the rendered "text" column that the trainer consumes.
dataset = dataset.map(formatting_prompts_func, batched=True)

Map: 100%|██████████| 14379/14379 [00:00<00:00, 240640.87 examples/s]


# Setup Trainer

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Checkpoints go to a per-model directory under ./results.
checkpoint_dir = f"./results/{model_name.replace('/', '-')}_checkpoints"

# Use bf16 when it is supported, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    # 4 sequences per device x 8 accumulation steps = 32 per optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    warmup_steps=5,
    # Training is capped by step count; 642 is the alternative value the
    # author left in the original comment.
    max_steps=300,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir=checkpoint_dir,
    report_to="none",
)

# Supervised fine-tuning on the rendered "text" column, one example per
# sequence (no packing).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Unsloth: Tokenizing ["text"]: 100%|██████████| 14379/14379 [00:00<00:00, 23821.30 examples/s]


# Show current memory stats

In [8]:
# Bytes per GiB; dividing by it equals the three successive /1024 divisions.
BYTES_PER_GIB = 1024 ** 3

# Snapshot taken before training: device 0 properties, total memory, and the
# peak reserved memory so far (baseline for the post-training report).
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA GeForce RTX 4070 Laptop GPU. Max memory = 7.747 GB.
4.051 GB of memory reserved.


# Start Training

In [9]:
# Run fine-tuning; the returned stats expose the runtime metrics that the
# memory/time report cell reads.
trainer_stats = trainer.train()

# Save Model (Just LoRA Adapters) and Tokenizer into the same directory so
# that both can be reloaded from one path.
adapter_dir = f"./results/{model_name.replace('/', '-')}"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 14,379 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 41,943,040/7,000,000,000 (0.60% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.8602
2,2.8292
3,2.5567
4,2.237
5,1.8009
6,1.4234
7,1.1612
8,1.0459
9,1.0314
10,0.8687


('./results/unsloth-mistral-7b-instruct-v0.3-bnb-4bit/tokenizer_config.json',
 './results/unsloth-mistral-7b-instruct-v0.3-bnb-4bit/special_tokens_map.json',
 './results/unsloth-mistral-7b-instruct-v0.3-bnb-4bit/chat_template.jinja',
 './results/unsloth-mistral-7b-instruct-v0.3-bnb-4bit/tokenizer.model',
 './results/unsloth-mistral-7b-instruct-v0.3-bnb-4bit/added_tokens.json',
 './results/unsloth-mistral-7b-instruct-v0.3-bnb-4bit/tokenizer.json')

In [None]:
# Peak reserved GPU memory after training, in GiB, and the part of it that
# was added on top of the pre-training baseline (start_gpu_memory).
bytes_per_gib = 1024 ** 3
used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures as a share of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

# Load the Saved Model and Tokenizer

In [3]:
# Reload from the results directory written by the training section (same
# "/" -> "-" name mangling), replacing `model` and `tokenizer`.
saved_model_dir = f"results/{model_name.replace('/', '-')}"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=saved_model_dir,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
)

==((====))==  Unsloth 2025.6.8: Fast Mistral patching. Transformers: 4.53.0. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.747 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.6.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Make Predictions on the Held-Out Validation Split (Stand-In for the Competition Test Set)

In [4]:
# Re-create the 20% held-out split. test_size and random_state match the
# split made before training, so these rows were not used for fine-tuning.
# NOTE: `train` holds the *full* CSV here, not only the training portion.
train = pd.read_csv(data_path)
_, val = train_test_split(train, test_size=0.2, random_state=20)

# Work on an independent copy: later cells add columns to `public_set`, and
# writing into the split slice directly can trigger SettingWithCopyWarning.
public_set = val.copy()
public_set

Unnamed: 0,text,label
17933,I go back in 2 weeks and am determined to lose...,0
11741,Got my sceipts filled. Thank god for mental he...,0
17076,who needs adderall when you have chocolate cov...,0
2768,"breathe right en el tabique. agua salina, flu...",0
6562,@USER___________ @USER___________ t take avoda...,0
...,...,...
1017,@USER___ why so many drug ads during #nightlyn...,0
950,i wish i had cymbalta !,0
384,The verdict on lemon tea? Tastes like a throat...,0
15943,“@USER_________: Adderall had me beating Skyri...,0


In [5]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Same template as used for training; the third slot (the answer) is left
# empty at inference time so that the model completes it.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# The instruction is identical for every row, so keep it in one variable and
# use it directly instead of reading it back from row 0 on every iteration.
instruction = "Classify this example tweet into two topics: with Adverse Drug Events and without. Adverse Drug Events are negative medical side effects associated with a drug"

# Build the prompt columns on a new frame rather than mutating in place.
public_set = public_set.rename(columns={"text": "input"}).assign(instruction=instruction)

# One generation per tweet. Each entry of `raw_outputs` is the list returned
# by `batch_decode` (a single decoded string, prompt included).
raw_outputs = []
for tweet in tqdm(public_set["input"], total=len(public_set)):
    inputs = tokenizer(
        [prompt.format(instruction, tweet, "")],
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,  # same limit the model was loaded with
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    raw_outputs.append(tokenizer.batch_decode(outputs))

  0%|          | 0/3595 [00:00<?, ?it/s]MistralForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.
100%|██████████| 3595/3595 [17:27<00:00,  3.43it/s]


In [6]:
def parse_output(output, eos_token="</s>"):
    """Extract the generated answer from one decoded model output.

    The answer is the text between the first "### Response:" line and the
    first end-of-sequence marker that follows it.

    Args:
        output: decoded generation, including the prompt.
        eos_token: literal end-of-sequence marker that terminates the answer.
            Defaults to "</s>"; pass e.g. "<|end▁of▁sentence|>" for
            tokenizers that use a different marker.

    Returns:
        The answer with surrounding whitespace stripped, or '' when the
        response header or the end-of-sequence marker is missing.
    """
    # re.escape makes the marker match literally even if it contains regex
    # metacharacters such as "|".
    pattern = r'### Response:\n(.*?)' + re.escape(eos_token)
    re_match = re.search(pattern, output, re.DOTALL)
    if re_match is None:
        return ''
    return re_match.group(1).strip()

In [7]:
# Every entry of raw_outputs is a one-element list (one decoded string per
# generation call); keep just the string.
public_set["raw_outputs"] = [decoded_batch[0] for decoded_batch in raw_outputs]

# Cut the answer out of each decoded text, then translate the answer text to
# its integer class id. Answers that are not a key of label2id become NaN.
public_set["parsed_outputs"] = public_set["raw_outputs"].map(parse_output)
public_set["predicted_label"] = public_set["parsed_outputs"].map(label2id)

In [8]:
# Spot-check the second row: raw generation, parsed answer and mapped label,
# each separated by a blank line.
spot_check_columns = ("raw_outputs", "parsed_outputs", "predicted_label")
spot_check_values = [public_set[column].iloc[1] for column in spot_check_columns]
print(*spot_check_values, sep="\n\n")

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Classify this example tweet into two topics: with Adverse Drug Events and without. Adverse Drug Events are negative medical side effects associated with a drug

### Input:
Got my sceipts filled. Thank god for mental health #xannies #prozac

### Response:
without Adverse Drug Events</s>

without Adverse Drug Events

0


# Compute Metrics and Dump the Evaluation Results

In [11]:
import numpy as np
from sklearn.metrics import (f1_score, 
                             confusion_matrix, 
                             classification_report)

In [12]:
def dump_classification_metrics(model_name, metrics, csv_file=None, use_generation=False):
    """Append one row of evaluation metrics for a model to a CSV report.

    Args:
        model_name: identifier written to the ``model_name`` column.
        metrics: dict that must contain ``accuracy``, ``micro_f1`` and
            ``classification_report``. The report must be the *dict* form of
            sklearn's ``classification_report`` (call it with
            ``output_dict=True``) and must have entries for the labels
            ``'0'`` and ``'1'``.
        csv_file: path of the report file. Required despite the ``None``
            default. The file is created with a header row if it does not
            exist yet (or is empty); otherwise the row is appended.
        use_generation: flag stored as 0/1 in the ``use_generation`` column.

    Raises:
        AssertionError: if a required key is missing from ``metrics`` or
            ``csv_file`` is None.
        KeyError: if the classification report lacks label ``'0'`` or ``'1'``.
    """
    for required_key in ('accuracy', 'micro_f1', 'classification_report'):
        assert required_key in metrics, f"metrics is missing required key '{required_key}'"
    assert csv_file is not None, "csv_file must be provided"

    classification_rep = metrics['classification_report']

    new_row = {
        'model_name':        [model_name],
        'accuracy':          [metrics['accuracy']],
        'micro_f1':          [metrics['micro_f1']],
        'label_0_precision': [classification_rep['0']['precision']],
        'label_0_recall':    [classification_rep['0']['recall']],
        'label_0_f1':        [classification_rep['0']['f1-score']],
        'label_1_precision': [classification_rep['1']['precision']],
        'label_1_recall':    [classification_rep['1']['recall']],
        'label_1_f1':        [classification_rep['1']['f1-score']],
        'use_generation':    [int(use_generation)]
    }

    new_row_df = pd.DataFrame(new_row)

    # Write the header only when starting a new (or empty) report. Checking
    # the file directly avoids parsing the whole report just to learn whether
    # it exists, and avoids a parse error on an existing but empty file.
    needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0
    new_row_df.to_csv(csv_file, mode='a', index=False, header=needs_header)
        
def get_metrics(preds, true_lables):
    """Compute evaluation metrics for a set of predictions.

    Args:
        preds: predicted labels.
        true_lables: ground-truth labels, aligned with ``preds``.

    Returns:
        Tuple ``(cm, report, accuracy, micro_f1)``: the confusion matrix,
        sklearn's classification report as a dict, the accuracy derived from
        the confusion matrix, and the micro-averaged F1 score.
    """
    cm = confusion_matrix(true_lables, preds)
    # Correct predictions sit on the diagonal of the confusion matrix.
    accuracy = np.trace(cm) / cm.sum()
    # Micro-averaged F1 (the earlier comment called it "weighted", but
    # average='micro' is what is computed).
    micro_f1 = f1_score(true_lables, preds, average='micro')
    report = classification_report(true_lables, preds, output_dict=True)
    return cm, report, accuracy, micro_f1

In [13]:
# Report file that receives one row per evaluated model.
DUMP_METRICS_FILEPATH = 'evaluation_report.csv'

print(f"EVALUATE MODEL {model_name}")

# Ground truth from the CSV versus labels parsed from the generated text.
# NOTE(review): predicted_label is NaN wherever the generated answer could
# not be mapped to a label - confirm there are none before trusting scores.
true_labels = public_set['label']
preds = public_set["predicted_label"]
cm, validation_report, accuracy, micro_f1 = get_metrics(preds, true_labels)

metrics = dict(
    accuracy=accuracy,
    micro_f1=micro_f1,
    classification_report=validation_report,
)

# NOTE(review): the predictions above come from model.generate, yet the row
# is recorded with use_generation=False - confirm the intended flag value.
dump_classification_metrics(
    model_name,
    metrics,
    csv_file=DUMP_METRICS_FILEPATH,
    use_generation=False,
)

EVALUATE MODEL unsloth/mistral-7b-instruct-v0.3-bnb-4bit
