Fine-tuning usando a biblioteca Unsloth

In [1]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the project checkout on Drive the working directory; the relative paths
# used by the training cell ('../data/...', './models/...') resolve from here.
%cd '/content/drive/MyDrive/Colab Notebooks/LLM/finetune_hate_speech/git'

/content/drive/MyDrive/Colab Notebooks/LLM/finetune_hate_speech/git


In [3]:
# Sanity check: display the current working directory.
%pwd

'/content/drive/MyDrive/Colab Notebooks/LLM/finetune_hate_speech/git'

In [4]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes


In [5]:
!pip install triton

Collecting triton
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.0.0


In [7]:
import pandas as pd
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, TrainingArguments
from datasets import Dataset
from trl import SFTTrainer
from unsloth import FastLanguageModel, unsloth_save_model
import os
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Main function
def main(model_dir='./models', output_dir='./outputs',
         validation_percent=5, max_steps=1000,
         learning_rate=2e-4, instruction=None):
    """Fine-tune ``unsloth/llama-3-8b-bnb-4bit`` with LoRA on the toxicity CSV.

    Reads ``../data/train_toldbr.csv`` (columns ``text`` and ``Toxic``), turns
    every row into an Alpaca-style prompt whose expected response is ``'sim'``
    (``Toxic == 1``) or ``'não'`` (anything else), trains LoRA adapters with
    ``SFTTrainer`` and writes the adapters and tokenizer to ``model_dir``.

    Args:
        model_dir: Directory that receives the saved adapters/tokenizer; an
            additional copy is written to ``<model_dir>/unsloth_model``.
        output_dir: Directory handed to ``TrainingArguments`` for trainer output.
        validation_percent: Percentage (0 <= value < 100) of rows, taken from
            the end of the CSV, that is held out of the training set.
        max_steps: Number of optimizer steps to train for.
        learning_rate: Peak learning rate of the linear schedule.
        instruction: Instruction text placed in every prompt. When ``None`` a
            default Portuguese toxicity-classification instruction is used.

    Returns:
        None.

    Raises:
        ValueError: If ``validation_percent`` is outside ``[0, 100)``.
    """
    if not 0 <= validation_percent < 100:
        raise ValueError(
            f"validation_percent must be in the range [0, 100), got {validation_percent!r}"
        )

    # Hugging Face authentication.
    # SECURITY: the access token used to be hardcoded here. It has been removed
    # from the code; the leaked token must be revoked on the Hugging Face site.
    # The token is now read from the HF_TOKEN environment variable; when it is
    # not set, login() falls back to its interactive prompt.
    login(token=os.environ.get("HF_TOKEN"))

    # Alpaca prompt template.
    # NOTE(review): every line after the first carries the 4-space indentation
    # of this function body, so that whitespace is part of each training
    # prompt. It is kept byte-for-byte so prompts stay identical to earlier
    # runs; confirm that the inference code uses exactly the same template.
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}"""

    # Load training data
    train_data_path = '../data/train_toldbr.csv'
    dados_treino = pd.read_csv(train_data_path)

    def formatar_para_treinamento(dados, instruction):
        """Build the training frame with output/input/instruction/text columns."""
        # Label rule: exactly 1 means toxic ('sim'); every other value is 'não'.
        labels = ['sim' if toxic == 1 else 'não' for toxic in dados['Toxic']]
        inputs = dados['text'].tolist()
        return pd.DataFrame({
            'output': labels,
            'input': inputs,
            'instruction': [instruction] * len(inputs),
            # The full prompt is rendered once, here. The former
            # Dataset.map(format_input_prompt) pass re-rendered the very same
            # strings and has been dropped as redundant.
            # NOTE(review): no EOS token is appended to the prompt text, so the
            # model is not explicitly trained to stop after 'sim'/'não' —
            # confirm this is intended before changing the training format.
            'text': [
                alpaca_prompt.format(instruction, inp, label)
                for inp, label in zip(inputs, labels)
            ],
        })

    # Use default instruction if none provided
    if instruction is None:
        instruction = ("Considere o seguinte texto extraído de uma rede social. Classifique-o como 'sim' "
                       "se contiver discurso de ódio, ofensas, agressões ou qualquer forma de toxicidade, "
                       "levando em conta o uso de gírias e a linguagem informal. Caso contrário, classifique como 'não'")

    # Format the training data
    train_df = formatar_para_treinamento(dados_treino, instruction)

    # Hold out the last `validation_percent` percent of the rows.
    # An explicit split index is used because the previous negative slices
    # (`[:-n]` / `[-n:]`) silently swapped the roles of the two sets when n was
    # 0: the training set became empty and the validation set the whole frame.
    validation_count = int(len(train_df) * (validation_percent / 100))
    split_at = len(train_df) - validation_count
    train_dataset = Dataset.from_pandas(train_df.iloc[:split_at])
    eval_dataset = (
        Dataset.from_pandas(train_df.iloc[split_at:]) if validation_count > 0 else None
    )

    # Load the 4-bit quantized base model and its tokenizer.
    # dtype=None leaves the choice of compute dtype to Unsloth.
    max_seq_length = 2048
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/llama-3-8b-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True
    )

    # The model must stay in training mode here: the earlier
    # FastLanguageModel.for_inference(model) call switched it to inference mode
    # right before fine-tuning and has been removed.

    # Attach LoRA adapters to the attention and MLP projection layers.
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing=True,
        random_state=42,
        max_seq_length=max_seq_length
    )

    # Create the output directories up front so that a bad path fails before
    # the training run instead of after it.
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # Use bf16 where the GPU supports it and fall back to fp16 otherwise, so
    # that exactly one mixed-precision mode is always enabled.
    use_bf16 = torch.cuda.is_bf16_supported()

    # Set up Trainer
    # NOTE(review): no evaluation strategy is configured, and the training log
    # only reports the training loss, so eval_dataset appears to act purely as
    # a hold-out and is never scored during training — confirm this is intended.
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,  # effective batch size of 2 * 4 = 8 per device
            warmup_steps=10,
            max_steps=max_steps,
            learning_rate=learning_rate,
            fp16=not use_bf16,
            bf16=use_bf16,
            logging_steps=100,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=1234,
            output_dir=output_dir
        )
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    # Save an additional copy through Unsloth's own save helper
    unsloth_save_model(model, tokenizer, os.path.join(model_dir, "unsloth_model"))


# Instruction text used for this run. It matches main()'s built-in default
# except for the trailing colon.
instruction_GA = (
    "Considere o seguinte texto extraído de uma rede social. "
    "Classifique-o como 'sim' se contiver discurso de ódio, ofensas, agressões "
    "ou qualquer forma de toxicidade, levando em conta o uso de gírias "
    "e a linguagem informal. "
    "Caso contrário, classifique como 'não':"
)

# Launch the fine-tuning run: 2500 steps, 5% of the rows held out, LoRA
# adapters saved under ./models/llama_31_FT_r_16_alpha_16.
# NOTE(review): the directory name says "llama_31", while main() loads
# "unsloth/llama-3-8b-bnb-4bit" — confirm which base model is intended.
main(
    model_dir='./models/llama_31_FT_r_16_alpha_16',
    validation_percent=5,
    max_steps=2500,
    learning_rate=2e-4,
    instruction=instruction_GA,
)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Map:   0%|          | 0/15818 [00:00<?, ? examples/s]

Map:   0%|          | 0/832 [00:00<?, ? examples/s]

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/15818 [00:00<?, ? examples/s]

Map:   0%|          | 0/832 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 15,818 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2,500
 "-____-"     Number of trainable parameters = 41,943,040
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
100,0.959
200,0.6709
300,0.6521
400,0.666
500,0.6588
600,0.6361
700,0.6378
800,0.6413
900,0.65
1000,0.6577


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... Done.
