In [1]:
!pip install transformers datasets gdown evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [14]:
!gdown --folder https://drive.google.com/drive/folders/1CAbb3DjrOPBNm0ozVBfhvrEh9P9rAppc

Retrieving folder contents
Processing file 1e_G-9a66AryHxBOwGWhriePYCCa4_29e subtaskA_dev_monolingual.jsonl
Processing file 123UQ92LxtHaVTbNYlmjnG1CWwD-x7wDL subtaskA_dev_multilingual.jsonl
Processing file 1HeCgnLuDoUHhP-2OsTSSC3FXRLVoI6OG subtaskA_train_monolingual.jsonl
Processing file 13-9-DakCeLFbPgCiVIU0v6_BCQx0ppz6 subtaskA_train_multilingual.jsonl
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1e_G-9a66AryHxBOwGWhriePYCCa4_29e
To: /content/SubtaskA/subtaskA_dev_monolingual.jsonl
100% 10.8M/10.8M [00:00<00:00, 43.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=123UQ92LxtHaVTbNYlmjnG1CWwD-x7wDL
To: /content/SubtaskA/subtaskA_dev_multilingual.jsonl
100% 21.2M/21.2M [00:00<00:00, 75.9MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1HeCgnLuDoUHhP-2OsTSSC3FXRLVoI6OG
From (redirected): https://drive.google.com/uc?id=1HeCgnLuDoUHhP-2OsTSSC3FXRLVoI

Monta conteúdo no drive para gerenciamento de checkpoints, saves e loads

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

save_path = "/content/drive/MyDrive/LLM/Checkpoints_Deberta"
os.makedirs(save_path, exist_ok=True)


In [2]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging


In [3]:
from peft import LoraConfig, get_peft_model, TaskType

In [4]:
from transformers import DebertaTokenizer

Função de pré-processamento do dataset

In [5]:
def preprocess_function(examples, **fn_kwargs):
    return fn_kwargs['tokenizer'](examples["text"],
                                  truncation=True,
                                  padding="max_length",
                                  max_length=512,
                                  )

Função de coleta de dados em pandas

In [6]:

def get_data(train_path, test_path, random_seed):
    """
    function to read dataframe with columns
    """

    train_df = pd.read_json(train_path, lines=True)
    test_df = pd.read_json(test_path, lines=True)

    # train_df = train_df[:10000]
    # test_df = test_df[:10000]

    train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed)

    return train_df, val_df, test_df

In [None]:
Funçao de tokenização dos datasets de treino e validação

In [7]:
def get_tokenized_data(train_df, valid_df,tokenizer_checkpoint):
    """
    function to read tokenized dataframe with columns
    """
    # pandas dataframe to huggingface Dataset
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)

    # get tokenizer and model from huggingface
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)     # put your model here

    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})


    return tokenized_train_dataset, tokenized_valid_dataset, tokenizer

In [8]:
def compute_metrics(eval_pred):

    f1_metric = evaluate.load("f1")

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    results = {}
    results.update(f1_metric.compute(predictions=predictions, references = labels, average="micro"))

    return results

Função de fine tuning com LoRA

In [9]:
def fine_tune(tokenized_train_dataset,
              tokenized_valid_dataset,
              checkpoints_path,
              id2label,
              label2id,
              model,
              tokenizer):

    # 1. Load base model
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    # 2. Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=16,                 # LoRA rank (smaller = lighter)
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        # target_modules=["q_lin", "k_lin", "v_lin", "out_lin"]#distilbert
        target_modules=["query_proj", "key_proj", "value_proj"]
    )

    # 3. Wrap the model in PEFT (LoRA)
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()   # <-- useful!

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=checkpoints_path,
        learning_rate=2e-5,   # LoRA can use higher LR
        per_device_train_batch_size=2,            # T4-friendly
        gradient_accumulation_steps=8,            # effective batch = 16
        per_device_eval_batch_size=4,
        num_train_epochs=2,
        weight_decay=0.01,
        optim="adamw_torch_fused",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none",
        fp16=True,            # safe with LoRA, much smaller memory footprint
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_valid_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Save best LoRA adapter
    best_model_path = checkpoints_path + '/best/'
    os.makedirs(best_model_path, exist_ok=True)
    trainer.save_model(best_model_path)


Função auxiliar durante treino (visualizaçao de parametros a serem treinados)

In [10]:
def print_trainable_stats(model):
    total = 0
    trainable = 0
    for n, p in model.named_parameters():
        num = p.numel()
        total += num
        if p.requires_grad:
            trainable += num
    print(f"Total params: {total:,}")
    print(f"Trainable params: {trainable:,}")
    print(f"Trainable %: {100 * trainable / total:.4f}%")

In [11]:
from sklearn.metrics import classification_report

Função de teste de dataset (idealmente loadar modelo de checkpoint criado pela funcao de fine tune)

In [20]:
def test(test_df, adapter_path, id2label, label2id,tokenizer):

    base_model = "microsoft/deberta-v3-base"

    # tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Load base model
    model = AutoModelForSequenceClassification.from_pretrained(
        base_model,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )
    print_trainable_stats(model)
    # Load LoRA adapter on top
    from peft import PeftModel
    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload()
    print_trainable_stats(model)

    # Tokenize test
    test_dataset = Dataset.from_pandas(test_df)
    tokenized_test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir="./tmp",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    predictions = trainer.predict(tokenized_test_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    y_true = tokenized_test_dataset["label"]

    print(classification_report(y_true, preds))

    metric = evaluate.load("bstrai/classification_report")
    results = metric.compute(predictions=preds, references=y_true)

    return results, preds


Main (Variáveis globais presentes aqui)

In [15]:
# parser = argparse.ArgumentParser()
# parser.add_argument("--train_file_path", "-tr", required=True, help="Path to the train file.", type=str)
# parser.add_argument("--test_file_path", "-t", required=True, help="Path to the test file.", type=str)
# parser.add_argument("--subtask", "-sb", required=True, help="Subtask (A or B).", type=str, choices=['A', 'B'])
# parser.add_argument("--model", "-m", required=True, help="Transformer to train and test", type=str)
# parser.add_argument("--prediction_file_path", "-p", required=True, help="Path where to save the prediction file.", type=str)

# args = parser.parse_args()

random_seed = 0
train_path =  "SubtaskA/subtaskA_train_monolingual.jsonl" # For example 'subtaskA_train_multilingual.jsonl'
test_path =  "SubtaskA/subtaskA_dev_monolingual.jsonl" # For example 'subtaskA_test_multilingual.jsonl'
model_checkpoint = 'microsoft/deberta-v3-base'
tokenizer_checkpoint = 'microsoft/deberta-v3-base'
subtask =  'A' # For example 'A'
prediction_path = 'prediction.jsonl' # For example subtaskB_predictions.jsonl

if not os.path.exists(train_path):
    logging.error("File doesnt exists: {}".format(train_path))
    raise ValueError("File doesnt exists: {}".format(train_path))

if not os.path.exists(test_path):
    logging.error("File doesnt exists: {}".format(train_path))
    raise ValueError("File doesnt exists: {}".format(train_path))


if subtask == 'A':
    id2label = {0: "human", 1: "machine"}
    label2id = {"human": 0, "machine": 1}

set_seed(random_seed)

#get data for train/dev/test sets
train_df, valid_df, test_df = get_data(train_path, test_path, random_seed)


In [16]:
tokenized_train_df, tokenized_valid_df, tokenizer = get_tokenized_data(train_df, valid_df,tokenizer_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/95805 [00:00<?, ? examples/s]

Map:   0%|          | 0/23952 [00:00<?, ? examples/s]

In [None]:
# train detector model - from pretrained or checkpoint
fine_tune(tokenized_train_df, tokenized_valid_df, f"/drive/MyDrive/Me/LLM/Checkpoints_Deberta/checkpoints/{model_checkpoint}/subtask{subtask}/{random_seed}/best/", id2label, label2id, model_checkpoint, tokenizer)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 886,274 || all params: 185,309,956 || trainable%: 0.4783


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [23]:
# test detector model
results, predictions = test(test_df, f"/content/drive/MyDrive/Mestrado/LLM/Checkpoint_Deberta/checkpoint-5989/", id2label, label2id,tokenizer)

logging.info(results)
predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions})
predictions_df.to_json(prediction_path, lines=True, orient='records')

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total params: 184,423,682
Trainable params: 184,423,682
Trainable %: 100.0000%
Total params: 184,423,682
Trainable params: 0
Trainable %: 0.0000%


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = Trainer(


Downloading builder script: 0.00B [00:00, ?B/s]

              precision    recall  f1-score   support

           0       0.65      0.58      0.61      2500
           1       0.62      0.69      0.65      2500

    accuracy                           0.63      5000
   macro avg       0.63      0.63      0.63      5000
weighted avg       0.63      0.63      0.63      5000



Downloading builder script: 0.00B [00:00, ?B/s]

In [25]:
# test detector model
results, predictions = test(test_df, f"/content/drive/MyDrive/Mestrado/LLM/Checkpoint_Deberta/best_0/", id2label, label2id,tokenizer)

logging.info(results)
predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions})
predictions_df.to_json(prediction_path, lines=True, orient='records')

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total params: 184,423,682
Trainable params: 184,423,682
Trainable %: 100.0000%
Total params: 184,423,682
Trainable params: 0
Trainable %: 0.0000%


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = Trainer(


              precision    recall  f1-score   support

           0       0.81      0.69      0.75      2500
           1       0.73      0.84      0.78      2500

    accuracy                           0.77      5000
   macro avg       0.77      0.77      0.77      5000
weighted avg       0.77      0.77      0.77      5000



In [None]:
import shutil
import os

src = "/drive/MyDrive/LLM/Checkpoints_Deberta/"
dst = "/content/drive/MyDrive/LLM/Checkpoints_Deberta/"

# create parent directory if needed
os.makedirs(os.path.dirname(dst), exist_ok=True)

# copy the entire folder
shutil.copytree(src, dst, dirs_exist_ok=True)

print("Checkpoint copied successfully!")


Checkpoint copied successfully!
