In [18]:
import pandas as pd
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model
from sklearn.metrics import classification_report


ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath("")), "..", "..")
)
DATA_DIR = os.path.abspath(os.path.join(ROOT, "dados"))
MODEL_DIR = os.path.abspath(os.path.join(ROOT, "modelos"))

dev_file_path = os.path.join(DATA_DIR, "detect", "subtaskA_dev_monolingual.jsonl")
train_file_path = os.path.join(DATA_DIR, "detect", "subtaskA_train_monolingual.jsonl")
df_train = pd.read_json(train_file_path, lines=True)
df_dev = pd.read_json(dev_file_path, lines=True)

df_train = pd.read_json(train_file_path, lines=True)
df_train['length'] = df_train['text'].str.len()
display(train.head())

Unnamed: 0,text,label,model,source,id
0,Forza Motorsport is a popular racing game that...,1,chatGPT,wikihow,0
1,Buying Virtual Console games for your Nintendo...,1,chatGPT,wikihow,1
2,Windows NT 4.0 was a popular operating system ...,1,chatGPT,wikihow,2
3,How to Make Perfume\n\nPerfume is a great way ...,1,chatGPT,wikihow,3
4,How to Convert Song Lyrics to a Song'\n\nConve...,1,chatGPT,wikihow,4


In [35]:
df_train[df_train["length"]==df_train.length.max()]["text"]

74129    This is a list of fictional characters from th...
Name: text, dtype: object

In [40]:
sum(df_train.length)/119757

2786.013519042728

In [8]:

# ============================================
# 1Ô∏è‚É£ PREPROCESS DATA (Adaptado para Human vs. AI)
# ============================================

# Fun√ß√£o para criar o r√≥tulo bin√°rio
def map_label(model_name):
    if model_name == 'human':
        return 'human'
    else:
        return 'AI' # Agrupa 'chatGPT', 'davinci', etc.

# Aplica a fun√ß√£o para criar uma nova coluna de r√≥tulo leg√≠vel
df_train['label_str'] = df_train['model'].apply(map_label)
df_dev['label_str'] = df_dev['model'].apply(map_label)

# Mapeia os r√≥tulos de string para inteiros
label_map = {"human": 0, "AI": 1}
df_train["label_id"] = df_train["label_str"].map(label_map)
df_dev["label_id"] = df_dev["label_str"].map(label_map)

print("Label distribution (train):")
print(df_train["label_str"].value_counts())
print("\nLabel distribution (dev):")
print(df_dev["label_str"].value_counts())

# ============================================
# 2Ô∏è‚É£ DEFINE DATASET CLASS (Adaptado)
# ============================================

# class TextDataset(Dataset):
#     def __init__(self, df, tokenizer, max_len=128):
#         # MUDAN√áA: "sentence" -> "text"
#         self.texts = df["text"].tolist()
#         # MUDAN√áA: "label" -> "label_id" (nossa nova coluna)
#         self.labels = df["label_id"].tolist()
#         self.tokenizer = tokenizer
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = str(self.texts[idx]) # Adicionado str() para seguran√ßa
#         label = self.labels[idx]
#         enc = self.tokenizer(
#             text,
#             truncation=True,
#             padding="max_length",
#             max_length=self.max_len,
#             return_tensors="pt",
#         )
#         item = {key: val.squeeze() for key, val in enc.items()}
#         item["labels"] = torch.tensor(label, dtype=torch.long)
#         return item

class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df["text"].tolist()
        self.labels = df["label_id"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # MUDAN√áA: Removido padding, return_tensors.
        # Deixe o DataCollator do Trainer cuidar disso.
        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            # N√£o adicione padding aqui!
        )

        # Retorne um dicion√°rio simples.
        # O Trainer cuidar√° de agrupar e converter para tensores.
        item = {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "labels": label
        }
        return item

# ============================================
# 3Ô∏è‚É£ LOAD PRETRAINED MODEL + TOKENIZER (Adaptado)
# ============================================

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2, # Ainda √© classifica√ß√£o bin√°ria
    # MUDAN√áA: Atualiza os r√≥tulos para a nova tarefa
    id2label={0: "human", 1: "AI"},
    label2id={"human": 0, "AI": 1}
)

# ============================================
# 4Ô∏è‚É£ ADD LORA ADAPTER (Sem mudan√ßas)
# ============================================

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# ============================================
# 5Ô∏è‚É£ CREATE DATASETS (Adaptado)
# ============================================

# Passa os DataFrames corretos
train_dataset = TextDataset(df_train, tokenizer)
dev_dataset = TextDataset(df_dev, tokenizer)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Dev dataset size: {len(dev_dataset)}")

# ============================================
# 6Ô∏è‚É£ DEFINE METRICS (Sem mudan√ßas)
# ============================================

def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

# ============================================
# 7Ô∏è‚É£ TRAINING ARGUMENTS (Sem mudan√ßas, mantendo a corre√ß√£o do log)
# ============================================
# ============================================
# 7Ô∏è‚É£ TRAINING ARGUMENTS (Adaptado para dataset GRANDE)
# ============================================

# Calcule o total de passos de treino (ex: para 1 √©poca com batch 32)
# 119757 / 32 = 3742.4 => 3743 passos
# Vamos definir nossos logs/evals com base nisso. ex: a cada 1000 passos.

training_args = TrainingArguments(
    output_dir="./lora_results_human_ai",
    dataloader_num_workers=2,
    # --- Mudan√ßas Estrat√©gicas ---
    num_train_epochs=1, # 120k amostras √© muito. 1 √©poca √© um √≥timo ponto de partida.

    # Aumente o batch size para acelerar o treino (depende da sua VRAM)
    per_device_train_batch_size=16,  # Era 16. Tente 128 ou 128.
    per_device_eval_batch_size=16,   # Pode ser igual ou maior que o de treino.

    # Mude TUDO de "epoch" para "steps"
    eval_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",

    # Defina a frequ√™ncia em passos
    eval_steps=1000,     # Avalia no dataset de dev a cada 1000 passos
    logging_steps=1000,  # Mostra o training loss a cada 1000 passos
    save_steps=1000,     # Salva um checkpoint a cada 1000 passos

    # --- O resto pode ficar igual ---
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_dir="./logs_human_ai",
    load_best_model_at_end=True, # ESSENCIAL: vai carregar o melhor checkpoint (ex: passo 3000) no final
    metric_for_best_model="accuracy",
    report_to="none",
)

# =G==========================================
# 8Ô∏è‚É£ TRAINER SETUP (Sem mudan√ßas)
# ============================================

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator, # <-- 3. PASSE-O PARA O TRAINER
)

# ============================================
# 9Ô∏è‚É£ TRAIN MODEL (Sem mudan√ßas)
# ============================================

trainer.train()

# ============================================
# üîü EVALUATE MODEL (Adaptado)
# ============================================

preds_output = trainer.predict(dev_dataset)
y_true = preds_output.label_ids
y_pred = np.argmax(preds_output.predictions, axis=1)

# MUDAN√áA: Atualiza o mapa inverso
inv_label_map = {0: "human", 1: "AI"}
y_true_labels = [inv_label_map[i] for i in y_true]
y_pred_labels = [inv_label_map[i] for i in y_pred]

print("\n--- Classification Report (Human vs. AI) ---")
print(classification_report(y_true_labels, y_pred_labels))

Label distribution (train):
label_str
human    63351
AI       56406
Name: count, dtype: int64

Label distribution (dev):
label_str
AI       2500
human    2500
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700
Train dataset size: 119757
Dev dataset size: 5000


Step,Training Loss,Validation Loss,Accuracy
1000,0.3712,1.010674,0.6362
2000,0.2596,0.964533,0.6616
3000,0.2457,0.95975,0.6804
4000,0.2212,0.910559,0.6884
5000,0.2132,0.84586,0.688
6000,0.2051,0.834596,0.7108
7000,0.201,0.798629,0.7158



--- Classification Report (Human vs. AI) ---
              precision    recall  f1-score   support

          AI       0.72      0.71      0.71      2500
       human       0.71      0.72      0.72      2500

    accuracy                           0.72      5000
   macro avg       0.72      0.72      0.72      5000
weighted avg       0.72      0.72      0.72      5000



In [11]:
from google.colab import drive
import os

# --- 2. Defina um caminho PERMANENTE ---
# Crie um diret√≥rio dentro do seu Drive para este modelo
output_dir = "/content/drive/MyDrive/Mestrado/Modelos de Linguagem/Trabalho LLM/Tarefa Deteccao semeval 2024-8/human_ai_lora_final"

# Crie o diret√≥rio se ele n√£o existir
os.makedirs(output_dir, exist_ok=True)
print(f"Diret√≥rio de salvamento: {output_dir}")

# ============================================
# 11. SALVAR O MODELO FINAL
# ============================================

print(f"Salvando adaptadores LoRA em {output_dir}...")

# 1. Salve os pesos do adaptador PEFT (LoRA)
model.save_pretrained(output_dir)

# 2. Salve o tokenizer
tokenizer.save_pretrained(output_dir)

print(f"Modelo e tokenizer salvos permanentemente em seu Google Drive!")

# Verifique os arquivos salvos no seu Drive
!ls -lh {output_dir}

Diret√≥rio de salvamento: /content/drive/MyDrive/Mestrado/Modelos de Linguagem/Trabalho LLM/Tarefa Deteccao semeval 2024-8/human_ai_lora_final
Salvando adaptadores LoRA em /content/drive/MyDrive/Mestrado/Modelos de Linguagem/Trabalho LLM/Tarefa Deteccao semeval 2024-8/human_ai_lora_final...
Modelo e tokenizer salvos permanentemente em seu Google Drive!
ls: cannot access '/content/drive/MyDrive/Mestrado/Modelos': No such file or directory
ls: cannot access 'de': No such file or directory
ls: cannot access 'Linguagem/Trabalho': No such file or directory
ls: cannot access 'LLM/Tarefa': No such file or directory
ls: cannot access 'Deteccao': No such file or directory
ls: cannot access 'semeval': No such file or directory
ls: cannot access '2024-8/human_ai_lora_final': No such file or directory


In [12]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel
from google.colab import drive

# --- 1. Monte seu Google Drive ---
# print("Montando Google Drive...")
# drive.mount('/content/drive')

# ============================================
# 12. CARREGAR O MODELO SALVO DO DRIVE
# ============================================

# --- 2. Defina os caminhos ---
base_model_name = "bert-base-uncased"
# O caminho exato onde voc√™ salvou os adaptadores no seu Drive
adapter_dir = "/content/drive/MyDrive/Mestrado/Modelos de Linguagem/Trabalho LLM/Tarefa Deteccao semeval 2024-8/human_ai_lora_final"

print(f"Carregando o modelo base: {base_model_name}...")

# --- 3. Carregue o Modelo Base ---
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=2,
    id2label={0: "human", 1: "AI"},
    label2id={"human": 0, "AI": 1}
)

print(f"Carregando e aplicando adaptadores LoRA de: {adapter_dir}...")

# --- 4. Aplique os Adaptadores LoRA ---
model = PeftModel.from_pretrained(base_model, adapter_dir)

# --- 5. Carregue o Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

print("Modelo e tokenizer carregados do Google Drive!")

# --- 6. Prepare o modelo para infer√™ncia ---
model.eval()
if torch.cuda.is_available():
    model.to("cuda")
    print("Modelo movido para a GPU.")

Carregando o modelo base: bert-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Carregando e aplicando adaptadores LoRA de: /content/drive/MyDrive/Mestrado/Modelos de Linguagem/Trabalho LLM/Tarefa Deteccao semeval 2024-8/human_ai_lora_final...
Modelo e tokenizer carregados do Google Drive!
Modelo movido para a GPU.
